1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
66#include "llvm/IR/Attributes.h"
67#include "llvm/IR/CallingConv.h"
68#include "llvm/IR/Constant.h"
69#include "llvm/IR/Constants.h"
70#include "llvm/IR/DataLayout.h"
71#include "llvm/IR/DebugLoc.h"
73#include "llvm/IR/Function.h"
74#include "llvm/IR/GlobalAlias.h"
75#include "llvm/IR/GlobalValue.h"
77#include "llvm/IR/IRBuilder.h"
78#include "llvm/IR/InlineAsm.h"
79#include "llvm/IR/Instruction.h"
82#include "llvm/IR/Intrinsics.h"
83#include "llvm/IR/IntrinsicsARM.h"
84#include "llvm/IR/Module.h"
85#include "llvm/IR/Type.h"
86#include "llvm/IR/User.h"
87#include "llvm/IR/Value.h"
88#include "llvm/MC/MCInstrDesc.h"
90#include "llvm/MC/MCSchedule.h"
97#include "llvm/Support/Debug.h"
105#include <algorithm>
106#include <cassert>
107#include <cstdint>
108#include <cstdlib>
109#include <iterator>
110#include <limits>
111#include <optional>
112#include <tuple>
113#include <utility>
114#include <vector>
115
116using namespace llvm;
117
118#define DEBUG_TYPE "arm-isel"
119
120STATISTIC(NumTailCalls, "Number of tail calls");
121STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
122STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
123STATISTIC(NumConstpoolPromoted,
124 "Number of constants with their storage promoted into constant pools");
125
126static cl::opt<bool>
127ARMInterworking("arm-interworking", cl::Hidden,
128 cl::desc("Enable / disable ARM interworking (for debugging only)"),
129 cl::init(true));
130
132 "arm-promote-constant", cl::Hidden,
133 cl::desc("Enable / disable promotion of unnamed_addr constants into "
134 "constant pools"),
135 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
137 "arm-promote-constant-max-size", cl::Hidden,
138 cl::desc("Maximum size of constant to promote into a constant pool"),
139 cl::init(64));
141 "arm-promote-constant-max-total", cl::Hidden,
142 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
143 cl::init(128));
144
146MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
147 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
148 cl::init(2));
149
151 "arm-max-base-updates-to-check", cl::Hidden,
152 cl::desc("Maximum number of base-updates to check generating postindex."),
153 cl::init(64));
154
155/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
156constexpr MVT FlagsVT = MVT::i32;
157
158// The APCS parameter registers.
159static const MCPhysReg GPRArgRegs[] = {
160 ARM::R0, ARM::R1, ARM::R2, ARM::R3
161};
162
163static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg,
164 SelectionDAG &DAG, const SDLoc &DL) {
165 assert(Arg.ArgVT.isScalarInteger());
166 assert(Arg.ArgVT.bitsLT(MVT::i32));
167 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
168 SDValue Ext =
169 DAG.getNode(Arg.Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
170 MVT::i32, Trunc);
171 return Ext;
172}
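// Illustrative sketch of what handleCMSEValue produces (value names are made
// up): for a CMSE non-secure call declared as returning 'signext i8', the raw
// 32-bit value received in r0 is re-extended in the caller rather than
// trusted:
//   t1: i8  = truncate tRaw        ; drop the untrusted high bits
//   t2: i32 = sign_extend t1       ; zero_extend for a 'zeroext' result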
173
174void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
175 if (VT != PromotedLdStVT) {
176 setOperationAction(ISD::LOAD, VT, Promote);
177 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
178
179 setOperationAction(ISD::STORE, VT, Promote);
180 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
181 }
182
183 MVT ElemTy = VT.getVectorElementType();
184 if (ElemTy != MVT::f64)
188 if (ElemTy == MVT::i32) {
193 } else {
198 }
207 if (VT.isInteger()) {
211 }
212
213 // Neon does not support vector divide/remainder operations.
222
223 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
224 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
226 setOperationAction(Opcode, VT, Legal);
227 if (!VT.isFloatingPoint())
228 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
229 setOperationAction(Opcode, VT, Legal);
230}
231
232void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
233 addRegisterClass(VT, &ARM::DPRRegClass);
234 addTypeForNEON(VT, MVT::f64);
235}
236
237void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
238 addRegisterClass(VT, &ARM::DPairRegClass);
239 addTypeForNEON(VT, MVT::v2f64);
240}
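// For example (illustrative): addDRTypeForNEON(MVT::v8i8) puts v8i8 in the
// DPR register class and, since v8i8 != f64, promotes v8i8 LOAD/STORE to f64,
// so plain v8i8 loads and stores are selected via the 64-bit D-register
// load/store patterns.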
241
242void ARMTargetLowering::setAllExpand(MVT VT) {
243 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
245
246 // We support these really simple operations even on types where all
247 // the actual arithmetic has to be broken down into simpler
248 // operations or turned into library calls.
249 setOperationAction(ISD::BITCAST, VT, Legal);
250 setOperationAction(ISD::LOAD, VT, Legal);
251 setOperationAction(ISD::STORE, VT, Legal);
253}
254
255void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
256 LegalizeAction Action) {
257 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
258 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
259 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
260}
261
262void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
263 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
264
265 for (auto VT : IntTypes) {
266 addRegisterClass(VT, &ARM::MQPRRegClass);
280 setOperationAction(ISD::MLOAD, VT, Custom);
281 setOperationAction(ISD::MSTORE, VT, Legal);
296
297 // No native support for these.
307
308 // Vector reductions
309 setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
310 setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
311 setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
312 setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
313 setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
314 setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
315 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
316 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
317 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
318
319 if (!HasMVEFP) {
324 } else {
327 }
328
329 // Pre and Post inc are supported on loads and stores
330 for (unsigned im = (unsigned)ISD::PRE_INC;
331 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
336 }
337 }
338
339 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
340 for (auto VT : FloatTypes) {
341 addRegisterClass(VT, &ARM::MQPRRegClass);
342 if (!HasMVEFP)
343 setAllExpand(VT);
344
345 // These are legal or custom whether we have MVE.fp or not
354 setOperationAction(ISD::MLOAD, VT, Custom);
355 setOperationAction(ISD::MSTORE, VT, Legal);
358
359 // Pre and Post inc are supported on loads and stores
360 for (unsigned im = (unsigned)ISD::PRE_INC;
361 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
366 }
367
368 if (HasMVEFP) {
369 setOperationAction(ISD::FMINNUM, VT, Legal);
370 setOperationAction(ISD::FMAXNUM, VT, Legal);
371 setOperationAction(ISD::FROUND, VT, Legal);
372 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
373 setOperationAction(ISD::FRINT, VT, Legal);
374 setOperationAction(ISD::FTRUNC, VT, Legal);
375 setOperationAction(ISD::FFLOOR, VT, Legal);
376 setOperationAction(ISD::FCEIL, VT, Legal);
377 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
378 setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
379 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
380 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
381
382 // No native support for these.
385 setOperationAction(ISD::FSQRT, VT, Expand);
386 setOperationAction(ISD::FSIN, VT, Expand);
387 setOperationAction(ISD::FCOS, VT, Expand);
388 setOperationAction(ISD::FTAN, VT, Expand);
389 setOperationAction(ISD::FPOW, VT, Expand);
390 setOperationAction(ISD::FLOG, VT, Expand);
391 setOperationAction(ISD::FLOG2, VT, Expand);
392 setOperationAction(ISD::FLOG10, VT, Expand);
393 setOperationAction(ISD::FEXP, VT, Expand);
394 setOperationAction(ISD::FEXP2, VT, Expand);
395 setOperationAction(ISD::FEXP10, VT, Expand);
396 setOperationAction(ISD::FNEARBYINT, VT, Expand);
397 }
398 }
399
400 // Custom-expand vector reductions that are smaller than legal to prevent
401 // false zero items being added.
402 setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
403 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
404 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
405 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
406 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
407 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
408 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
409 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);
410
411 // We 'support' these types up to bitcast/load/store level, regardless of
412 // MVE integer-only / float support. Only FP data processing on the FP
413 // vector types is inhibited at the integer-only level.
414 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
415 for (auto VT : LongTypes) {
416 addRegisterClass(VT, &ARM::MQPRRegClass);
417 setAllExpand(VT);
423 }
425
426 // We can do bitwise operations on v2i64 vectors
427 setOperationAction(ISD::AND, MVT::v2i64, Legal);
428 setOperationAction(ISD::OR, MVT::v2i64, Legal);
429 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
430
431 // It is legal to extload from v4i8 to v4i16 or v4i32.
432 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
433 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
434 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
435
436 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
442
443 // Some truncating stores are legal too.
444 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
445 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
446 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
447
448 // Pre and Post inc on these are legal, given the correct extends
449 for (unsigned im = (unsigned)ISD::PRE_INC;
450 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
451 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
456 }
457 }
458
459 // Predicate types
460 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
461 for (auto VT : pTypes) {
462 addRegisterClass(VT, &ARM::VCCRRegClass);
471 setOperationAction(ISD::LOAD, VT, Custom);
472 setOperationAction(ISD::STORE, VT, Custom);
477
478 if (!HasMVEFP) {
483 }
484 }
488 setOperationAction(ISD::OR, MVT::v2i1, Expand);
494
503}
504
506 return static_cast<const ARMBaseTargetMachine &>(getTargetMachine());
507}
508
510 const ARMSubtarget &STI)
511 : TargetLowering(TM_, STI), Subtarget(&STI),
512 RegInfo(Subtarget->getRegisterInfo()),
513 Itins(Subtarget->getInstrItineraryData()) {
514 const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
515
518
519 const Triple &TT = TM.getTargetTriple();
520
521 if (Subtarget->isThumb1Only())
522 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
523 else
524 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
525
526 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
527 Subtarget->hasFPRegs()) {
528 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
529 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
530
535
536 if (!Subtarget->hasVFP2Base()) {
537 setAllExpand(MVT::f32);
538 } else {
541 setOperationAction(Op, MVT::f32, Legal);
542 }
543 if (!Subtarget->hasFP64()) {
544 setAllExpand(MVT::f64);
545 } else {
548 setOperationAction(Op, MVT::f64, Legal);
549
551 }
552 }
553
554 if (Subtarget->hasFullFP16()) {
557 setOperationAction(Op, MVT::f16, Legal);
558
559 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
560 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
561 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
562
563 setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
564 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
567 }
568
569 if (Subtarget->hasBF16()) {
570 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
571 setAllExpand(MVT::bf16);
572 if (!Subtarget->hasFullFP16())
573 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
574 } else {
575 setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
576 setOperationAction(ISD::BF16_TO_FP, MVT::f64, Expand);
577 setOperationAction(ISD::FP_TO_BF16, MVT::f32, Custom);
578 setOperationAction(ISD::FP_TO_BF16, MVT::f64, Custom);
579 }
580
582 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
583 setTruncStoreAction(VT, InnerVT, Expand);
584 addAllExtLoads(VT, InnerVT, Expand);
585 }
586
589
591 }
592
593 if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
595
596 if (!Subtarget->hasV8_1MMainlineOps())
598
599 if (!Subtarget->isThumb1Only())
601
604
607
608 if (Subtarget->hasMVEIntegerOps())
609 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
610
611 // Combine low-overhead loop intrinsics so that we can lower i1 types.
612 if (Subtarget->hasLOB()) {
613 setTargetDAGCombine({ISD::BRCOND, ISD::BR_CC});
614 }
615
616 if (Subtarget->hasNEON()) {
617 addDRTypeForNEON(MVT::v2f32);
618 addDRTypeForNEON(MVT::v8i8);
619 addDRTypeForNEON(MVT::v4i16);
620 addDRTypeForNEON(MVT::v2i32);
621 addDRTypeForNEON(MVT::v1i64);
622
623 addQRTypeForNEON(MVT::v4f32);
624 addQRTypeForNEON(MVT::v2f64);
625 addQRTypeForNEON(MVT::v16i8);
626 addQRTypeForNEON(MVT::v8i16);
627 addQRTypeForNEON(MVT::v4i32);
628 addQRTypeForNEON(MVT::v2i64);
629
630 if (Subtarget->hasFullFP16()) {
631 addQRTypeForNEON(MVT::v8f16);
632 addDRTypeForNEON(MVT::v4f16);
633 }
634
635 if (Subtarget->hasBF16()) {
636 addQRTypeForNEON(MVT::v8bf16);
637 addDRTypeForNEON(MVT::v4bf16);
638 }
639 }
640
641 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
642 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
643 // none of Neon, MVE or VFP supports any arithmetic operations on it.
644 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
645 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
646 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
647 // FIXME: Code duplication: FDIV and FREM are expanded always, see
648 // ARMTargetLowering::addTypeForNEON method for details.
649 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
650 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
651 // FIXME: Create unittest.
652 // In other words, find a way to handle the case when "copysign" appears in a
653 // DAG with vector operands.
655 // FIXME: Code duplication: SETCC has custom operation action, see
656 // ARMTargetLowering::addTypeForNEON method for details.
658 // FIXME: Create unittest for FNEG and for FABS.
659 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
660 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
661 setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
662 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
663 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
664 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
665 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
666 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
667 setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
668 setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
669 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
670 setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
671 setOperationAction(ISD::FEXP10, MVT::v2f64, Expand);
672 setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
673 setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
674 setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
675 setOperationAction(ISD::FROUNDEVEN, MVT::v2f64, Expand);
676 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
677 setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
678 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
679 }
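// Illustrative consequence of the v2f64 settings above (sketch): extracting
// an f64 element, e.g. (extract_vector_elt (v2f64 Q0), 0), stays legal, while
// something like (fadd (v2f64 a), (v2f64 b)) is unrolled into scalar f64
// operations (or libcalls when f64 arithmetic itself is unavailable).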
680
681 if (Subtarget->hasNEON()) {
682 // The same for v4f32, but keep in mind that vadd, vsub and vmul are
683 // natively supported for v4f32.
684 setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
685 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
686 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
687 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
688 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
689 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
690 setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
691 setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
692 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
693 setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
694 setOperationAction(ISD::FEXP10, MVT::v4f32, Expand);
695 setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
696 setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
697 setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
698 setOperationAction(ISD::FROUNDEVEN, MVT::v4f32, Expand);
699 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
700 setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
701
702 // Mark v2f32 intrinsics.
703 setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
704 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
705 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
706 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
707 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
708 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
709 setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
710 setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
711 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
712 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
713 setOperationAction(ISD::FEXP10, MVT::v2f32, Expand);
714 setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
715 setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
716 setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
717 setOperationAction(ISD::FROUNDEVEN, MVT::v2f32, Expand);
718 setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
719 setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
720
721 for (ISD::NodeType Op : {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
722 ISD::FRINT, ISD::FTRUNC, ISD::FROUNDEVEN}) {
723 setOperationAction(Op, MVT::v4f16, Expand);
724 setOperationAction(Op, MVT::v8f16, Expand);
725 }
726
727 // Neon does not support some operations on v1i64 and v2i64 types.
728 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
729 // Custom handling for some quad-vector types to detect VMULL.
730 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
731 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
732 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
733 // Custom handling for some vector types to avoid expensive expansions
734 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
736 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
738 // Neon does not have single-instruction SINT_TO_FP and UINT_TO_FP with
739 // a destination type that is wider than the source, nor does
740 // it have a FP_TO_[SU]INT instruction with a narrower destination than
741 // the source.
750
752 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
753
754 // NEON does not have a single-instruction CTPOP for vectors with element
755 // types wider than 8 bits. However, custom lowering can leverage the
756 // v8i8/v16i8 vcnt instruction.
763
764 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
765 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
766
767 // NEON does not have a single-instruction CTTZ for vectors.
769 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
770 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
771 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
772
773 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
774 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
775 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
776 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
777
782
787
791 }
792
793 // NEON only has FMA instructions as of VFP4.
794 if (!Subtarget->hasVFP4Base()) {
795 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
796 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
797 }
798
800 ISD::FP_TO_UINT, ISD::FMUL, ISD::LOAD});
801
802 // It is legal to extload from v4i8 to v4i16 or v4i32.
803 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
804 MVT::v2i32}) {
809 }
810 }
811
812 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
813 MVT::v4i32}) {
814 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
815 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
816 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
817 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
818 }
819 }
820
821 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
827 ISD::INTRINSIC_VOID, ISD::VECREDUCE_ADD, ISD::ADD, ISD::BITCAST});
828 }
829 if (Subtarget->hasMVEIntegerOps()) {
831 ISD::FP_EXTEND, ISD::SELECT, ISD::SELECT_CC,
832 ISD::SETCC});
833 }
834 if (Subtarget->hasMVEFloatOps()) {
836 }
837
838 if (!Subtarget->hasFP64()) {
839 // When targeting a floating-point unit with only single-precision
840 // operations, f64 is legal for the few double-precision instructions which
841 // are present. However, no double-precision operations other than moves,
842 // loads and stores are provided by the hardware.
851 setOperationAction(ISD::FNEG, MVT::f64, Expand);
852 setOperationAction(ISD::FABS, MVT::f64, Expand);
853 setOperationAction(ISD::FSQRT, MVT::f64, Expand);
854 setOperationAction(ISD::FSIN, MVT::f64, Expand);
855 setOperationAction(ISD::FCOS, MVT::f64, Expand);
856 setOperationAction(ISD::FPOW, MVT::f64, Expand);
857 setOperationAction(ISD::FLOG, MVT::f64, Expand);
858 setOperationAction(ISD::FLOG2, MVT::f64, Expand);
859 setOperationAction(ISD::FLOG10, MVT::f64, Expand);
860 setOperationAction(ISD::FEXP, MVT::f64, Expand);
861 setOperationAction(ISD::FEXP2, MVT::f64, Expand);
862 setOperationAction(ISD::FEXP10, MVT::f64, Expand);
863 setOperationAction(ISD::FCEIL, MVT::f64, Expand);
864 setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
865 setOperationAction(ISD::FRINT, MVT::f64, Expand);
866 setOperationAction(ISD::FROUNDEVEN, MVT::f64, Expand);
867 setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
868 setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
879 }
880
883
884 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
885 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
887 if (Subtarget->hasFullFP16()) {
890 }
891 } else {
893 }
894
895 if (!Subtarget->hasFP16()) {
896 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
898 } else {
901 }
902
903 computeRegisterProperties(Subtarget->getRegisterInfo());
904
905 // ARM does not have floating-point extending loads.
906 for (MVT VT : MVT::fp_valuetypes()) {
907 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
908 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
909 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
910 }
911
912 // ... or truncating stores
913 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
914 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
915 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
916 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
917 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
918
919 // ARM does not have i1 sign-extending loads.
920 for (MVT VT : MVT::integer_valuetypes())
922
923 // ARM supports all 4 flavors of integer indexed load / store.
924 if (!Subtarget->isThumb1Only()) {
925 for (unsigned im = (unsigned)ISD::PRE_INC;
927 setIndexedLoadAction(im, MVT::i1, Legal);
928 setIndexedLoadAction(im, MVT::i8, Legal);
929 setIndexedLoadAction(im, MVT::i16, Legal);
930 setIndexedLoadAction(im, MVT::i32, Legal);
931 setIndexedStoreAction(im, MVT::i1, Legal);
932 setIndexedStoreAction(im, MVT::i8, Legal);
933 setIndexedStoreAction(im, MVT::i16, Legal);
934 setIndexedStoreAction(im, MVT::i32, Legal);
935 }
936 } else {
937 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
940 }
941
946
949 if (Subtarget->hasDSP()) {
958 }
959 if (Subtarget->hasBaseDSP()) {
962 }
963
964 // i64 operation support.
967 if (Subtarget->isThumb1Only()) {
970 }
971 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
972 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
974
982 setOperationAction(ISD::LOAD, MVT::i64, Custom);
983 setOperationAction(ISD::STORE, MVT::i64, Custom);
984
985 // MVE lowers 64-bit shifts to lsll and lsrl,
986 // assuming that ISD::SRL and ISD::SRA of i64 are already marked Custom.
987 if (Subtarget->hasMVEIntegerOps())
989
990 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
991 if (Subtarget->isThumb1Only()) {
995 }
996
997 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
999
1000 // ARM does not have ROTL.
1005 }
1007 // TODO: These two should be set to LibCall, but this currently breaks
1008 // the Linux kernel build. See #101786.
1011 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1014 }
1015
1016 // @llvm.readcyclecounter requires the Performance Monitors extension.
1017 // Default to the 0 expansion on unsupported platforms.
1018 // FIXME: Technically there are older ARM CPUs that have
1019 // implementation-specific ways of obtaining this information.
1020 if (Subtarget->hasPerfMon())
1021 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
1022
1023 // Only ARMv6 has BSWAP.
1024 if (!Subtarget->hasV6Ops())
1026
1027 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1028 : Subtarget->hasDivideInARMMode();
1029 if (!hasDivide) {
1030 // These are expanded into libcalls if the cpu doesn't have HW divider.
1033 }
1034
1035 if (TT.isOSWindows() && !Subtarget->hasDivideInThumbMode()) {
1038
1041 }
1042
1045
1046 // Register based DivRem for AEABI (RTABI 4.2)
1047 if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
1048 TT.isTargetMuslAEABI() || TT.isOSFuchsia() || TT.isOSWindows()) {
1051 HasStandaloneRem = false;
1052
1057 } else {
1060 }
1061
1066
1067 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1068 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1069
1070 // Use the default implementation.
1071 setOperationAction(ISD::VASTART, MVT::Other, Custom);
1072 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1073 setOperationAction(ISD::VACOPY, MVT::Other, Expand);
1074 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1075 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
1076 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
1077
1078 if (TT.isOSWindows())
1079 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
1080 else
1081 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
1082
1083 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1084 // the default expansion.
1085 InsertFencesForAtomic = false;
1086 if (Subtarget->hasAnyDataBarrier() &&
1087 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1088 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1089 // to ldrex/strex loops already.
1090 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
1091 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1092 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
1093
1094 // On v8, we have particularly efficient implementations of atomic fences
1095 // if they can be combined with nearby atomic loads and stores.
1096 if (!Subtarget->hasAcquireRelease() ||
1097 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1098 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1099 InsertFencesForAtomic = true;
1100 }
1101 } else {
1102 // If there's anything we can use as a barrier, go through custom lowering
1103 // for ATOMIC_FENCE.
1104 // If the target has DMB in Thumb mode, fences can be inserted.
1105 if (Subtarget->hasDataBarrier())
1106 InsertFencesForAtomic = true;
1107
1108 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
1109 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1110
1111 // Set them all for libcall, which will force libcalls.
1112 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
1113 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
1114 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
1115 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
1116 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, LibCall);
1117 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
1118 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
1119 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, LibCall);
1120 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, LibCall);
1121 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, LibCall);
1122 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, LibCall);
1123 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, LibCall);
1124 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1125 // Unordered/Monotonic case.
1126 if (!InsertFencesForAtomic) {
1127 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1128 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
1129 }
1130 }
1131
1132 // Compute supported atomic widths.
1133 if (TT.isOSLinux() || (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1134 // For targets where __sync_* routines are reliably available, we use them
1135 // if necessary.
1136 //
1137 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1138 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1139 //
1140 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1141 // such targets should provide __sync_* routines, which use the ARM mode
1142 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1143 // encoding; see ARMISD::MEMBARRIER_MCR.)
1145 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1146 Subtarget->hasForced32BitAtomics()) {
1147 // Cortex-M cores (besides Cortex-M0) have 32-bit atomics.
1149 } else {
1150 // We can't assume anything about other targets; just use libatomic
1151 // routines.
1153 }
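// Illustrative examples of the width selection above (sketch; the exact
// sequences come from AtomicExpand and later lowering): an ARMv7-A Linux
// target can lower a 64-bit atomic RMW to an ldrexd/strexd loop, a v8-M
// Baseline M-profile core gets 32-bit ldrex/strex sequences, and targets in
// the final 'else' fall back to __atomic_* library calls.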
1154
1156
1157 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
1158
1159 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1160 if (!Subtarget->hasV6Ops()) {
1163 }
1165
1166 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1167 !Subtarget->isThumb1Only()) {
1168 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1169 // iff target supports vfp2.
1170 setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1172 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
1173 setOperationAction(ISD::GET_FPENV, MVT::i32, Legal);
1174 setOperationAction(ISD::SET_FPENV, MVT::i32, Legal);
1175 setOperationAction(ISD::RESET_FPENV, MVT::Other, Legal);
1176 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
1177 setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
1178 setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
1179 }
1180
1181 // We want to custom lower some of our intrinsics.
1186
1196 if (Subtarget->hasFullFP16()) {
1200 }
1201
1203
1204 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
1205 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
1206 if (Subtarget->hasFullFP16())
1207 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
1208 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
1209 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
1210 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1211
1212 // We don't support sin/cos/fmod/copysign/pow
1213 setOperationAction(ISD::FSIN, MVT::f64, Expand);
1214 setOperationAction(ISD::FSIN, MVT::f32, Expand);
1215 setOperationAction(ISD::FCOS, MVT::f32, Expand);
1216 setOperationAction(ISD::FCOS, MVT::f64, Expand);
1217 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1218 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1221 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1222 !Subtarget->isThumb1Only()) {
1225 }
1226 setOperationAction(ISD::FPOW, MVT::f64, Expand);
1227 setOperationAction(ISD::FPOW, MVT::f32, Expand);
1228
1229 if (!Subtarget->hasVFP4Base()) {
1232 }
1233
1234 // Various VFP goodness
1235 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1236 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1237 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1238 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1239 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1240 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, Expand);
1241 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, Expand);
1242 }
1243
1244 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1245 if (!Subtarget->hasFP16()) {
1246 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1247 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1248 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, Expand);
1249 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, Expand);
1250 }
1251
1252 // Strict floating-point comparisons need custom lowering.
1259 }
1260
1261 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1262 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1263
1264 // FP-ARMv8 implements a lot of rounding-like FP operations.
1265 if (Subtarget->hasFPARMv8Base()) {
1266 for (auto Op :
1267 {ISD::FFLOOR, ISD::FCEIL, ISD::FROUND,
1268 ISD::FTRUNC, ISD::FNEARBYINT, ISD::FRINT,
1269 ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
1273 setOperationAction(Op, MVT::f32, Legal);
1274
1275 if (Subtarget->hasFP64())
1276 setOperationAction(Op, MVT::f64, Legal);
1277 }
1278
1279 if (Subtarget->hasNEON()) {
1280 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1281 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1282 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1283 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1284 }
1285 }
1286
1287 // FP16 operations often need to be promoted to library calls.
1288 // clang-format off
1289 if (Subtarget->hasFullFP16()) {
1290 setOperationAction(ISD::LRINT, MVT::f16, Expand);
1291 setOperationAction(ISD::LROUND, MVT::f16, Expand);
1293
1294 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
1295 ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
1296 ISD::FSINCOSPI, ISD::FMODF, ISD::FACOS,
1297 ISD::FASIN, ISD::FATAN, ISD::FATAN2,
1298 ISD::FCOSH, ISD::FSINH, ISD::FTANH,
1299 ISD::FTAN, ISD::FEXP, ISD::FEXP2,
1300 ISD::FEXP10, ISD::FLOG, ISD::FLOG2,
1301 ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW,
1308 setOperationAction(Op, MVT::f16, Promote);
1309 }
1310
1311 // Round-to-integer operations need custom lowering for fp16, as Promote
1312 // doesn't work because the result type is integer.
1314 setOperationAction(Op, MVT::f16, Custom);
1315
1316 for (auto Op : {ISD::FROUND, ISD::FROUNDEVEN, ISD::FTRUNC,
1317 ISD::FNEARBYINT, ISD::FRINT, ISD::FFLOOR,
1321 setOperationAction(Op, MVT::f16, Legal);
1322 }
1323 // clang-format on
1324 }
1325
1326 if (Subtarget->hasNEON()) {
1327 // vmin and vmax aren't available in a scalar form, so we can use
1328 // a NEON instruction with an undef lane instead.
1329 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
1330 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
1331 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
1332 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
1333 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
1334 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
1335 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
1336 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
1337
1338 if (Subtarget->hasV8Ops()) {
1339 setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
1340 setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
1341 setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
1342 setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
1343 setOperationAction(ISD::FROUNDEVEN, MVT::v2f32, Legal);
1344 setOperationAction(ISD::FROUNDEVEN, MVT::v4f32, Legal);
1345 setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
1346 setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
1347 setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
1348 setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
1349 setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
1350 setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
1351 }
1352
1353 if (Subtarget->hasFullFP16()) {
1354 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
1355 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
1356 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
1357 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
1358
1359 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
1360 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
1361 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
1362 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
1363
1364 setOperationAction(ISD::FFLOOR, MVT::v4f16, Legal);
1365 setOperationAction(ISD::FFLOOR, MVT::v8f16, Legal);
1366 setOperationAction(ISD::FROUND, MVT::v4f16, Legal);
1367 setOperationAction(ISD::FROUND, MVT::v8f16, Legal);
1368 setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Legal);
1369 setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Legal);
1370 setOperationAction(ISD::FCEIL, MVT::v4f16, Legal);
1371 setOperationAction(ISD::FCEIL, MVT::v8f16, Legal);
1372 setOperationAction(ISD::FTRUNC, MVT::v4f16, Legal);
1373 setOperationAction(ISD::FTRUNC, MVT::v8f16, Legal);
1374 setOperationAction(ISD::FRINT, MVT::v4f16, Legal);
1375 setOperationAction(ISD::FRINT, MVT::v8f16, Legal);
1376 }
1377 }
1378
1379 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1380 // it, but it's just a wrapper around ldexp.
1381 if (TT.isOSWindows()) {
1382 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1383 if (isOperationExpand(Op, MVT::f32))
1384 setOperationAction(Op, MVT::f32, Promote);
1385 }
1386
1387 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1388 // isn't legal.
1389 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1390 if (isOperationExpand(Op, MVT::f16))
1391 setOperationAction(Op, MVT::f16, Promote);
1392
1393 // We have target-specific dag combine patterns for the following nodes:
1394 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1397
1398 if (Subtarget->hasMVEIntegerOps())
1400
1401 if (Subtarget->hasV6Ops())
1403 if (Subtarget->isThumb1Only())
1405 // Attempt to lower smin/smax to ssat/usat
1406 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1407 Subtarget->isThumb2()) {
1409 }
1410
1412
1413 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1414 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1416 else
1418
1419 //// temporary - rewrite interface to use type
1422 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1424 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1426
1427 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1428 // are at least 4 bytes aligned.
1430
1431 // Prefer likely predicted branches to selects on out-of-order cores.
1432 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1433
1434 setPrefLoopAlignment(Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1436 Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1437
1438 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1439
1440 IsStrictFPEnabled = true;
1441}
1442
1444 return Subtarget->useSoftFloat();
1445}
1446
1448 return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1449}
1450
1451// FIXME: It might make sense to define the representative register class as the
1452// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1453// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1454// SPR's representative would be DPR_VFP2. This should work well if register
1455// pressure tracking were modified such that a register use would increment the
1456// pressure of the register class's representative and all of its super
1457// classes' representatives transitively. We have not implemented this because
1458// of the difficulty prior to coalescing of modeling operand register classes
1459// due to the common occurrence of cross class copies and subregister insertions
1460// and extractions.
1461std::pair<const TargetRegisterClass *, uint8_t>
1463 MVT VT) const {
1464 const TargetRegisterClass *RRC = nullptr;
1465 uint8_t Cost = 1;
1466 switch (VT.SimpleTy) {
1467 default:
1469 // Use DPR as representative register class for all floating point
1470 // and vector types. Since there are 32 SPR registers and 32 DPR registers,
1471 // the cost is 1 for both f32 and f64.
1472 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1473 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1474 RRC = &ARM::DPRRegClass;
1475 // When NEON is used for SP, only half of the register file is available
1476 // because operations that define both SP and DP results will be constrained
1477 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1478 // coalescing by double-counting the SP regs. See the FIXME above.
1479 if (Subtarget->useNEONForSinglePrecisionFP())
1480 Cost = 2;
1481 break;
1482 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1483 case MVT::v4f32: case MVT::v2f64:
1484 RRC = &ARM::DPRRegClass;
1485 Cost = 2;
1486 break;
1487 case MVT::v4i64:
1488 RRC = &ARM::DPRRegClass;
1489 Cost = 4;
1490 break;
1491 case MVT::v8i64:
1492 RRC = &ARM::DPRRegClass;
1493 Cost = 8;
1494 break;
1495 }
1496 return std::make_pair(RRC, Cost);
1497}
1498
1500 EVT VT) const {
1501 if (!VT.isVector())
1502 return getPointerTy(DL);
1503
1504 // MVE has a predicate register.
1505 if ((Subtarget->hasMVEIntegerOps() &&
1506 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1507 VT == MVT::v16i8)) ||
1508 (Subtarget->hasMVEFloatOps() &&
1509 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1510 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1512}
1513
1514/// getRegClassFor - Return the register class that should be used for the
1515/// specified value type.
1516const TargetRegisterClass *
1517ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1518 (void)isDivergent;
1519 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1520 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1521 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1522 // MVE Q registers.
1523 if (Subtarget->hasNEON()) {
1524 if (VT == MVT::v4i64)
1525 return &ARM::QQPRRegClass;
1526 if (VT == MVT::v8i64)
1527 return &ARM::QQQQPRRegClass;
1528 }
1529 if (Subtarget->hasMVEIntegerOps()) {
1530 if (VT == MVT::v4i64)
1531 return &ARM::MQQPRRegClass;
1532 if (VT == MVT::v8i64)
1533 return &ARM::MQQQQPRRegClass;
1534 }
1536}
1537
1538// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1539// source/dest is aligned and the copy size is large enough. We therefore want
1540// to align such objects passed to memory intrinsics.
1542 Align &PrefAlign) const {
1543 if (!isa<MemIntrinsic>(CI))
1544 return false;
1545 MinSize = 8;
1546 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1547 // cycle faster than 4-byte aligned LDM.
1548 PrefAlign =
1549 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1550 return true;
1551}
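// Illustrative effect (sketch): for a call such as
//   call void @llvm.memcpy.p0.p0.i32(ptr %dst, ptr %src, i32 64, i1 false)
// raising the alignment of the underlying objects to 8 bytes lets the
// expanded copy use LDM/STM (or LDRD/STRD) rather than narrower accesses.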
1552
1553// Create a fast isel object.
1554FastISel *
1556 const TargetLibraryInfo *libInfo) const {
1557 return ARM::createFastISel(funcInfo, libInfo);
1558}
1559
1561 unsigned NumVals = N->getNumValues();
1562 if (!NumVals)
1563 return Sched::RegPressure;
1564
1565 for (unsigned i = 0; i != NumVals; ++i) {
1566 EVT VT = N->getValueType(i);
1567 if (VT == MVT::Glue || VT == MVT::Other)
1568 continue;
1569 if (VT.isFloatingPoint() || VT.isVector())
1570 return Sched::ILP;
1571 }
1572
1573 if (!N->isMachineOpcode())
1574 return Sched::RegPressure;
1575
1576 // Loads are scheduled for latency even if the instruction itinerary
1577 // is not available.
1578 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1579 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1580
1581 if (MCID.getNumDefs() == 0)
1582 return Sched::RegPressure;
1583 if (!Itins->isEmpty() &&
1584 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1585 return Sched::ILP;
1586
1587 return Sched::RegPressure;
1588}
1589
1590//===----------------------------------------------------------------------===//
1591// Lowering Code
1592//===----------------------------------------------------------------------===//
1593
1594static bool isSRL16(const SDValue &Op) {
1595 if (Op.getOpcode() != ISD::SRL)
1596 return false;
1597 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1598 return Const->getZExtValue() == 16;
1599 return false;
1600}
1601
1602static bool isSRA16(const SDValue &Op) {
1603 if (Op.getOpcode() != ISD::SRA)
1604 return false;
1605 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1606 return Const->getZExtValue() == 16;
1607 return false;
1608}
1609
1610static bool isSHL16(const SDValue &Op) {
1611 if (Op.getOpcode() != ISD::SHL)
1612 return false;
1613 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1614 return Const->getZExtValue() == 16;
1615 return false;
1616}
1617
1618// Check for a signed 16-bit value. We special-case SRA because it keeps
1619// things simpler when also looking for SRAs that aren't sign-extending a
1620// smaller value. Without the check, we'd need to take extra care with
1621// checking order for some operations.
1622static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1623 if (isSRA16(Op))
1624 return isSHL16(Op.getOperand(0));
1625 return DAG.ComputeNumSignBits(Op) == 17;
1626}
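// Example of a value isS16 accepts (sketch): the canonical sign-extension
// pattern
//   t1: i32 = shl t0, 16
//   t2: i32 = sra t1, 16
// as well as any value for which ComputeNumSignBits reports 17 sign bits,
// e.g. typically a value sign-extended from i16.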
1627
1628/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1630 switch (CC) {
1631 default: llvm_unreachable("Unknown condition code!");
1632 case ISD::SETNE: return ARMCC::NE;
1633 case ISD::SETEQ: return ARMCC::EQ;
1634 case ISD::SETGT: return ARMCC::GT;
1635 case ISD::SETGE: return ARMCC::GE;
1636 case ISD::SETLT: return ARMCC::LT;
1637 case ISD::SETLE: return ARMCC::LE;
1638 case ISD::SETUGT: return ARMCC::HI;
1639 case ISD::SETUGE: return ARMCC::HS;
1640 case ISD::SETULT: return ARMCC::LO;
1641 case ISD::SETULE: return ARMCC::LS;
1642 }
1643}
1644
1645/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1647 ARMCC::CondCodes &CondCode2) {
1648 CondCode2 = ARMCC::AL;
1649 switch (CC) {
1650 default: llvm_unreachable("Unknown FP condition!");
1651 case ISD::SETEQ:
1652 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1653 case ISD::SETGT:
1654 case ISD::SETOGT: CondCode = ARMCC::GT; break;
1655 case ISD::SETGE:
1656 case ISD::SETOGE: CondCode = ARMCC::GE; break;
1657 case ISD::SETOLT: CondCode = ARMCC::MI; break;
1658 case ISD::SETOLE: CondCode = ARMCC::LS; break;
1659 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1660 case ISD::SETO: CondCode = ARMCC::VC; break;
1661 case ISD::SETUO: CondCode = ARMCC::VS; break;
1662 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1663 case ISD::SETUGT: CondCode = ARMCC::HI; break;
1664 case ISD::SETUGE: CondCode = ARMCC::PL; break;
1665 case ISD::SETLT:
1666 case ISD::SETULT: CondCode = ARMCC::LT; break;
1667 case ISD::SETLE:
1668 case ISD::SETULE: CondCode = ARMCC::LE; break;
1669 case ISD::SETNE:
1670 case ISD::SETUNE: CondCode = ARMCC::NE; break;
1671 }
1672}
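// Example (sketch): SETUEQ ("equal or unordered") has no single ARM condition
// code, so it comes back as CondCode = EQ with CondCode2 = VS; callers then
// emit one operation predicated on EQ and a second predicated on VS.
// Conditions that map directly (e.g. SETOGT -> GT) leave CondCode2 as AL.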
1673
1674//===----------------------------------------------------------------------===//
1675// Calling Convention Implementation
1676//===----------------------------------------------------------------------===//
1677
1678/// getEffectiveCallingConv - Get the effective calling convention, taking into
1679/// account presence of floating point hardware and calling convention
1680/// limitations, such as support for variadic functions.
1682ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1683 bool isVarArg) const {
1684 switch (CC) {
1685 default:
1686 report_fatal_error("Unsupported calling convention");
1689 case CallingConv::GHC:
1691 return CC;
1697 case CallingConv::Swift:
1700 case CallingConv::C:
1701 case CallingConv::Tail:
1702 if (!getTM().isAAPCS_ABI())
1703 return CallingConv::ARM_APCS;
1704 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1705 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1706 !isVarArg)
1708 else
1710 case CallingConv::Fast:
1712 if (!getTM().isAAPCS_ABI()) {
1713 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
1714 return CallingConv::Fast;
1715 return CallingConv::ARM_APCS;
1716 } else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
1717 !isVarArg)
1719 else
1721 }
1722}
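// For example (sketch): with the AAPCS ABI, FP registers present and a
// hard-float ABI, a non-variadic CallingConv::C call uses the VFP variant of
// AAPCS, while a variadic call on the same target falls back to base AAPCS,
// matching the AAPCS rule that variadic arguments are not passed in FP
// registers.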
1723
1725 bool isVarArg) const {
1726 return CCAssignFnForNode(CC, false, isVarArg);
1727}
1728
1730 bool isVarArg) const {
1731 return CCAssignFnForNode(CC, true, isVarArg);
1732}
1733
1734/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1735/// CallingConvention.
1736CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1737 bool Return,
1738 bool isVarArg) const {
1739 switch (getEffectiveCallingConv(CC, isVarArg)) {
1740 default:
1741 report_fatal_error("Unsupported calling convention");
1743 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1745 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1747 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1748 case CallingConv::Fast:
1749 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1750 case CallingConv::GHC:
1751 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1753 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1755 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1757 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
1758 }
1759}
1760
1761SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
1762 MVT LocVT, MVT ValVT, SDValue Val) const {
1763 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
1764 Val);
1765 if (Subtarget->hasFullFP16()) {
1766 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
1767 } else {
1768 Val = DAG.getNode(ISD::TRUNCATE, dl,
1769 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1770 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
1771 }
1772 return Val;
1773}
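// Illustrative DAG forms produced by MoveToHPR (sketch) for an f16 value
// arriving in the low half of a 32-bit location:
//   with +fullfp16:  f16 = ARMISD::VMOVhr (i32 Val')
//   without:         f16 = bitcast (i16 = truncate (i32 Val'))
// where Val' is the incoming value bitcast to an integer of LocVT's width.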
1774
1775SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
1776 MVT LocVT, MVT ValVT,
1777 SDValue Val) const {
1778 if (Subtarget->hasFullFP16()) {
1779 Val = DAG.getNode(ARMISD::VMOVrh, dl,
1780 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1781 } else {
1782 Val = DAG.getNode(ISD::BITCAST, dl,
1783 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1784 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
1785 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1786 }
1787 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
1788}
1789
1790/// LowerCallResult - Lower the result values of a call into the
1791/// appropriate copies out of appropriate physical registers.
1792SDValue ARMTargetLowering::LowerCallResult(
1793 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
1794 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1795 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
1796 SDValue ThisVal, bool isCmseNSCall) const {
1797 // Assign locations to each value returned by this call.
1799 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1800 *DAG.getContext());
1801 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
1802
1803 // Copy all of the result registers out of their specified physreg.
1804 for (unsigned i = 0; i != RVLocs.size(); ++i) {
1805 CCValAssign VA = RVLocs[i];
1806
1807 // Pass 'this' value directly from the argument to return value, to avoid
1808 // reg unit interference
1809 if (i == 0 && isThisReturn) {
1810 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
1811 "unexpected return calling convention register assignment");
1812 InVals.push_back(ThisVal);
1813 continue;
1814 }
1815
1816 SDValue Val;
1817 if (VA.needsCustom() &&
1818 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
1819 // Handle f64 or half of a v2f64.
1820 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1821 InGlue);
1822 Chain = Lo.getValue(1);
1823 InGlue = Lo.getValue(2);
1824 VA = RVLocs[++i]; // skip ahead to next loc
1825 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1826 InGlue);
1827 Chain = Hi.getValue(1);
1828 InGlue = Hi.getValue(2);
1829 if (!Subtarget->isLittle())
1830 std::swap (Lo, Hi);
1831 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1832
1833 if (VA.getLocVT() == MVT::v2f64) {
1834 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1835 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1836 DAG.getConstant(0, dl, MVT::i32));
1837
1838 VA = RVLocs[++i]; // skip ahead to next loc
1839 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
1840 Chain = Lo.getValue(1);
1841 InGlue = Lo.getValue(2);
1842 VA = RVLocs[++i]; // skip ahead to next loc
1843 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
1844 Chain = Hi.getValue(1);
1845 InGlue = Hi.getValue(2);
1846 if (!Subtarget->isLittle())
1847 std::swap (Lo, Hi);
1848 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1849 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1850 DAG.getConstant(1, dl, MVT::i32));
1851 }
1852 } else {
1853 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1854 InGlue);
1855 Chain = Val.getValue(1);
1856 InGlue = Val.getValue(2);
1857 }
1858
1859 switch (VA.getLocInfo()) {
1860 default: llvm_unreachable("Unknown loc info!");
1861 case CCValAssign::Full: break;
1862 case CCValAssign::BCvt:
1863 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1864 break;
1865 }
1866
1867 // f16 arguments have their size extended to 4 bytes and passed as if they
1868 // had been copied to the LSBs of a 32-bit register.
1869 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
1870 if (VA.needsCustom() &&
1871 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
1872 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
1873
1874 // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
1875 // is less than 32 bits must be sign- or zero-extended after the call for
1876 // security reasons. Although the ABI mandates an extension done by the
1877 // callee, the latter cannot be trusted to follow the rules of the ABI.
1878 const ISD::InputArg &Arg = Ins[VA.getValNo()];
1879 if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
1880 VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
1881 Val = handleCMSEValue(Val, Arg, DAG, dl);
1882
1883 InVals.push_back(Val);
1884 }
1885
1886 return Chain;
1887}
1888
1889std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
1890 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
1891 bool IsTailCall, int SPDiff) const {
1892 SDValue DstAddr;
1893 MachinePointerInfo DstInfo;
1894 int32_t Offset = VA.getLocMemOffset();
1895 MachineFunction &MF = DAG.getMachineFunction();
1896
1897 if (IsTailCall) {
1898 Offset += SPDiff;
1899 auto PtrVT = getPointerTy(DAG.getDataLayout());
1900 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
1901 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
1902 DstAddr = DAG.getFrameIndex(FI, PtrVT);
1903 DstInfo =
1905 } else {
1906 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
1907 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1908 StackPtr, PtrOff);
1909 DstInfo =
1911 }
1912
1913 return std::make_pair(DstAddr, DstInfo);
1914}
1915
1916// Returns the type of copying which is required to set up a byval argument to
1917// a tail-called function. This isn't needed for non-tail calls, because they
1918// always need the equivalent of CopyOnce, but tail-calls sometimes need two to
1919// avoid clobbering another argument (CopyViaTemp), and sometimes can be
1920// optimised to zero copies when forwarding an argument from the caller's
1921// caller (NoCopy).
1922ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall(
1923 SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
1924 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
1925 ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
1926
1927 // Globals are always safe to copy from.
1928 if (isa<GlobalAddressSDNode>(Src) || isa<ExternalSymbolSDNode>(Src))
1929 return CopyOnce;
1930
1931 // Can only analyse frame index nodes, conservatively assume we need a
1932 // temporary.
1933 auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
1934 auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
1935 if (!SrcFrameIdxNode || !DstFrameIdxNode)
1936 return CopyViaTemp;
1937
1938 int SrcFI = SrcFrameIdxNode->getIndex();
1939 int DstFI = DstFrameIdxNode->getIndex();
1940 assert(MFI.isFixedObjectIndex(DstFI) &&
1941 "byval passed in non-fixed stack slot");
1942
1943 int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
1944 int64_t DstOffset = MFI.getObjectOffset(DstFI);
1945
1946 // If the source is in the local frame, then the copy to the argument memory
1947 // is always valid.
1948 bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
1949 if (!FixedSrc ||
1950 (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize()))
1951 return CopyOnce;
1952
1953 // In the case of byval arguments split between registers and the stack,
1954 // computeAddrForCallArg returns a FrameIndex which corresponds only to the
1955 // stack portion, but the Src SDValue will refer to the full value, including
1956 // the local stack memory that the register portion gets stored into. We only
1957 // need to compare them for equality, so normalise on the full value version.
1958 uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI);
1959 DstOffset -= RegSize;
1960
1961 // If the value is already in the correct location, then no copying is
1962 // needed. If not, then we need to copy via a temporary.
1963 if (SrcOffset == DstOffset)
1964 return NoCopy;
1965 else
1966 return CopyViaTemp;
1967}
1968
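// Split an f64 (or one half of a v2f64) into a pair of i32 values with
// VMOVRRD and pass them in the two assigned locations: either both in
// registers, or one in a register and one stored to the argument stack.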
1969void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
1970 SDValue Chain, SDValue &Arg,
1971 RegsToPassVector &RegsToPass,
1972 CCValAssign &VA, CCValAssign &NextVA,
1973 SDValue &StackPtr,
1974 SmallVectorImpl<SDValue> &MemOpChains,
1975 bool IsTailCall,
1976 int SPDiff) const {
1977 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1978 DAG.getVTList(MVT::i32, MVT::i32), Arg);
1979 unsigned id = Subtarget->isLittle() ? 0 : 1;
1980 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
1981
1982 if (NextVA.isRegLoc())
1983 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
1984 else {
1985 assert(NextVA.isMemLoc());
1986 if (!StackPtr.getNode())
1987 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
1988 getPointerTy(DAG.getDataLayout()));
1989
1990 SDValue DstAddr;
1991 MachinePointerInfo DstInfo;
1992 std::tie(DstAddr, DstInfo) =
1993 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
1994 MemOpChains.push_back(
1995 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
1996 }
1997}
1998
1999static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2000 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2001 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
2002}
2003
2004/// LowerCall - Lowering a call into a callseq_start <-
2005 /// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
2006/// nodes.
2007SDValue
2008ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2009 SmallVectorImpl<SDValue> &InVals) const {
2010 SelectionDAG &DAG = CLI.DAG;
2011 SDLoc &dl = CLI.DL;
2012 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2013 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2014 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2015 SDValue Chain = CLI.Chain;
2016 SDValue Callee = CLI.Callee;
2017 bool &isTailCall = CLI.IsTailCall;
2018 CallingConv::ID CallConv = CLI.CallConv;
2019 bool doesNotRet = CLI.DoesNotReturn;
2020 bool isVarArg = CLI.IsVarArg;
2021 const CallBase *CB = CLI.CB;
2022
2023 MachineFunction &MF = DAG.getMachineFunction();
2024 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2025 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2026 MachineFunction::CallSiteInfo CSInfo;
2027 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2028 bool isThisReturn = false;
2029 bool isCmseNSCall = false;
2030 bool isSibCall = false;
2031 bool PreferIndirect = false;
2032 bool GuardWithBTI = false;
2033
2034 // Analyze operands of the call, assigning locations to each operand.
2035 SmallVector<CCValAssign, 16> ArgLocs;
2036 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2037 *DAG.getContext());
2038 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2039
2040 // Lower 'returns_twice' calls to a pseudo-instruction.
2041 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2042 !Subtarget->noBTIAtReturnTwice())
2043 GuardWithBTI = AFI->branchTargetEnforcement();
2044
2045 // Set type id for call site info.
2046 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
2047 CSInfo = MachineFunction::CallSiteInfo(*CB);
2048
2049 // Determine whether this is a non-secure function call.
2050 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2051 isCmseNSCall = true;
2052
2053 // Disable tail calls if they're not supported.
2054 if (!Subtarget->supportsTailCall())
2055 isTailCall = false;
2056
2057 // For both the non-secure calls and the returns from a CMSE entry function,
2058 // the function needs to do some extra work after the call, or before the
2059 // return, respectively, thus it cannot end with a tail call
2060 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2061 isTailCall = false;
2062
2063 if (isa<GlobalAddressSDNode>(Callee)) {
2064 // If we're optimizing for minimum size and the function is called three or
2065 // more times in this block, we can improve codesize by calling indirectly
2066 // as BLXr has a 16-bit encoding.
2067 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2068 if (CLI.CB) {
2069 auto *BB = CLI.CB->getParent();
2070 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2071 count_if(GV->users(), [&BB](const User *U) {
2072 return isa<Instruction>(U) &&
2073 cast<Instruction>(U)->getParent() == BB;
2074 }) > 2;
2075 }
2076 }
2077 if (isTailCall) {
2078 // Check if it's really possible to do a tail call.
2079 isTailCall =
2080 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2081
2082 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2083 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2084 isSibCall = true;
2085
2086 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2087 // detected sibcalls.
2088 if (isTailCall)
2089 ++NumTailCalls;
2090 }
2091
2092 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2093 report_fatal_error("failed to perform tail call elimination on a call "
2094 "site marked musttail");
2095
2096 // Get a count of how many bytes are to be pushed on the stack.
2097 unsigned NumBytes = CCInfo.getStackSize();
2098
2099 // SPDiff is the byte offset of the call's argument area from the callee's.
2100 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2101 // by this amount for a tail call. In a sibling call it must be 0 because the
2102 // caller will deallocate the entire stack and the callee still expects its
2103 // arguments to begin at SP+0. Completely unused for non-tail calls.
2104 int SPDiff = 0;
2105
2106 if (isTailCall && !isSibCall) {
2107 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2108 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2109
2110 // Since callee will pop argument stack as a tail call, we must keep the
2111 // popped size 16-byte aligned.
2112 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
2113 assert(StackAlign && "data layout string is missing stack alignment");
2114 NumBytes = alignTo(NumBytes, *StackAlign);
2115
2116 // SPDiff will be negative if this tail call requires more space than we
2117 // would automatically have in our incoming argument space. Positive if we
2118 // can actually shrink the stack.
2119 SPDiff = NumReusableBytes - NumBytes;
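// For example, with 8 reusable bytes and a callee needing 16 bytes of
// argument stack, SPDiff is 8 - 16 = -8 and 8 extra bytes must be reserved
// below; with 16 reusable bytes and a callee needing 8, SPDiff is +8.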
2120
2121 // If this call requires more stack than we have available from
2122 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2123 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2124 AFI->setArgRegsSaveSize(-SPDiff);
2125 }
2126
2127 if (isSibCall) {
2128 // For sibling tail calls, memory operands are available in our caller's stack.
2129 NumBytes = 0;
2130 } else {
2131 // Adjust the stack pointer for the new arguments...
2132 // These operations are automatically eliminated by the prolog/epilog pass
2133 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2134 }
2135
2136 SDValue StackPtr =
2137 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2138
2139 RegsToPassVector RegsToPass;
2140 SmallVector<SDValue, 8> MemOpChains;
2141
2142 // If we are doing a tail-call, any byval arguments will be written to stack
2143 // space which was used for incoming arguments. If any of the values being used
2144 // are incoming byval arguments to this function, then they might be
2145 // overwritten by the stores of the outgoing arguments. To avoid this, we
2146 // need to make a temporary copy of them in local stack space, then copy back
2147 // to the argument area.
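// For example, if this function received a byval struct at [sp, #0] and the
// tail-callee expects a different byval value at that same offset, storing
// the outgoing value directly would clobber the source before it is read.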
2148 DenseMap<unsigned, SDValue> ByValTemporaries;
2149 SDValue ByValTempChain;
2150 if (isTailCall) {
2151 SmallVector<SDValue, 8> ByValCopyChains;
2152 for (const CCValAssign &VA : ArgLocs) {
2153 unsigned ArgIdx = VA.getValNo();
2154 SDValue Src = OutVals[ArgIdx];
2155 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2156
2157 if (!Flags.isByVal())
2158 continue;
2159
2160 SDValue Dst;
2161 MachinePointerInfo DstInfo;
2162 std::tie(Dst, DstInfo) =
2163 computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
2164 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2165
2166 if (Copy == NoCopy) {
2167 // If the argument is already at the correct offset on the stack
2168 // (because we are forwarding a byval argument from our caller), we
2169 // don't need any copying.
2170 continue;
2171 } else if (Copy == CopyOnce) {
2172 // If the argument is in our local stack frame, no other argument
2173 // preparation can clobber it, so we can copy it to the final location
2174 // later.
2175 ByValTemporaries[ArgIdx] = Src;
2176 } else {
2177 assert(Copy == CopyViaTemp && "unexpected enum value");
2178 // If we might be copying this argument from the outgoing argument
2179 // stack area, we need to copy via a temporary in the local stack
2180 // frame.
2181 int TempFrameIdx = MFI.CreateStackObject(
2182 Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2183 SDValue Temp =
2184 DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
2185
2186 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2187 SDValue AlignNode =
2188 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2189
2190 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2191 SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode};
2192 ByValCopyChains.push_back(
2193 DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops));
2194 ByValTemporaries[ArgIdx] = Temp;
2195 }
2196 }
2197 if (!ByValCopyChains.empty())
2198 ByValTempChain =
2199 DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
2200 }
2201
2202 // During a tail call, stores to the argument area must happen after all of
2203 // the function's incoming arguments have been loaded because they may alias.
2204 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2205 // there's no point in doing so repeatedly so this tracks whether that's
2206 // happened yet.
2207 bool AfterFormalArgLoads = false;
2208
2209 // Walk the register/memloc assignments, inserting copies/loads. In the case
2210 // of tail call optimization, arguments are handled later.
2211 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2212 i != e;
2213 ++i, ++realArgIdx) {
2214 CCValAssign &VA = ArgLocs[i];
2215 SDValue Arg = OutVals[realArgIdx];
2216 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2217 bool isByVal = Flags.isByVal();
2218
2219 // Promote the value if needed.
2220 switch (VA.getLocInfo()) {
2221 default: llvm_unreachable("Unknown loc info!");
2222 case CCValAssign::Full: break;
2223 case CCValAssign::SExt:
2224 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2225 break;
2226 case CCValAssign::ZExt:
2227 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2228 break;
2229 case CCValAssign::AExt:
2230 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2231 break;
2232 case CCValAssign::BCvt:
2233 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2234 break;
2235 }
2236
2237 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2238 Chain = DAG.getStackArgumentTokenFactor(Chain);
2239 if (ByValTempChain) {
2240 // In case of large byval copies, re-using the stackframe for tail-calls
2241 // can lead to overwriting incoming arguments on the stack. Force
2242 // loading these stack arguments before the copy to avoid that.
2243 SmallVector<SDValue, 8> IncomingLoad;
2244 for (unsigned I = 0; I < OutVals.size(); ++I) {
2245 if (Outs[I].Flags.isByVal())
2246 continue;
2247
2248 SDValue OutVal = OutVals[I];
2249 LoadSDNode *OutLN = dyn_cast_or_null<LoadSDNode>(OutVal);
2250 if (!OutLN)
2251 continue;
2252
2253 FrameIndexSDNode *FIN =
2254 dyn_cast<FrameIndexSDNode>(OutLN->getBasePtr());
2255 if (!FIN)
2256 continue;
2257
2258 if (!MFI.isFixedObjectIndex(FIN->getIndex()))
2259 continue;
2260
2261 for (const CCValAssign &VA : ArgLocs) {
2262 if (VA.isMemLoc())
2263 IncomingLoad.push_back(OutVal.getValue(1));
2264 }
2265 }
2266
2267 // Update the chain to force loads for potentially clobbered argument
2268 // loads to happen before the byval copy.
2269 if (!IncomingLoad.empty()) {
2270 IncomingLoad.push_back(Chain);
2271 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, IncomingLoad);
2272 }
2273
2274 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
2275 ByValTempChain);
2276 }
2277 AfterFormalArgLoads = true;
2278 }
2279
2280 // f16 arguments have their size extended to 4 bytes and passed as if they
2281 // had been copied to the LSBs of a 32-bit register.
2282 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2283 if (VA.needsCustom() &&
2284 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2285 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2286 } else {
2287 // f16 arguments could have been extended prior to argument lowering.
2288 // Mask these arguments if this is a CMSE nonsecure call.
2289 auto ArgVT = Outs[realArgIdx].ArgVT;
2290 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2291 auto LocBits = VA.getLocVT().getSizeInBits();
2292 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2293 SDValue Mask =
2294 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2295 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2296 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2297 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2298 }
2299 }
2300
2301 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2302 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2303 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2304 DAG.getConstant(0, dl, MVT::i32));
2305 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2306 DAG.getConstant(1, dl, MVT::i32));
2307
2308 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2309 StackPtr, MemOpChains, isTailCall, SPDiff);
2310
2311 VA = ArgLocs[++i]; // skip ahead to next loc
2312 if (VA.isRegLoc()) {
2313 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2314 StackPtr, MemOpChains, isTailCall, SPDiff);
2315 } else {
2316 assert(VA.isMemLoc());
2317 SDValue DstAddr;
2318 MachinePointerInfo DstInfo;
2319 std::tie(DstAddr, DstInfo) =
2320 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2321 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2322 }
2323 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2324 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2325 StackPtr, MemOpChains, isTailCall, SPDiff);
2326 } else if (VA.isRegLoc()) {
2327 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2328 Outs[0].VT == MVT::i32) {
2329 assert(VA.getLocVT() == MVT::i32 &&
2330 "unexpected calling convention register assignment");
2331 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2332 "unexpected use of 'returned'");
2333 isThisReturn = true;
2334 }
2335 const TargetOptions &Options = DAG.getTarget().Options;
2336 if (Options.EmitCallSiteInfo)
2337 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2338 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2339 } else if (isByVal) {
2340 assert(VA.isMemLoc());
2341 unsigned offset = 0;
2342
2343 // True if this byval aggregate will be split between registers
2344 // and memory.
2345 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2346 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2347
2348 SDValue ByValSrc;
2349 bool NeedsStackCopy;
2350 if (auto It = ByValTemporaries.find(realArgIdx);
2351 It != ByValTemporaries.end()) {
2352 ByValSrc = It->second;
2353 NeedsStackCopy = true;
2354 } else {
2355 ByValSrc = Arg;
2356 NeedsStackCopy = !isTailCall;
2357 }
2358
2359 // If part of the argument is in registers, load them.
2360 if (CurByValIdx < ByValArgsCount) {
2361 unsigned RegBegin, RegEnd;
2362 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2363
2364 EVT PtrVT = getPointerTy(DAG.getDataLayout());
2365 unsigned int i, j;
2366 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2367 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2368 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const);
2369 SDValue Load =
2370 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2371 DAG.InferPtrAlign(AddArg));
2372 MemOpChains.push_back(Load.getValue(1));
2373 RegsToPass.push_back(std::make_pair(j, Load));
2374 }
2375
2376 // If the parameter size exceeds the register area, the "offset" value
2377 // helps us to calculate the stack slot for the remaining part properly.
2378 offset = RegEnd - RegBegin;
2379
2380 CCInfo.nextInRegsParam();
2381 }
2382
2383 // If the memory part of the argument isn't already in the correct place
2384 // (which can happen with tail calls), copy it into the argument area.
2385 if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) {
2386 auto PtrVT = getPointerTy(DAG.getDataLayout());
2387 SDValue Dst;
2388 MachinePointerInfo DstInfo;
2389 std::tie(Dst, DstInfo) =
2390 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2391 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2392 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset);
2393 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2394 MVT::i32);
2395 SDValue AlignNode =
2396 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2397
2398 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2399 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2400 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2401 Ops));
2402 }
2403 } else {
2404 assert(VA.isMemLoc());
2405 SDValue DstAddr;
2406 MachinePointerInfo DstInfo;
2407 std::tie(DstAddr, DstInfo) =
2408 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2409
2410 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2411 MemOpChains.push_back(Store);
2412 }
2413 }
2414
2415 if (!MemOpChains.empty())
2416 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2417
2418 // Build a sequence of copy-to-reg nodes chained together with token chain
2419 // and flag operands which copy the outgoing args into the appropriate regs.
2420 SDValue InGlue;
2421 for (const auto &[Reg, N] : RegsToPass) {
2422 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2423 InGlue = Chain.getValue(1);
2424 }
2425
2426 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2427 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2428 // node so that legalize doesn't hack it.
2429 bool isDirect = false;
2430
2431 const TargetMachine &TM = getTargetMachine();
2432 const GlobalValue *GVal = nullptr;
2433 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2434 GVal = G->getGlobal();
2435 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && Subtarget->isTargetMachO();
2436
2437 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2438 bool isLocalARMFunc = false;
2439 auto PtrVt = getPointerTy(DAG.getDataLayout());
2440
2441 if (Subtarget->genLongCalls()) {
2442 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2443 "long-calls codegen is not position independent!");
2444 // Handle a global address or an external symbol. If it's not one of
2445 // those, the target's already in a register, so we don't need to do
2446 // anything extra.
2447 if (isa<GlobalAddressSDNode>(Callee)) {
2448 if (Subtarget->genExecuteOnly()) {
2449 if (Subtarget->useMovt())
2450 ++NumMovwMovt;
2451 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2452 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2453 } else {
2454 // Create a constant pool entry for the callee address
2455 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2456 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2457 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2458
2459 // Get the address of the callee into a register
2460 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2461 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2462 Callee = DAG.getLoad(
2463 PtrVt, dl, DAG.getEntryNode(), Addr,
2464 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2465 }
2466 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2467 const char *Sym = S->getSymbol();
2468
2469 if (Subtarget->genExecuteOnly()) {
2470 if (Subtarget->useMovt())
2471 ++NumMovwMovt;
2472 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2473 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2474 } else {
2475 // Create a constant pool entry for the callee address
2476 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2477 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2478 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2479
2480 // Get the address of the callee into a register
2481 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2482 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2483 Callee = DAG.getLoad(
2484 PtrVt, dl, DAG.getEntryNode(), Addr,
2485 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2486 }
2487 }
2488 } else if (isa<GlobalAddressSDNode>(Callee)) {
2489 if (!PreferIndirect) {
2490 isDirect = true;
2491 bool isDef = GVal->isStrongDefinitionForLinker();
2492
2493 // ARM call to a local ARM function is predicable.
2494 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2495 // tBX takes a register source operand.
2496 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2497 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2498 Callee = DAG.getNode(
2499 ARMISD::WrapperPIC, dl, PtrVt,
2500 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2501 Callee = DAG.getLoad(
2502 PtrVt, dl, DAG.getEntryNode(), Callee,
2503 MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),
2504 MachineMemOperand::MODereferenceable |
2505 MachineMemOperand::MOInvariant);
2506 } else if (Subtarget->isTargetCOFF()) {
2507 assert(Subtarget->isTargetWindows() &&
2508 "Windows is the only supported COFF target");
2509 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2510 if (GVal->hasDLLImportStorageClass())
2511 TargetFlags = ARMII::MO_DLLIMPORT;
2512 else if (!TM.shouldAssumeDSOLocal(GVal))
2513 TargetFlags = ARMII::MO_COFFSTUB;
2514 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2515 TargetFlags);
2516 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2517 Callee =
2518 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2519 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2520 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2521 } else {
2522 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2523 }
2524 }
2525 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2526 isDirect = true;
2527 // tBX takes a register source operand.
2528 const char *Sym = S->getSymbol();
2529 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2530 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2531 ARMConstantPoolValue *CPV =
2532 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2533 ARMPCLabelIndex, 4);
2534 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2535 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2536 Callee = DAG.getLoad(
2537 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2538 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2539 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2540 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2541 } else {
2542 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2543 }
2544 }
2545
2546 if (isCmseNSCall) {
2547 assert(!isARMFunc && !isDirect &&
2548 "Cannot handle call to ARM function or direct call");
2549 if (NumBytes > 0) {
2550 DAG.getContext()->diagnose(
2551 DiagnosticInfoUnsupported(DAG.getMachineFunction().getFunction(),
2552 "call to non-secure function would require "
2553 "passing arguments on stack",
2554 dl.getDebugLoc()));
2555 }
2556 if (isStructRet) {
2557 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2559 "call to non-secure function would return value through pointer",
2560 dl.getDebugLoc()));
2561 }
2562 }
2563
2564 // FIXME: handle tail calls differently.
2565 unsigned CallOpc;
2566 if (Subtarget->isThumb()) {
2567 if (GuardWithBTI)
2568 CallOpc = ARMISD::t2CALL_BTI;
2569 else if (isCmseNSCall)
2570 CallOpc = ARMISD::tSECALL;
2571 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2572 CallOpc = ARMISD::CALL_NOLINK;
2573 else
2574 CallOpc = ARMISD::CALL;
2575 } else {
2576 if (!isDirect && !Subtarget->hasV5TOps())
2577 CallOpc = ARMISD::CALL_NOLINK;
2578 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2579 // Emit regular call when code size is the priority
2580 !Subtarget->hasMinSize())
2581 // "mov lr, pc; b _foo" to avoid confusing the RSP
2582 CallOpc = ARMISD::CALL_NOLINK;
2583 else
2584 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2585 }
2586
2587 // We don't usually want to end the call-sequence here because we would tidy
2588 // the frame up *after* the call, however in the ABI-changing tail-call case
2589 // we've carefully laid out the parameters so that when sp is reset they'll be
2590 // in the correct location.
2591 if (isTailCall && !isSibCall) {
2592 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2593 InGlue = Chain.getValue(1);
2594 }
2595
2596 std::vector<SDValue> Ops;
2597 Ops.push_back(Chain);
2598 Ops.push_back(Callee);
2599
2600 if (isTailCall) {
2601 Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32));
2602 }
2603
2604 // Add argument registers to the end of the list so that they are known live
2605 // into the call.
2606 for (const auto &[Reg, N] : RegsToPass)
2607 Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2608
2609 // Add a register mask operand representing the call-preserved registers.
2610 const uint32_t *Mask;
2611 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2612 if (isThisReturn) {
2613 // For 'this' returns, use the R0-preserving mask if applicable
2614 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2615 if (!Mask) {
2616 // Set isThisReturn to false if the calling convention is not one that
2617 // allows 'returned' to be modeled in this way, so LowerCallResult does
2618 // not try to pass 'this' straight through
2619 isThisReturn = false;
2620 Mask = ARI->getCallPreservedMask(MF, CallConv);
2621 }
2622 } else
2623 Mask = ARI->getCallPreservedMask(MF, CallConv);
2624
2625 assert(Mask && "Missing call preserved mask for calling convention");
2626 Ops.push_back(DAG.getRegisterMask(Mask));
2627
2628 if (InGlue.getNode())
2629 Ops.push_back(InGlue);
2630
2631 if (isTailCall) {
2632 MF.getFrameInfo().setHasTailCall();
2633 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops);
2634 if (CLI.CFIType)
2635 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2636 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2637 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2638 return Ret;
2639 }
2640
2641 // Returns a chain and a flag for retval copy to use.
2642 Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops);
2643 if (CLI.CFIType)
2644 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2645 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2646 InGlue = Chain.getValue(1);
2647 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2648
2649 // If we're guaranteeing tail-calls will be honoured, the callee must
2650 // pop its own argument stack on return. But this call is *not* a tail call so
2651 // we need to undo that after it returns to restore the status-quo.
2652 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2653 uint64_t CalleePopBytes =
2654 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1U;
2655
2656 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2657 if (!Ins.empty())
2658 InGlue = Chain.getValue(1);
2659
2660 // Handle result values, copying them out of physregs into vregs that we
2661 // return.
2662 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2663 InVals, isThisReturn,
2664 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2665}
2666
2667/// HandleByVal - Every parameter *after* a byval parameter is passed
2668/// on the stack. Remember the next parameter register to allocate,
2669 /// and then confiscate the rest of the parameter registers to ensure
2670/// this.
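///
/// For example, a 20-byte byval with 4-byte alignment arriving when r1 is the
/// next free register (and no stack arguments allocated yet) gets r1-r3
/// (12 bytes, so Excess = 12); the remaining 8 bytes go to the stack and Size
/// is reduced to 8. A 12-byte byval in the same position fits entirely in
/// r1-r3 and Size becomes 0.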
2671void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2672 Align Alignment) const {
2673 // Byval (as with any stack) slots are always at least 4 byte aligned.
2674 Alignment = std::max(Alignment, Align(4));
2675
2676 MCRegister Reg = State->AllocateReg(GPRArgRegs);
2677 if (!Reg)
2678 return;
2679
2680 unsigned AlignInRegs = Alignment.value() / 4;
2681 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2682 for (unsigned i = 0; i < Waste; ++i)
2683 Reg = State->AllocateReg(GPRArgRegs);
2684
2685 if (!Reg)
2686 return;
2687
2688 unsigned Excess = 4 * (ARM::R4 - Reg);
2689
2690 // Special case when NSAA != SP and the parameter size is greater than the
2691 // size of all remaining GPR regs. In that case we can't split the parameter,
2692 // we must send it to the stack. We also must set NCRN to R4, so we waste all
2693 // the remaining registers.
2694 const unsigned NSAAOffset = State->getStackSize();
2695 if (NSAAOffset != 0 && Size > Excess) {
2696 while (State->AllocateReg(GPRArgRegs))
2697 ;
2698 return;
2699 }
2700
2701 // The first register for the byval parameter is the first register that
2702 // wasn't allocated before this method call, so it would be "reg".
2703 // If the parameter is small enough to be saved in the range [reg, r4), then
2704 // the end (first after last) register would be reg + param-size-in-regs;
2705 // otherwise the parameter is split between registers and the stack, and
2706 // the end register would be r4 in that case.
2707 unsigned ByValRegBegin = Reg;
2708 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2709 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2710 // Note, the first register was already allocated at the beginning of this
2711 // function; allocate the remaining registers we need.
2712 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2713 State->AllocateReg(GPRArgRegs);
2714 // A byval parameter that is split between registers and memory needs its
2715 // size truncated here.
2716 // In the case where the entire structure fits in registers, we set the
2717 // size in memory to zero.
2718 Size = std::max<int>(Size - Excess, 0);
2719}
2720
2721/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2722/// for tail call optimization. Targets which want to do tail call
2723/// optimization should implement this function. Note that this function also
2724/// processes musttail calls, so when this function returns false on a valid
2725/// musttail call, a fatal backend error occurs.
2726bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2727 TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
2728 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
2729 CallingConv::ID CalleeCC = CLI.CallConv;
2730 SDValue Callee = CLI.Callee;
2731 bool isVarArg = CLI.IsVarArg;
2732 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2733 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2734 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2735 const SelectionDAG &DAG = CLI.DAG;
2736 MachineFunction &MF = DAG.getMachineFunction();
2737 const Function &CallerF = MF.getFunction();
2738 CallingConv::ID CallerCC = CallerF.getCallingConv();
2739
2740 assert(Subtarget->supportsTailCall());
2741
2742 // Indirect tail-calls require a register to hold the target address. That
2743 // register must be:
2744 // * Allocatable (i.e. r0-r7 if the target is Thumb1).
2745 // * Not callee-saved, so must be one of r0-r3 or r12.
2746 // * Not used to hold an argument to the tail-called function, which might be
2747 // in r0-r3.
2748 // * Not used to hold the return address authentication code, which is in r12
2749 // if enabled.
2750 // Sometimes, no register matches all of these conditions, so we can't do a
2751 // tail-call.
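// For example, an indirect tail call that passes arguments in all of r0-r3
// from a function that signs its return address (so r12 is reserved) leaves
// no register to hold the target address, and the tail call is rejected.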
2752 if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
2753 SmallSet<MCPhysReg, 5> AddressRegisters = {ARM::R0, ARM::R1, ARM::R2,
2754 ARM::R3};
2755 if (!(Subtarget->isThumb1Only() ||
2756 MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)))
2757 AddressRegisters.insert(ARM::R12);
2758 for (const CCValAssign &AL : ArgLocs)
2759 if (AL.isRegLoc())
2760 AddressRegisters.erase(AL.getLocReg());
2761 if (AddressRegisters.empty()) {
2762 LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n");
2763 return false;
2764 }
2765 }
2766
2767 // Look for obvious safe cases to perform tail call optimization that do not
2768 // require ABI changes. This is what gcc calls sibcall.
2769
2770 // Exception-handling functions need a special set of instructions to indicate
2771 // a return to the hardware. Tail-calling another function would probably
2772 // break this.
2773 if (CallerF.hasFnAttribute("interrupt")) {
2774 LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n");
2775 return false;
2776 }
2777
2778 if (canGuaranteeTCO(CalleeCC,
2779 getTargetMachine().Options.GuaranteedTailCallOpt)) {
2780 LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false")
2781 << " (guaranteed tail-call CC)\n");
2782 return CalleeCC == CallerCC;
2783 }
2784
2785 // Also avoid sibcall optimization if either caller or callee uses struct
2786 // return semantics.
2787 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
2788 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
2789 if (isCalleeStructRet != isCallerStructRet) {
2790 LLVM_DEBUG(dbgs() << "false (struct-ret)\n");
2791 return false;
2792 }
2793
2794 // Externally-defined functions with weak linkage should not be
2795 // tail-called on ARM when the OS does not support dynamic
2796 // pre-emption of symbols, as the AAELF spec requires normal calls
2797 // to undefined weak functions to be replaced with a NOP or jump to the
2798 // next instruction. The behaviour of branch instructions in this
2799 // situation (as used for tail calls) is implementation-defined, so we
2800 // cannot rely on the linker replacing the tail call with a return.
2801 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2802 const GlobalValue *GV = G->getGlobal();
2803 const Triple &TT = getTargetMachine().getTargetTriple();
2804 if (GV->hasExternalWeakLinkage() &&
2805 (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
2806 TT.isOSBinFormatMachO())) {
2807 LLVM_DEBUG(dbgs() << "false (external weak linkage)\n");
2808 return false;
2809 }
2810 }
2811
2812 // Check that the call results are passed in the same way.
2813 LLVMContext &C = *DAG.getContext();
2814 if (!CCState::resultsCompatible(
2815 getEffectiveCallingConv(CalleeCC, isVarArg),
2816 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
2817 CCAssignFnForReturn(CalleeCC, isVarArg),
2818 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) {
2819 LLVM_DEBUG(dbgs() << "false (incompatible results)\n");
2820 return false;
2821 }
2822 // The callee has to preserve all registers the caller needs to preserve.
2823 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2824 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2825 if (CalleeCC != CallerCC) {
2826 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2827 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) {
2828 LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n");
2829 return false;
2830 }
2831 }
2832
2833 // If Caller's vararg argument has been split between registers and stack, do
2834 // not perform tail call, since part of the argument is in caller's local
2835 // frame.
2836 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
2837 if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
2838 LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
2839 return false;
2840 }
2841
2842 // If the callee takes no arguments then go on to check the results of the
2843 // call.
2844 const MachineRegisterInfo &MRI = MF.getRegInfo();
2845 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
2846 LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
2847 return false;
2848 }
2849
2850 // If the stack arguments for this call do not fit into our own save area then
2851 // the call cannot be made tail.
2852 if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
2853 return false;
2854
2855 LLVM_DEBUG(dbgs() << "true\n");
2856 return true;
2857}
2858
2859bool
2860ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2861 MachineFunction &MF, bool isVarArg,
2862 const SmallVectorImpl<ISD::OutputArg> &Outs,
2863 LLVMContext &Context, const Type *RetTy) const {
2864 SmallVector<CCValAssign, 16> RVLocs;
2865 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2866 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2867}
2868
2869 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
2870 const SDLoc &DL, SelectionDAG &DAG) {
2871 const MachineFunction &MF = DAG.getMachineFunction();
2872 const Function &F = MF.getFunction();
2873
2874 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
2875
2876 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
2877 // version of the "preferred return address". These offsets affect the return
2878 // instruction if this is a return from PL1 without hypervisor extensions.
2879 // IRQ/FIQ: +4 "subs pc, lr, #4"
2880 // SWI: 0 "subs pc, lr, #0"
2881 // ABORT: +4 "subs pc, lr, #4"
2882 // UNDEF: +4/+2 "subs pc, lr, #0"
2883 // UNDEF varies depending on where the exception came from ARM or Thumb
2884 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
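// For example, a handler defined in IR roughly as
//   define void @handler() "interrupt"="IRQ" { ... }
// takes the IRQ row above and returns with "subs pc, lr, #4".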
2885
2886 int64_t LROffset;
2887 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
2888 IntKind == "ABORT")
2889 LROffset = 4;
2890 else if (IntKind == "SWI" || IntKind == "UNDEF")
2891 LROffset = 0;
2892 else
2893 report_fatal_error("Unsupported interrupt attribute. If present, value "
2894 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2895
2896 RetOps.insert(RetOps.begin() + 1,
2897 DAG.getConstant(LROffset, DL, MVT::i32, false));
2898
2899 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
2900}
2901
2902SDValue
2903ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2904 bool isVarArg,
2906 const SmallVectorImpl<SDValue> &OutVals,
2907 const SDLoc &dl, SelectionDAG &DAG) const {
2908 // CCValAssign - represent the assignment of the return value to a location.
2909 SmallVector<CCValAssign, 16> RVLocs;
2910
2911 // CCState - Info about the registers and stack slots.
2912 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2913 *DAG.getContext());
2914
2915 // Analyze outgoing return values.
2916 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2917
2918 SDValue Glue;
2919 SmallVector<SDValue, 4> RetOps;
2920 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2921 bool isLittleEndian = Subtarget->isLittle();
2922
2923 MachineFunction &MF = DAG.getMachineFunction();
2924 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2925 AFI->setReturnRegsCount(RVLocs.size());
2926
2927 // Report error if cmse entry function returns structure through first ptr arg.
2928 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
2929 // Note: using an empty SDLoc(), as the first line of the function is a
2930 // better place to report than the last line.
2931 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2933 "secure entry function would return value through pointer",
2934 SDLoc().getDebugLoc()));
2935 }
2936
2937 // Copy the result values into the output registers.
2938 for (unsigned i = 0, realRVLocIdx = 0;
2939 i != RVLocs.size();
2940 ++i, ++realRVLocIdx) {
2941 CCValAssign &VA = RVLocs[i];
2942 assert(VA.isRegLoc() && "Can only return in registers!");
2943
2944 SDValue Arg = OutVals[realRVLocIdx];
2945 bool ReturnF16 = false;
2946
2947 if (Subtarget->hasFullFP16() && getTM().isTargetHardFloat()) {
2948 // Half-precision return values can be returned like this:
2949 //
2950 // t11 f16 = fadd ...
2951 // t12: i16 = bitcast t11
2952 // t13: i32 = zero_extend t12
2953 // t14: f32 = bitcast t13 <~~~~~~~ Arg
2954 //
2955 // to avoid code generation for bitcasts, we simply set Arg to the node
2956 // that produces the f16 value, t11 in this case.
2957 //
2958 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
2959 SDValue ZE = Arg.getOperand(0);
2960 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
2961 SDValue BC = ZE.getOperand(0);
2962 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
2963 Arg = BC.getOperand(0);
2964 ReturnF16 = true;
2965 }
2966 }
2967 }
2968 }
2969
2970 switch (VA.getLocInfo()) {
2971 default: llvm_unreachable("Unknown loc info!");
2972 case CCValAssign::Full: break;
2973 case CCValAssign::BCvt:
2974 if (!ReturnF16)
2975 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2976 break;
2977 }
2978
2979 // Mask f16 arguments if this is a CMSE nonsecure entry.
2980 auto RetVT = Outs[realRVLocIdx].ArgVT;
2981 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
2982 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
2983 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2984 } else {
2985 auto LocBits = VA.getLocVT().getSizeInBits();
2986 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
2987 SDValue Mask =
2988 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2989 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2990 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2991 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2992 }
2993 }
2994
2995 if (VA.needsCustom() &&
2996 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
2997 if (VA.getLocVT() == MVT::v2f64) {
2998 // Extract the first half and return it in two registers.
2999 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3000 DAG.getConstant(0, dl, MVT::i32));
3001 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3002 DAG.getVTList(MVT::i32, MVT::i32), Half);
3003
3004 Chain =
3005 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3006 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3007 Glue = Chain.getValue(1);
3008 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3009 VA = RVLocs[++i]; // skip ahead to next loc
3010 Chain =
3011 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3012 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3013 Glue = Chain.getValue(1);
3014 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3015 VA = RVLocs[++i]; // skip ahead to next loc
3016
3017 // Extract the 2nd half and fall through to handle it as an f64 value.
3018 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3019 DAG.getConstant(1, dl, MVT::i32));
3020 }
3021 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3022 // available.
3023 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3024 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3025 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3026 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3027 Glue = Chain.getValue(1);
3028 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3029 VA = RVLocs[++i]; // skip ahead to next loc
3030 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3031 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3032 } else
3033 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3034
3035 // Guarantee that all emitted copies are
3036 // stuck together, avoiding something bad.
3037 Glue = Chain.getValue(1);
3038 RetOps.push_back(DAG.getRegister(
3039 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3040 }
3041 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3042 const MCPhysReg *I =
3043 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3044 if (I) {
3045 for (; *I; ++I) {
3046 if (ARM::GPRRegClass.contains(*I))
3047 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3048 else if (ARM::DPRRegClass.contains(*I))
3049 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3050 else
3051 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3052 }
3053 }
3054
3055 // Update chain and glue.
3056 RetOps[0] = Chain;
3057 if (Glue.getNode())
3058 RetOps.push_back(Glue);
3059
3060 // CPUs which aren't M-class use a special sequence to return from
3061 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3062 // though we use "subs pc, lr, #N").
3063 //
3064 // M-class CPUs actually use a normal return sequence with a special
3065 // (hardware-provided) value in LR, so the normal code path works.
3066 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3067 !Subtarget->isMClass()) {
3068 if (Subtarget->isThumb1Only())
3069 report_fatal_error("interrupt attribute is not supported in Thumb1");
3070 return LowerInterruptReturn(RetOps, dl, DAG);
3071 }
3072
3073 unsigned RetNode =
3074 AFI->isCmseNSEntryFunction() ? ARMISD::SERET_GLUE : ARMISD::RET_GLUE;
3075 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3076}
3077
3078bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3079 if (N->getNumValues() != 1)
3080 return false;
3081 if (!N->hasNUsesOfValue(1, 0))
3082 return false;
3083
3084 SDValue TCChain = Chain;
3085 SDNode *Copy = *N->user_begin();
3086 if (Copy->getOpcode() == ISD::CopyToReg) {
3087 // If the copy has a glue operand, we conservatively assume it isn't safe to
3088 // perform a tail call.
3089 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3090 return false;
3091 TCChain = Copy->getOperand(0);
3092 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3093 SDNode *VMov = Copy;
3094 // f64 returned in a pair of GPRs.
3095 SmallPtrSet<SDNode*, 2> Copies;
3096 for (SDNode *U : VMov->users()) {
3097 if (U->getOpcode() != ISD::CopyToReg)
3098 return false;
3099 Copies.insert(U);
3100 }
3101 if (Copies.size() > 2)
3102 return false;
3103
3104 for (SDNode *U : VMov->users()) {
3105 SDValue UseChain = U->getOperand(0);
3106 if (Copies.count(UseChain.getNode()))
3107 // Second CopyToReg
3108 Copy = U;
3109 else {
3110 // We are at the top of this chain.
3111 // If the copy has a glue operand, we conservatively assume it
3112 // isn't safe to perform a tail call.
3113 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3114 return false;
3115 // First CopyToReg
3116 TCChain = UseChain;
3117 }
3118 }
3119 } else if (Copy->getOpcode() == ISD::BITCAST) {
3120 // f32 returned in a single GPR.
3121 if (!Copy->hasOneUse())
3122 return false;
3123 Copy = *Copy->user_begin();
3124 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3125 return false;
3126 // If the copy has a glue operand, we conservatively assume it isn't safe to
3127 // perform a tail call.
3128 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3129 return false;
3130 TCChain = Copy->getOperand(0);
3131 } else {
3132 return false;
3133 }
3134
3135 bool HasRet = false;
3136 for (const SDNode *U : Copy->users()) {
3137 if (U->getOpcode() != ARMISD::RET_GLUE &&
3138 U->getOpcode() != ARMISD::INTRET_GLUE)
3139 return false;
3140 HasRet = true;
3141 }
3142
3143 if (!HasRet)
3144 return false;
3145
3146 Chain = TCChain;
3147 return true;
3148}
3149
3150bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3151 if (!Subtarget->supportsTailCall())
3152 return false;
3153
3154 if (!CI->isTailCall())
3155 return false;
3156
3157 return true;
3158}
3159
3160 // Trying to write a 64-bit value, so we need to split it into two 32-bit
3161 // values first, and pass the low and high parts through.
3162 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
3163 SDLoc DL(Op);
3164 SDValue WriteValue = Op->getOperand(2);
3165
3166 // This function is only supposed to be called for i64 type argument.
3167 assert(WriteValue.getValueType() == MVT::i64
3168 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3169
3170 SDValue Lo, Hi;
3171 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3172 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3173 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3174}
3175
3176// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3177// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3178// one of the above mentioned nodes. It has to be wrapped because otherwise
3179// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3180// be used to form addressing mode. These wrapped nodes will be selected
3181// into MOVi.
3182SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3183 SelectionDAG &DAG) const {
3184 EVT PtrVT = Op.getValueType();
3185 // FIXME there is no actual debug info here
3186 SDLoc dl(Op);
3187 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3188 SDValue Res;
3189
3190 // When generating execute-only code Constant Pools must be promoted to the
3191 // global data section. It's a bit ugly that we can't share them across basic
3192 // blocks, but this way we guarantee that execute-only behaves correctly with
3193 // position-independent addressing modes.
3194 if (Subtarget->genExecuteOnly()) {
3195 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3196 auto *T = CP->getType();
3197 auto C = const_cast<Constant*>(CP->getConstVal());
3198 auto M = DAG.getMachineFunction().getFunction().getParent();
3199 auto GV = new GlobalVariable(
3200 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3201 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
3202 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3203 Twine(AFI->createPICLabelUId())
3204 );
3205 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
3206 dl, PtrVT);
3207 return LowerGlobalAddress(GA, DAG);
3208 }
3209
3210 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3211 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3212 Align CPAlign = CP->getAlign();
3213 if (Subtarget->isThumb1Only())
3214 CPAlign = std::max(CPAlign, Align(4));
3215 if (CP->isMachineConstantPoolEntry())
3216 Res =
3217 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3218 else
3219 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3220 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3221}
3222
3223 unsigned ARMTargetLowering::getJumpTableEncoding() const {
3224 // If we don't have a 32-bit pc-relative branch instruction then the jump
3225 // table consists of block addresses. Usually this is inline, but for
3226 // execute-only it must be placed out-of-line.
3227 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3230}
3231
3232SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3233 SelectionDAG &DAG) const {
3234 MachineFunction &MF = DAG.getMachineFunction();
3235 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3236 unsigned ARMPCLabelIndex = 0;
3237 SDLoc DL(Op);
3238 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3239 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3240 SDValue CPAddr;
3241 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3242 if (!IsPositionIndependent) {
3243 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3244 } else {
3245 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3246 ARMPCLabelIndex = AFI->createPICLabelUId();
3247 ARMConstantPoolValue *CPV =
3248 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3249 ARMCP::CPBlockAddress, PCAdj);
3250 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3251 }
3252 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3253 SDValue Result = DAG.getLoad(
3254 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3255 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3256 if (!IsPositionIndependent)
3257 return Result;
3258 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3259 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3260}
3261
3262/// Convert a TLS address reference into the correct sequence of loads
3263/// and calls to compute the variable's address for Darwin, and return an
3264/// SDValue containing the final node.
3265
3266/// Darwin only has one TLS scheme which must be capable of dealing with the
3267/// fully general situation, in the worst case. This means:
3268/// + "extern __thread" declaration.
3269/// + Defined in a possibly unknown dynamic library.
3270///
3271/// The general system is that each __thread variable has a [3 x i32] descriptor
3272/// which contains information used by the runtime to calculate the address. The
3273/// only part of this the compiler needs to know about is the first word, which
3274/// contains a function pointer that must be called with the address of the
3275/// entire descriptor in "r0".
3276///
3277/// Since this descriptor may be in a different unit, in general access must
3278/// proceed along the usual ARM rules. A common sequence to produce is:
3279///
3280/// movw rT1, :lower16:_var$non_lazy_ptr
3281/// movt rT1, :upper16:_var$non_lazy_ptr
3282/// ldr r0, [rT1]
3283/// ldr rT2, [r0]
3284/// blx rT2
3285/// [...address now in r0...]
3286SDValue
3287ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3288 SelectionDAG &DAG) const {
3289 assert(Subtarget->isTargetDarwin() &&
3290 "This function expects a Darwin target");
3291 SDLoc DL(Op);
3292
3293 // First step is to get the address of the actual global symbol. This is where
3294 // the TLS descriptor lives.
3295 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3296
3297 // The first entry in the descriptor is a function pointer that we must call
3298 // to obtain the address of the variable.
3299 SDValue Chain = DAG.getEntryNode();
3300 SDValue FuncTLVGet = DAG.getLoad(
3301 MVT::i32, DL, Chain, DescAddr,
3302 MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4),
3303 MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
3304 MachineMemOperand::MOInvariant);
3305 Chain = FuncTLVGet.getValue(1);
3306
3307 MachineFunction &F = DAG.getMachineFunction();
3308 MachineFrameInfo &MFI = F.getFrameInfo();
3309 MFI.setAdjustsStack(true);
3310
3311 // TLS calls preserve all registers except those that absolutely must be
3312 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3313 // silly).
3314 auto TRI =
3315 getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
3316 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3317 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3318
3319 // Finally, we can make the call. This is just a degenerate version of a
3320 // normal ARM call node: r0 takes the address of the descriptor, and
3321 // returns the address of the variable in this thread.
3322 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3323 Chain =
3324 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3325 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3326 DAG.getRegisterMask(Mask), Chain.getValue(1));
3327 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3328}
3329
3330SDValue
3331ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3332 SelectionDAG &DAG) const {
3333 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3334
3335 SDValue Chain = DAG.getEntryNode();
3336 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3337 SDLoc DL(Op);
3338
3339 // Load the current TEB (thread environment block)
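// The intrinsic operands below encode "mrc p15, #0, <Rt>, c13, c0, #2", a
// read of TPIDRURW, which Windows uses to hold the TEB pointer.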
3340 SDValue Ops[] = {Chain,
3341 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3342 DAG.getTargetConstant(15, DL, MVT::i32),
3343 DAG.getTargetConstant(0, DL, MVT::i32),
3344 DAG.getTargetConstant(13, DL, MVT::i32),
3345 DAG.getTargetConstant(0, DL, MVT::i32),
3346 DAG.getTargetConstant(2, DL, MVT::i32)};
3347 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3348 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3349
3350 SDValue TEB = CurrentTEB.getValue(0);
3351 Chain = CurrentTEB.getValue(1);
3352
3353 // Load the ThreadLocalStoragePointer from the TEB
3354 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3355 SDValue TLSArray =
3356 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3357 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3358
3359 // The pointer to the thread's TLS data area is located at offset
3360 // (TLS index * 4) into the TLSArray.
3361
3362 // Load the TLS index from the C runtime
3363 SDValue TLSIndex =
3364 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3365 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3366 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3367
3368 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3369 DAG.getConstant(2, DL, MVT::i32));
3370 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3371 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3372 MachinePointerInfo());
3373
3374 // Get the offset of the start of the .tls section (section base)
3375 const auto *GA = cast<GlobalAddressSDNode>(Op);
3376 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3377 SDValue Offset = DAG.getLoad(
3378 PtrVT, DL, Chain,
3379 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3380 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3382
3383 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3384}
3385
3386// Lower ISD::GlobalTLSAddress using the "general dynamic" model
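// The argument passed to __tls_get_addr is the address of the variable's TLS
// descriptor in the GOT. Roughly: a constant-pool entry carrying a (tlsgd)
// relocation is loaded, PIC-adjusted to form the descriptor's address, and
// __tls_get_addr is called with that address in r0.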
3387SDValue
3388ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3389 SelectionDAG &DAG) const {
3390 SDLoc dl(GA);
3391 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3392 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3393 MachineFunction &MF = DAG.getMachineFunction();
3394 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3395 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3396 ARMConstantPoolValue *CPV =
3397 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3398 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3399 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3400 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3401 Argument = DAG.getLoad(
3402 PtrVT, dl, DAG.getEntryNode(), Argument,
3404 SDValue Chain = Argument.getValue(1);
3405
3406 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3407 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3408
3409 // call __tls_get_addr.
3411 Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext()));
3412
3413 // FIXME: is there useful debug info available here?
3414 TargetLowering::CallLoweringInfo CLI(DAG);
3415 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3417 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3418
3419 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3420 return CallResult.first;
3421}
3422
3423// Lower ISD::GlobalTLSAddress using the "initial exec" or
3424// "local exec" model.
3425SDValue
3426ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3427 SelectionDAG &DAG,
3428 TLSModel::Model model) const {
3429 const GlobalValue *GV = GA->getGlobal();
3430 SDLoc dl(GA);
3432 SDValue Chain = DAG.getEntryNode();
3433 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3434 // Get the Thread Pointer
3435 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3436
3437 if (model == TLSModel::InitialExec) {
3438 MachineFunction &MF = DAG.getMachineFunction();
3439 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3440 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3441 // Initial exec model.
3442 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3443 ARMConstantPoolValue *CPV =
3444 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3446 true);
3447 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3448 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3449 Offset = DAG.getLoad(
3450 PtrVT, dl, Chain, Offset,
3452 Chain = Offset.getValue(1);
3453
3454 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3455 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3456
3457 Offset = DAG.getLoad(
3458 PtrVT, dl, Chain, Offset,
3460 } else {
3461 // local exec model
3462 assert(model == TLSModel::LocalExec);
3463 ARMConstantPoolValue *CPV =
3465 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3466 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3467 Offset = DAG.getLoad(
3468 PtrVT, dl, Chain, Offset,
3470 }
3471
3472 // The address of the thread local variable is the add of the thread
3473 // pointer with the offset of the variable.
3474 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3475}
3476
3477SDValue
3478ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3479 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3480 if (DAG.getTarget().useEmulatedTLS())
3481 return LowerToTLSEmulatedModel(GA, DAG);
3482
3483 if (Subtarget->isTargetDarwin())
3484 return LowerGlobalTLSAddressDarwin(Op, DAG);
3485
3486 if (Subtarget->isTargetWindows())
3487 return LowerGlobalTLSAddressWindows(Op, DAG);
3488
3489 // TODO: implement the "local dynamic" model
3490 assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3492
3493 switch (model) {
3496 return LowerToTLSGeneralDynamicModel(GA, DAG);
3499 return LowerToTLSExecModels(GA, DAG, model);
3500 }
3501 llvm_unreachable("bogus TLS model");
3502}
3503
3504/// Return true if all users of V are within function F, looking through
3505/// ConstantExprs.
3506static bool allUsersAreInFunction(const Value *V, const Function *F) {
3507 SmallVector<const User*,4> Worklist(V->users());
3508 while (!Worklist.empty()) {
3509 auto *U = Worklist.pop_back_val();
3510 if (isa<ConstantExpr>(U)) {
3511 append_range(Worklist, U->users());
3512 continue;
3513 }
3514
3515 auto *I = dyn_cast<Instruction>(U);
3516 if (!I || I->getParent()->getParent() != F)
3517 return false;
3518 }
3519 return true;
3520}
3521
3523 const GlobalValue *GV, SelectionDAG &DAG,
3524 EVT PtrVT, const SDLoc &dl) {
3525 // If we're creating a pool entry for a constant global with unnamed address,
3526 // and the global is small enough, we can emit it inline into the constant pool
3527 // to save ourselves an indirection.
3528 //
3529 // This is a win if the constant is only used in one function (so it doesn't
3530 // need to be duplicated) or duplicating the constant wouldn't increase code
3531 // size (implying the constant is no larger than 4 bytes).
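// For example, a small internal unnamed_addr string constant can have its
// storage become the constant-pool entry itself, so code that takes its
// address only needs the address of the pool entry and no separate global.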
3532 const Function &F = DAG.getMachineFunction().getFunction();
3533
3534 // We rely on this decision to inline being idempotent and unrelated to the
3535 // use-site. We know that if we inline a variable at one use site, we'll
3536 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3537 // doesn't know about this optimization, so bail out if it's enabled;
3538 // otherwise we could decide to inline here (and thus never emit the GV)
3539 // while fast-isel-generated code still requires the GV.
3542 return SDValue();
3543
3544 auto *GVar = dyn_cast<GlobalVariable>(GV);
3545 if (!GVar || !GVar->hasInitializer() ||
3546 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3547 !GVar->hasLocalLinkage())
3548 return SDValue();
3549
3550 // If we inline a value that contains relocations, we move the relocations
3551 // from .data to .text. This is not allowed in position-independent code.
3552 auto *Init = GVar->getInitializer();
3553 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3554 Init->needsDynamicRelocation())
3555 return SDValue();
3556
3557 // The constant islands pass can only really deal with alignment requests
3558 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3559 // any type wanting greater alignment requirements than 4 bytes. We also
3560 // can only promote constants that are multiples of 4 bytes in size or
3561 // are paddable to a multiple of 4. Currently we only try to pad constants
3562 // that are strings, for simplicity.
3563 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3564 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3565 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
3566 unsigned RequiredPadding = 4 - (Size % 4);
3567 bool PaddingPossible =
3568 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3569 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3570 Size == 0)
3571 return SDValue();
3572
3573 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3575 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3576
3577 // We can't bloat the constant pool too much, else the ConstantIslands pass
3578 // may fail to converge. If we haven't promoted this global yet (it may have
3579 // multiple uses), and promoting it would increase the constant pool size (Sz
3580 // > 4), ensure we have space to do so up to MaxTotal.
3581 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3582 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3584 return SDValue();
3585
3586 // This is only valid if all users are in a single function; we can't clone
3587 // the constant in general. The LLVM IR unnamed_addr allows merging
3588 // constants, but not cloning them.
3589 //
3590 // We could potentially allow cloning if we could prove all uses of the
3591 // constant in the current function don't care about the address, like
3592 // printf format strings. But that isn't implemented for now.
3593 if (!allUsersAreInFunction(GVar, &F))
3594 return SDValue();
3595
3596 // We're going to inline this global. Pad it out if needed.
3597 if (RequiredPadding != 4) {
3598 StringRef S = CDAInit->getAsString();
3599
3601 std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3602 while (RequiredPadding--)
3603 V.push_back(0);
3605 }
3606
3607 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3608 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
3609 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3612 PaddedSize - 4);
3613 }
3614 ++NumConstpoolPromoted;
3615 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3616}
3617
3619 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3620 if (!(GV = GA->getAliaseeObject()))
3621 return false;
3622 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3623 return V->isConstant();
3624 return isa<Function>(GV);
3625}
3626
3627SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3628 SelectionDAG &DAG) const {
3629 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3630 default: llvm_unreachable("unknown object format");
3631 case Triple::COFF:
3632 return LowerGlobalAddressWindows(Op, DAG);
3633 case Triple::ELF:
3634 return LowerGlobalAddressELF(Op, DAG);
3635 case Triple::MachO:
3636 return LowerGlobalAddressDarwin(Op, DAG);
3637 }
3638}
3639
3640SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3641 SelectionDAG &DAG) const {
3642 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3643 SDLoc dl(Op);
3644 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3645 bool IsRO = isReadOnly(GV);
3646
3647 // Only promote to a constant pool if we are not generating an execute-only (XO) text section.
3648 if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
3649 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3650 return V;
3651
3652 if (isPositionIndependent()) {
3654 GV, dl, PtrVT, 0, GV->isDSOLocal() ? 0 : ARMII::MO_GOT);
3655 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3656 if (!GV->isDSOLocal())
3657 Result =
3658 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3660 return Result;
3661 } else if (Subtarget->isROPI() && IsRO) {
3662 // PC-relative.
3663 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3664 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3665 return Result;
3666 } else if (Subtarget->isRWPI() && !IsRO) {
3667 // SB-relative.
3668 SDValue RelAddr;
3669 if (Subtarget->useMovt()) {
3670 ++NumMovwMovt;
3671 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3672 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3673 } else { // use literal pool for address constant
3674 ARMConstantPoolValue *CPV =
3676 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3677 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3678 RelAddr = DAG.getLoad(
3679 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3681 }
3682 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3683 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3684 return Result;
3685 }
3686
3687 // If we have T2 ops, we can materialize the address directly via a movw/movt
3688 // pair. This is always cheaper. If we need to generate execute-only code and
3689 // only have Thumb1 available, we can't use a constant pool and are forced to
3690 // use immediate relocations.
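// The movw/movt form is, e.g.:
//   movw rD, :lower16:sym
//   movt rD, :upper16:sym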
3691 if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
3692 if (Subtarget->useMovt())
3693 ++NumMovwMovt;
3694 // FIXME: Once remat is capable of dealing with instructions with register
3695 // operands, expand this into two nodes.
3696 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3697 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3698 } else {
3699 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
3700 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3701 return DAG.getLoad(
3702 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3704 }
3705}
3706
3707SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3708 SelectionDAG &DAG) const {
3709 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3710 "ROPI/RWPI not currently supported for Darwin");
3711 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3712 SDLoc dl(Op);
3713 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3714
3715 if (Subtarget->useMovt())
3716 ++NumMovwMovt;
3717
3718 // FIXME: Once remat is capable of dealing with instructions with register
3719 // operands, expand this into multiple nodes
3720 unsigned Wrapper =
3721 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3722
3723 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3724 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3725
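// For a global accessed through a non-lazy pointer, the wrapped address refers
// to the $non_lazy_ptr entry, so an extra load is needed to reach the global
// itself.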
3726 if (Subtarget->isGVIndirectSymbol(GV))
3727 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3729 return Result;
3730}
3731
3732SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3733 SelectionDAG &DAG) const {
3734 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3735 assert(Subtarget->useMovt() &&
3736 "Windows on ARM expects to use movw/movt");
3737 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3738 "ROPI/RWPI not currently supported for Windows");
3739
3740 const TargetMachine &TM = getTargetMachine();
3741 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3742 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3743 if (GV->hasDLLImportStorageClass())
3744 TargetFlags = ARMII::MO_DLLIMPORT;
3745 else if (!TM.shouldAssumeDSOLocal(GV))
3746 TargetFlags = ARMII::MO_COFFSTUB;
3747 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3749 SDLoc DL(Op);
3750
3751 ++NumMovwMovt;
3752
3753 // FIXME: Once remat is capable of dealing with instructions with register
3754 // operands, expand this into two nodes.
3755 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3756 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3757 TargetFlags));
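// dllimport'ed globals are reached through their import-table entry
// (__imp_<name>), and other non-DSO-local globals through a .refptr stub, so
// in those cases an extra load through that pointer is required.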
3758 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3759 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3761 return Result;
3762}
3763
3764SDValue
3765ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3766 SDLoc dl(Op);
3767 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3768 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3769 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3770 Op.getOperand(1), Val);
3771}
3772
3773SDValue
3774ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3775 SDLoc dl(Op);
3776 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3777 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3778}
3779
3780SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3781 SelectionDAG &DAG) const {
3782 SDLoc dl(Op);
3783 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3784 Op.getOperand(0));
3785}
3786
3787SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
3788 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
3789 unsigned IntNo =
3790 Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
3791 switch (IntNo) {
3792 default:
3793 return SDValue(); // Don't custom lower most intrinsics.
3794 case Intrinsic::arm_gnu_eabi_mcount: {
3795 MachineFunction &MF = DAG.getMachineFunction();
3796 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3797 SDLoc dl(Op);
3798 SDValue Chain = Op.getOperand(0);
3799 // call "\01__gnu_mcount_nc"
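// The *BL_PUSHLR pseudos push the return address and then call the hook,
// i.e. roughly "push {lr}; bl __gnu_mcount_nc".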
3800 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
3801 const uint32_t *Mask =
3803 assert(Mask && "Missing call preserved mask for calling convention");
3804 // Mark LR an implicit live-in.
3805 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
3806 SDValue ReturnAddress =
3807 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
3808 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
3809 SDValue Callee =
3810 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
3812 if (Subtarget->isThumb())
3813 return SDValue(
3814 DAG.getMachineNode(
3815 ARM::tBL_PUSHLR, dl, ResultTys,
3816 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
3817 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
3818 0);
3819 return SDValue(
3820 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
3821 {ReturnAddress, Callee, RegisterMask, Chain}),
3822 0);
3823 }
3824 }
3825}
3826
3827SDValue
3828ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
3829 const ARMSubtarget *Subtarget) const {
3830 unsigned IntNo = Op.getConstantOperandVal(0);
3831 SDLoc dl(Op);
3832 switch (IntNo) {
3833 default: return SDValue(); // Don't custom lower most intrinsics.
3834 case Intrinsic::thread_pointer: {
3835 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3836 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3837 }
3838 case Intrinsic::arm_cls: {
3839 const SDValue &Operand = Op.getOperand(1);
3840 const EVT VTy = Op.getValueType();
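// Expand cls(x) as clz(((x ^ (x >> 31)) << 1) | 1): the xor clears the sign
// bit and its leading copies, and the "<< 1 | 1" drops the sign bit from the
// count while capping the result at 31 for x == 0 and x == -1.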
3841 SDValue SRA =
3842 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
3843 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
3844 SDValue SHL =
3845 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
3846 SDValue OR =
3847 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
3848 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
3849 return Result;
3850 }
3851 case Intrinsic::arm_cls64: {
3852 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
3853 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
3854 const SDValue &Operand = Op.getOperand(1);
3855 const EVT VTy = Op.getValueType();
3856 SDValue Lo, Hi;
3857 std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VTy, VTy);
3858 SDValue Constant0 = DAG.getConstant(0, dl, VTy);
3859 SDValue Constant1 = DAG.getConstant(1, dl, VTy);
3860 SDValue Constant31 = DAG.getConstant(31, dl, VTy);
3861 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
3862 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
3863 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
3864 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
3865 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
3866 SDValue CheckLo =
3867 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
3868 SDValue HiIsZero =
3869 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
3870 SDValue AdjustedLo =
3871 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
3872 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
3873 SDValue Result =
3874 DAG.getSelect(dl, VTy, CheckLo,
3875 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
3876 return Result;
3877 }
3878 case Intrinsic::eh_sjlj_lsda: {
3879 MachineFunction &MF = DAG.getMachineFunction();
3880 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3881 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3882 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3883 SDValue CPAddr;
3884 bool IsPositionIndependent = isPositionIndependent();
3885 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
3886 ARMConstantPoolValue *CPV =
3887 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
3888 ARMCP::CPLSDA, PCAdj);
3889 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3890 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3891 SDValue Result = DAG.getLoad(
3892 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3894
3895 if (IsPositionIndependent) {
3896 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3897 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
3898 }
3899 return Result;
3900 }
3901 case Intrinsic::arm_neon_vabs:
3902 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
3903 Op.getOperand(1));
3904 case Intrinsic::arm_neon_vabds:
3905 if (Op.getValueType().isInteger())
3906 return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
3907 Op.getOperand(1), Op.getOperand(2));
3908 return SDValue();
3909 case Intrinsic::arm_neon_vabdu:
3910 return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
3911 Op.getOperand(1), Op.getOperand(2));
3912 case Intrinsic::arm_neon_vmulls:
3913 case Intrinsic::arm_neon_vmullu: {
3914 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
3915 ? ARMISD::VMULLs : ARMISD::VMULLu;
3916 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3917 Op.getOperand(1), Op.getOperand(2));
3918 }
3919 case Intrinsic::arm_neon_vminnm:
3920 case Intrinsic::arm_neon_vmaxnm: {
3921 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
3922 ? ISD::FMINNUM : ISD::FMAXNUM;
3923 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3924 Op.getOperand(1), Op.getOperand(2));
3925 }
3926 case Intrinsic::arm_neon_vminu:
3927 case Intrinsic::arm_neon_vmaxu: {
3928 if (Op.getValueType().isFloatingPoint())
3929 return SDValue();
3930 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
3931 ? ISD::UMIN : ISD::UMAX;
3932 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3933 Op.getOperand(1), Op.getOperand(2));
3934 }
3935 case Intrinsic::arm_neon_vmins:
3936 case Intrinsic::arm_neon_vmaxs: {
3937 // v{min,max}s is overloaded between signed integers and floats.
3938 if (!Op.getValueType().isFloatingPoint()) {
3939 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3940 ? ISD::SMIN : ISD::SMAX;
3941 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3942 Op.getOperand(1), Op.getOperand(2));
3943 }
3944 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3945 ? ISD::FMINIMUM : ISD::FMAXIMUM;
3946 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3947 Op.getOperand(1), Op.getOperand(2));
3948 }
3949 case Intrinsic::arm_neon_vtbl1:
3950 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
3951 Op.getOperand(1), Op.getOperand(2));
3952 case Intrinsic::arm_neon_vtbl2:
3953 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
3954 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3955 case Intrinsic::arm_mve_pred_i2v:
3956 case Intrinsic::arm_mve_pred_v2i:
3957 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
3958 Op.getOperand(1));
3959 case Intrinsic::arm_mve_vreinterpretq:
3960 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
3961 Op.getOperand(1));
3962 case Intrinsic::arm_mve_lsll:
3963 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
3964 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3965 case Intrinsic::arm_mve_asrl:
3966 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
3967 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3968 }
3969}
3970
3972 const ARMSubtarget *Subtarget) {
3973 SDLoc dl(Op);
3974 auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
3975 if (SSID == SyncScope::SingleThread)
3976 return Op;
3977
3978 if (!Subtarget->hasDataBarrier()) {
3979 // Some ARMv6 cpus can support data barriers with an mcr instruction.
3980 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
3981 // here.
3982 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
3983 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
3984 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
3985 DAG.getConstant(0, dl, MVT::i32));
3986 }
3987
3988 AtomicOrdering Ord =
3989 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
3991 if (Subtarget->isMClass()) {
3992 // Only a full system barrier exists in the M-class architectures.
3994 } else if (Subtarget->preferISHSTBarriers() &&
3995 Ord == AtomicOrdering::Release) {
3996 // Swift happens to implement ISHST barriers in a way that's compatible with
3997 // Release semantics but weaker than ISH so we'd be fools not to use
3998 // it. Beware: other processors probably don't!
4000 }
4001
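// The fence then becomes a DMB in the chosen domain, e.g. "dmb ish" in the
// common case, "dmb sy" on M-class, or "dmb ishst" for the release-only case
// above.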
4002 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4003 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4004 DAG.getConstant(Domain, dl, MVT::i32));
4005}
4006
4008 const ARMSubtarget *Subtarget) {
4009 // ARM pre-v5TE and Thumb1 do not have preload instructions.
4010 if (!(Subtarget->isThumb2() ||
4011 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4012 // Just preserve the chain.
4013 return Op.getOperand(0);
4014
4015 SDLoc dl(Op);
4016 unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
4017 if (!isRead &&
4018 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4019 // ARMv7 with MP extension has PLDW.
4020 return Op.getOperand(0);
4021
4022 unsigned isData = Op.getConstantOperandVal(4);
4023 if (Subtarget->isThumb()) {
4024 // Invert the bits.
4025 isRead = ~isRead & 1;
4026 isData = ~isData & 1;
4027 }
4028
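// ARMISD::PRELOAD later selects to PLD (data read), PLDW (data write, needs
// the v7 MP extension) or PLI (instruction prefetch) based on the isRead and
// isData operands.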
4029 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4030 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4031 DAG.getConstant(isData, dl, MVT::i32));
4032}
4033
4036 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4037
4038 // vastart just stores the address of the VarArgsFrameIndex slot into the
4039 // memory location argument.
4040 SDLoc dl(Op);
4042 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
4043 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4044 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4045 MachinePointerInfo(SV));
4046}
4047
4048SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4049 CCValAssign &NextVA,
4050 SDValue &Root,
4051 SelectionDAG &DAG,
4052 const SDLoc &dl) const {
4053 MachineFunction &MF = DAG.getMachineFunction();
4054 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4055
4056 const TargetRegisterClass *RC;
4057 if (AFI->isThumb1OnlyFunction())
4058 RC = &ARM::tGPRRegClass;
4059 else
4060 RC = &ARM::GPRRegClass;
4061
4062 // Transform the arguments stored in physical registers into virtual ones.
4063 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4064 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4065
4066 SDValue ArgValue2;
4067 if (NextVA.isMemLoc()) {
4068 MachineFrameInfo &MFI = MF.getFrameInfo();
4069 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4070
4071 // Create load node to retrieve arguments from the stack.
4072 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4073 ArgValue2 = DAG.getLoad(
4074 MVT::i32, dl, Root, FIN,
4076 } else {
4077 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4078 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4079 }
4080 if (!Subtarget->isLittle())
4081 std::swap (ArgValue, ArgValue2);
4082 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4083}
4084
4085// The remaining GPRs hold either the beginning of variable-argument
4086// data, or the beginning of an aggregate passed by value (usually
4087// byval). Either way, we allocate stack slots adjacent to the data
4088// provided by our caller, and store the unallocated registers there.
4089// If this is a variadic function, the va_list pointer will begin with
4090// these values; otherwise, this reassembles a (byval) structure that
4091// was split between registers and memory.
4092 // Return: the frame index that the registers were stored into.
4093int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4094 const SDLoc &dl, SDValue &Chain,
4095 const Value *OrigArg,
4096 unsigned InRegsParamRecordIdx,
4097 int ArgOffset, unsigned ArgSize) const {
4098 // Currently, two use-cases are possible:
4099 // Case #1. Non-varargs function, and we meet the first byval parameter.
4100 // Set up the first unallocated register as the first byval register;
4101 // eat all remaining registers
4102 // (these two actions are performed by the HandleByVal method).
4103 // Then, here, we initialize the stack frame with
4104 // "store-reg" instructions.
4105 // Case #2. Varargs function that doesn't contain byval parameters.
4106 // The same: eat all remaining unallocated registers and
4107 // initialize the stack frame.
4108
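// For example, a 16-byte byval argument whose first half was assigned to
// r2-r3 gets a 16-byte fixed stack object adjacent to its stack portion;
// r2 and r3 are stored into the first two words so the callee sees one
// contiguous block.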
4109 MachineFunction &MF = DAG.getMachineFunction();
4110 MachineFrameInfo &MFI = MF.getFrameInfo();
4111 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4112 unsigned RBegin, REnd;
4113 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4114 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4115 } else {
4116 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4117 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4118 REnd = ARM::R4;
4119 }
4120
4121 if (REnd != RBegin)
4122 ArgOffset = -4 * (ARM::R4 - RBegin);
4123
4124 auto PtrVT = getPointerTy(DAG.getDataLayout());
4125 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4126 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4127
4129 const TargetRegisterClass *RC =
4130 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4131
4132 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4133 Register VReg = MF.addLiveIn(Reg, RC);
4134 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4135 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4136 MachinePointerInfo(OrigArg, 4 * i));
4137 MemOps.push_back(Store);
4138 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4139 }
4140
4141 if (!MemOps.empty())
4142 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4143 return FrameIndex;
4144}
4145
4146 // Set up the stack frame that the va_list pointer will start from.
4147void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4148 const SDLoc &dl, SDValue &Chain,
4149 unsigned ArgOffset,
4150 unsigned TotalArgRegsSaveSize,
4151 bool ForceMutable) const {
4152 MachineFunction &MF = DAG.getMachineFunction();
4153 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4154
4155 // Try to store any remaining integer argument regs
4156 // to their spots on the stack so that they may be loaded by dereferencing
4157 // the result of va_arg.
4158 // If there are no regs to be stored, just point the va_list past the
4159 // last argument passed via the stack.
4160 int FrameIndex = StoreByValRegs(
4161 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4162 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4163 AFI->setVarArgsFrameIndex(FrameIndex);
4164}
4165
4166bool ARMTargetLowering::splitValueIntoRegisterParts(
4167 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4168 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4169 EVT ValueVT = Val.getValueType();
4170 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4171 unsigned ValueBits = ValueVT.getSizeInBits();
4172 unsigned PartBits = PartVT.getSizeInBits();
4173 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4174 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4175 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4176 Parts[0] = Val;
4177 return true;
4178 }
4179 return false;
4180}
4181
4182SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4183 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4184 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4185 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4186 unsigned ValueBits = ValueVT.getSizeInBits();
4187 unsigned PartBits = PartVT.getSizeInBits();
4188 SDValue Val = Parts[0];
4189
4190 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4191 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4192 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4193 return Val;
4194 }
4195 return SDValue();
4196}
4197
4198SDValue ARMTargetLowering::LowerFormalArguments(
4199 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4200 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4201 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4202 MachineFunction &MF = DAG.getMachineFunction();
4203 MachineFrameInfo &MFI = MF.getFrameInfo();
4204
4205 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4206
4207 // Assign locations to all of the incoming arguments.
4209 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4210 *DAG.getContext());
4211 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4212
4214 unsigned CurArgIdx = 0;
4215
4216 // Initially ArgRegsSaveSize is zero.
4217 // Then we increase this value each time we meet byval parameter.
4218 // We also increase this value in case of varargs function.
4219 AFI->setArgRegsSaveSize(0);
4220
4221 // Calculate the amount of stack space that we need to allocate to store
4222 // byval and variadic arguments that are passed in registers.
4223 // We need to know this before we allocate the first byval or variadic
4224 // argument, as they will be allocated a stack slot below the CFA (Canonical
4225 // Frame Address, the stack pointer at entry to the function).
4226 unsigned ArgRegBegin = ARM::R4;
4227 for (const CCValAssign &VA : ArgLocs) {
4228 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4229 break;
4230
4231 unsigned Index = VA.getValNo();
4232 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4233 if (!Flags.isByVal())
4234 continue;
4235
4236 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4237 unsigned RBegin, REnd;
4238 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4239 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4240
4241 CCInfo.nextInRegsParam();
4242 }
4243 CCInfo.rewindByValRegsInfo();
4244
4245 int lastInsIndex = -1;
4246 if (isVarArg && MFI.hasVAStart()) {
4247 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4248 if (RegIdx != std::size(GPRArgRegs))
4249 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4250 }
4251
4252 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4253 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4254 auto PtrVT = getPointerTy(DAG.getDataLayout());
4255
4256 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4257 CCValAssign &VA = ArgLocs[i];
4258 if (Ins[VA.getValNo()].isOrigArg()) {
4259 std::advance(CurOrigArg,
4260 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4261 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4262 }
4263 // Arguments stored in registers.
4264 if (VA.isRegLoc()) {
4265 EVT RegVT = VA.getLocVT();
4266 SDValue ArgValue;
4267
4268 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4269 // f64 and vector types are split up into multiple registers or
4270 // combinations of registers and stack slots.
4271 SDValue ArgValue1 =
4272 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4273 VA = ArgLocs[++i]; // skip ahead to next loc
4274 SDValue ArgValue2;
4275 if (VA.isMemLoc()) {
4276 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4277 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4278 ArgValue2 = DAG.getLoad(
4279 MVT::f64, dl, Chain, FIN,
4281 } else {
4282 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4283 }
4284 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4285 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4286 ArgValue1, DAG.getIntPtrConstant(0, dl));
4287 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4288 ArgValue2, DAG.getIntPtrConstant(1, dl));
4289 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4290 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4291 } else {
4292 const TargetRegisterClass *RC;
4293
4294 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4295 RC = &ARM::HPRRegClass;
4296 else if (RegVT == MVT::f32)
4297 RC = &ARM::SPRRegClass;
4298 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4299 RegVT == MVT::v4bf16)
4300 RC = &ARM::DPRRegClass;
4301 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4302 RegVT == MVT::v8bf16)
4303 RC = &ARM::QPRRegClass;
4304 else if (RegVT == MVT::i32)
4305 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4306 : &ARM::GPRRegClass;
4307 else
4308 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4309
4310 // Transform the arguments in physical registers into virtual ones.
4311 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4312 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4313
4314 // If this value is passed in r0 and has the returned attribute (e.g.
4315 // C++ 'structors), record this fact for later use.
4316 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4317 AFI->setPreservesR0();
4318 }
4319 }
4320
4321 // If this is an 8 or 16-bit value, it is really passed promoted
4322 // to 32 bits. Insert an assert[sz]ext to capture this, then
4323 // truncate to the right size.
4324 switch (VA.getLocInfo()) {
4325 default: llvm_unreachable("Unknown loc info!");
4326 case CCValAssign::Full: break;
4327 case CCValAssign::BCvt:
4328 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4329 break;
4330 }
4331
4332 // f16 arguments have their size extended to 4 bytes and passed as if they
4333 // had been copied to the LSBs of a 32-bit register.
4334 // For that, it is passed extended to i32 (soft ABI) or to f32 (hard ABI).
4335 if (VA.needsCustom() &&
4336 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4337 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4338
4339 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4340 // less than 32 bits must be sign- or zero-extended in the callee for
4341 // security reasons. Although the ABI mandates an extension done by the
4342 // caller, the latter cannot be trusted to follow the rules of the ABI.
4343 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4344 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4345 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4346 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4347
4348 InVals.push_back(ArgValue);
4349 } else { // VA.isRegLoc()
4350 // Only arguments passed on the stack should make it here.
4351 assert(VA.isMemLoc());
4352 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4353
4354 int index = VA.getValNo();
4355
4356 // Some Ins[] entries become multiple ArgLoc[] entries.
4357 // Process them only once.
4358 if (index != lastInsIndex)
4359 {
4360 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4361 // FIXME: For now, all byval parameter objects are marked mutable.
4362 // This can be changed with more analysis.
4363 // In case of tail call optimization, mark all arguments mutable,
4364 // since they could be overwritten by lowering of arguments in case of
4365 // a tail call.
4366 if (Flags.isByVal()) {
4367 assert(Ins[index].isOrigArg() &&
4368 "Byval arguments cannot be implicit");
4369 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4370
4371 int FrameIndex = StoreByValRegs(
4372 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4373 VA.getLocMemOffset(), Flags.getByValSize());
4374 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4375 CCInfo.nextInRegsParam();
4376 } else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 ||
4377 VA.getValVT() == MVT::bf16)) {
4378 // f16 and bf16 values are passed in the least-significant half of
4379 // a 4-byte stack slot. This is done as if the extension was done
4380 // in a 32-bit register, so the actual bytes used for the value
4381 // differ between little and big endian.
4382 assert(VA.getLocVT().getSizeInBits() == 32);
4383 unsigned FIOffset = VA.getLocMemOffset();
4384 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
4385 FIOffset, true);
4386
4387 SDValue Addr = DAG.getFrameIndex(FI, PtrVT);
4388 if (DAG.getDataLayout().isBigEndian())
4389 Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2));
4390
4391 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr,
4393 DAG.getMachineFunction(), FI)));
4394
4395 } else {
4396 unsigned FIOffset = VA.getLocMemOffset();
4397 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4398 FIOffset, true);
4399
4400 // Create load nodes to retrieve arguments from the stack.
4401 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4402 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4404 DAG.getMachineFunction(), FI)));
4405 }
4406 lastInsIndex = index;
4407 }
4408 }
4409 }
4410
4411 // varargs
4412 if (isVarArg && MFI.hasVAStart()) {
4413 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4414 TotalArgRegsSaveSize);
4415 if (AFI->isCmseNSEntryFunction()) {
4416 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4418 "secure entry function must not be variadic", dl.getDebugLoc()));
4419 }
4420 }
4421
4422 unsigned StackArgSize = CCInfo.getStackSize();
4423 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4424 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4425 // The only way to guarantee a tail call is if the callee restores its
4426 // argument area, but it must also keep the stack aligned when doing so.
4427 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
4428 assert(StackAlign && "data layout string is missing stack alignment");
4429 StackArgSize = alignTo(StackArgSize, *StackAlign);
4430
4431 AFI->setArgumentStackToRestore(StackArgSize);
4432 }
4433 AFI->setArgumentStackSize(StackArgSize);
4434
4435 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4436 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4438 "secure entry function requires arguments on stack", dl.getDebugLoc()));
4439 }
4440
4441 return Chain;
4442}
4443
4444/// isFloatingPointZero - Return true if this is +0.0.
4447 return CFP->getValueAPF().isPosZero();
4448 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4449 // Maybe this has already been legalized into the constant pool?
4450 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4451 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4453 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4454 return CFP->getValueAPF().isPosZero();
4455 }
4456 } else if (Op->getOpcode() == ISD::BITCAST &&
4457 Op->getValueType(0) == MVT::f64) {
4458 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4459 // created by LowerConstantFP().
4460 SDValue BitcastOp = Op->getOperand(0);
4461 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4462 isNullConstant(BitcastOp->getOperand(0)))
4463 return true;
4464 }
4465 return false;
4466}
4467
4468 /// Returns an appropriate ARM CMP (cmp) and the corresponding condition code for
4469/// the given operands.
4470SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4471 SDValue &ARMcc, SelectionDAG &DAG,
4472 const SDLoc &dl) const {
4473 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4474 unsigned C = RHSC->getZExtValue();
4475 if (!isLegalICmpImmediate((int32_t)C)) {
4476 // Constant does not fit, try adjusting it by one.
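// For example, on Thumb1 "(x < 256)" becomes "(x <= 255)" so that the
// constant fits the 8-bit cmp immediate.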
4477 switch (CC) {
4478 default: break;
4479 case ISD::SETLT:
4480 case ISD::SETGE:
4481 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4482 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4483 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4484 }
4485 break;
4486 case ISD::SETULT:
4487 case ISD::SETUGE:
4488 if (C != 0 && isLegalICmpImmediate(C-1)) {
4489 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4490 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4491 }
4492 break;
4493 case ISD::SETLE:
4494 case ISD::SETGT:
4495 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4496 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4497 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4498 }
4499 break;
4500 case ISD::SETULE:
4501 case ISD::SETUGT:
4502 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4503 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4504 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4505 }
4506 break;
4507 }
4508 }
4509 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4511 // In ARM and Thumb-2, the compare instructions can shift their second
4512 // operand.
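// For example, "cmp r0, r1, lsl #2" folds the shift into the compare.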
4514 std::swap(LHS, RHS);
4515 }
4516
4517 // Thumb1 has very limited immediate modes, so turning an "and" into a
4518 // shift can save multiple instructions.
4519 //
4520 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4521 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4522 // own. If it's the operand to an unsigned comparison with an immediate,
4523 // we can eliminate one of the shifts: we transform
4524 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4525 //
4526 // We avoid transforming cases which aren't profitable due to encoding
4527 // details:
4528 //
4529 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4530 // would not; in that case, we're essentially trading one immediate load for
4531 // another.
4532 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4533 // 3. C2 is zero; we have other code for this special case.
4534 //
4535 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4536 // instruction, since the AND is always one instruction anyway, but we could
4537 // use narrow instructions in some cases.
4538 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4539 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4540 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4541 !isSignedIntSetCC(CC)) {
4542 unsigned Mask = LHS.getConstantOperandVal(1);
4543 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4544 uint64_t RHSV = RHSC->getZExtValue();
4545 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4546 unsigned ShiftBits = llvm::countl_zero(Mask);
4547 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4548 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4549 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4550 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4551 }
4552 }
4553 }
4554
4555 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4556 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4557 // way a cmp would.
4558 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4559 // some tweaks to the heuristics for the previous and->shift transform.
4560 // FIXME: Optimize cases where the LHS isn't a shift.
4561 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4562 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4563 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4564 LHS.getConstantOperandVal(1) < 31) {
4565 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4566 SDValue Shift =
4567 DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, FlagsVT),
4568 LHS.getOperand(0), DAG.getConstant(ShiftAmt, dl, MVT::i32));
4569 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4570 return Shift.getValue(1);
4571 }
4572
4574
4575 // If the RHS is a constant zero then the V (overflow) flag will never be
4576 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4577 // simpler for other passes (like the peephole optimiser) to deal with.
4578 if (isNullConstant(RHS)) {
4579 switch (CondCode) {
4580 default: break;
4581 case ARMCC::GE:
4583 break;
4584 case ARMCC::LT:
4586 break;
4587 }
4588 }
4589
4590 unsigned CompareType;
4591 switch (CondCode) {
4592 default:
4593 CompareType = ARMISD::CMP;
4594 break;
4595 case ARMCC::EQ:
4596 case ARMCC::NE:
4597 // Uses only Z Flag
4598 CompareType = ARMISD::CMPZ;
4599 break;
4600 }
4601 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4602 return DAG.getNode(CompareType, dl, FlagsVT, LHS, RHS);
4603}
4604
4605 /// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4606SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4607 SelectionDAG &DAG, const SDLoc &dl,
4608 bool Signaling) const {
4609 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4610 SDValue Flags;
4612 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, dl, FlagsVT,
4613 LHS, RHS);
4614 else
4615 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, dl,
4616 FlagsVT, LHS);
4617 return DAG.getNode(ARMISD::FMSTAT, dl, FlagsVT, Flags);
4618}
4619
4620// This function returns three things: the arithmetic computation itself
4621// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4622// comparison and the condition code define the case in which the arithmetic
4623// computation *does not* overflow.
4624std::pair<SDValue, SDValue>
4625ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4626 SDValue &ARMcc) const {
4627 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4628
4629 SDValue Value, OverflowCmp;
4630 SDValue LHS = Op.getOperand(0);
4631 SDValue RHS = Op.getOperand(1);
4632 SDLoc dl(Op);
4633
4634 // FIXME: We are currently always generating CMPs because we don't support
4635 // generating CMN through the backend. This is not as good as the natural
4636 // CMP case because it causes a register dependency and cannot be folded
4637 // later.
4638
4639 switch (Op.getOpcode()) {
4640 default:
4641 llvm_unreachable("Unknown overflow instruction!");
4642 case ISD::SADDO:
4643 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4644 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4645 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4646 break;
4647 case ISD::UADDO:
4648 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4649 // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4650 // We do not use it in the USUBO case as Value may not be used.
4651 Value = DAG.getNode(ARMISD::ADDC, dl,
4652 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4653 .getValue(0);
4654 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4655 break;
4656 case ISD::SSUBO:
4657 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4658 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4659 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4660 break;
4661 case ISD::USUBO:
4662 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4663 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4664 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4665 break;
4666 case ISD::UMULO:
4667 // We generate a UMUL_LOHI and then check if the high word is 0.
4668 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4669 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4670 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4671 LHS, RHS);
4672 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4673 DAG.getConstant(0, dl, MVT::i32));
4674 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4675 break;
4676 case ISD::SMULO:
4677 // We generate a SMUL_LOHI and then check if all the bits of the high word
4678 // are the same as the sign bit of the low word.
4679 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4680 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4681 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4682 LHS, RHS);
4683 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4684 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4685 Value.getValue(0),
4686 DAG.getConstant(31, dl, MVT::i32)));
4687 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4688 break;
4689 } // switch (...)
4690
4691 return std::make_pair(Value, OverflowCmp);
4692}
4693
4694SDValue
4695ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
4696 // Let legalize expand this if it isn't a legal type yet.
4697 if (!isTypeLegal(Op.getValueType()))
4698 return SDValue();
4699
4700 SDValue Value, OverflowCmp;
4701 SDValue ARMcc;
4702 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4703 SDLoc dl(Op);
4704 // We use 0 and 1 as false and true values.
4705 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4706 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4707 EVT VT = Op.getValueType();
4708
4709 SDValue Overflow =
4710 DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, ARMcc, OverflowCmp);
4711
4712 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4713 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4714}
4715
4717 SelectionDAG &DAG) {
4718 SDLoc DL(BoolCarry);
4719 EVT CarryVT = BoolCarry.getValueType();
4720
4721 // This converts the boolean value carry into the carry flag by doing
4722 // ARMISD::SUBC Carry, 1
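// (subtracting 1 sets the ARM carry flag exactly when the boolean was 1,
// since C is set when no borrow occurs).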
4723 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4724 DAG.getVTList(CarryVT, MVT::i32),
4725 BoolCarry, DAG.getConstant(1, DL, CarryVT));
4726 return Carry.getValue(1);
4727}
4728
4730 SelectionDAG &DAG) {
4731 SDLoc DL(Flags);
4732
4733 // Now convert the carry flag into a boolean carry. We do this
4734 // using ARMISD::ADDE 0, 0, Carry
4735 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4736 DAG.getConstant(0, DL, MVT::i32),
4737 DAG.getConstant(0, DL, MVT::i32), Flags);
4738}
4739
4740SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
4741 SelectionDAG &DAG) const {
4742 // Let legalize expand this if it isn't a legal type yet.
4743 if (!isTypeLegal(Op.getValueType()))
4744 return SDValue();
4745
4746 SDValue LHS = Op.getOperand(0);
4747 SDValue RHS = Op.getOperand(1);
4748 SDLoc dl(Op);
4749
4750 EVT VT = Op.getValueType();
4751 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4752 SDValue Value;
4753 SDValue Overflow;
4754 switch (Op.getOpcode()) {
4755 default:
4756 llvm_unreachable("Unknown overflow instruction!");
4757 case ISD::UADDO:
4758 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4759 // Convert the carry flag into a boolean value.
4760 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4761 break;
4762 case ISD::USUBO: {
4763 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4764 // Convert the carry flag into a boolean value.
4765 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4766 // ARMISD::SUBC returns 0 when we have to borrow, so compute 1 - C to turn
4767 // it into an overflow value.
4768 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4769 DAG.getConstant(1, dl, MVT::i32), Overflow);
4770 break;
4771 }
4772 }
4773
4774 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4775}
4776
4778 const ARMSubtarget *Subtarget) {
4779 EVT VT = Op.getValueType();
4780 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
4781 return SDValue();
4782 if (!VT.isSimple())
4783 return SDValue();
4784
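// Scalar i8/i16 saturating add/sub maps onto the DSP QADD8/QSUB8/QADD16/
// QSUB16 family: the operands are extended to i32, the saturating node is
// applied, and the result is truncated back to the original type.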
4785 unsigned NewOpcode;
4786 switch (VT.getSimpleVT().SimpleTy) {
4787 default:
4788 return SDValue();
4789 case MVT::i8:
4790 switch (Op->getOpcode()) {
4791 case ISD::UADDSAT:
4792 NewOpcode = ARMISD::UQADD8b;
4793 break;
4794 case ISD::SADDSAT:
4795 NewOpcode = ARMISD::QADD8b;
4796 break;
4797 case ISD::USUBSAT:
4798 NewOpcode = ARMISD::UQSUB8b;
4799 break;
4800 case ISD::SSUBSAT:
4801 NewOpcode = ARMISD::QSUB8b;
4802 break;
4803 }
4804 break;
4805 case MVT::i16:
4806 switch (Op->getOpcode()) {
4807 case ISD::UADDSAT:
4808 NewOpcode = ARMISD::UQADD16b;
4809 break;
4810 case ISD::SADDSAT:
4811 NewOpcode = ARMISD::QADD16b;
4812 break;
4813 case ISD::USUBSAT:
4814 NewOpcode = ARMISD::UQSUB16b;
4815 break;
4816 case ISD::SSUBSAT:
4817 NewOpcode = ARMISD::QSUB16b;
4818 break;
4819 }
4820 break;
4821 }
4822
4823 SDLoc dl(Op);
4824 SDValue Add =
4825 DAG.getNode(NewOpcode, dl, MVT::i32,
4826 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
4827 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
4828 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
4829}
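// Note on the lowering above: the QADD8b/QSUB8b and QADD16b/QSUB16b nodes
// correspond to the DSP QADD8/QSUB8 and QADD16/QSUB16 instructions (and their
// unsigned UQ* forms), which saturate per lane inside a 32-bit register, so an
// i8/i16 saturating add or subtract becomes: extend to i32, saturating op,
// truncate back.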
4830
4831SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
4832 SDValue Cond = Op.getOperand(0);
4833 SDValue SelectTrue = Op.getOperand(1);
4834 SDValue SelectFalse = Op.getOperand(2);
4835 SDLoc dl(Op);
4836 unsigned Opc = Cond.getOpcode();
4837
4838 if (Cond.getResNo() == 1 &&
4839 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
4840 Opc == ISD::USUBO)) {
4841 if (!isTypeLegal(Cond->getValueType(0)))
4842 return SDValue();
4843
4844 SDValue Value, OverflowCmp;
4845 SDValue ARMcc;
4846 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
4847 EVT VT = Op.getValueType();
4848
4849 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, OverflowCmp, DAG);
4850 }
4851
4852 // Convert:
4853 //
4854 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
4855 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
4856 //
4857 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
4858 const ConstantSDNode *CMOVTrue =
4859 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
4860 const ConstantSDNode *CMOVFalse =
4861 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
4862
4863 if (CMOVTrue && CMOVFalse) {
4864 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
4865 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
4866
4867 SDValue True;
4868 SDValue False;
4869 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
4870 True = SelectTrue;
4871 False = SelectFalse;
4872 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
4873 True = SelectFalse;
4874 False = SelectTrue;
4875 }
4876
4877 if (True.getNode() && False.getNode())
4878 return getCMOV(dl, Op.getValueType(), True, False, Cond.getOperand(2),
4879 Cond.getOperand(3), DAG);
4880 }
4881 }
4882
4883 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
4884 // undefined bits before doing a full-word comparison with zero.
4885 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
4886 DAG.getConstant(1, dl, Cond.getValueType()));
4887
4888 return DAG.getSelectCC(dl, Cond,
4889 DAG.getConstant(0, dl, Cond.getValueType()),
4890 SelectTrue, SelectFalse, ISD::SETNE);
4891}
4892
4893static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
4894 bool &swpCmpOps, bool &swpVselOps) {
4895 // Start by selecting the GE condition code for opcodes that return true for
4896 // 'equality'
4897 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
4898 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
4899 CondCode = ARMCC::GE;
4900
4901 // and GT for opcodes that return false for 'equality'.
4902 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
4903 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
4904 CondCode = ARMCC::GT;
4905
4906 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
4907 // to swap the compare operands.
4908 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
4909 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
4910 swpCmpOps = true;
4911
4912 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
4913 // If we have an unordered opcode, we need to swap the operands to the VSEL
4914 // instruction (effectively negating the condition).
4915 //
4916 // This also has the effect of swapping which one of 'less' or 'greater'
4917 // returns true, so we also swap the compare operands. It also switches
4918 // whether we return true for 'equality', so we compensate by picking the
4919 // opposite condition code to our original choice.
4920 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
4921 CC == ISD::SETUGT) {
4922 swpCmpOps = !swpCmpOps;
4923 swpVselOps = !swpVselOps;
4924 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
4925 }
4926
4927 // 'ordered' is 'anything but unordered', so use the VS condition code and
4928 // swap the VSEL operands.
4929 if (CC == ISD::SETO) {
4930 CondCode = ARMCC::VS;
4931 swpVselOps = true;
4932 }
4933
4934 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
4935 // code and swap the VSEL operands. Also do this if we don't care about the
4936 // unordered case.
4937 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
4938 CondCode = ARMCC::EQ;
4939 swpVselOps = true;
4940 }
4941}
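// Worked example for the rules above: for SETULT we start with GT (it returns
// false for equality), 'less' requests swapped compare operands, and the
// unordered adjustment then flips that swap back off, swaps the VSEL operands
// instead, and switches GT to GE; so the select is emitted as a GE VSEL with
// its true/false operands exchanged.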
4942
4943SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
4944 SDValue TrueVal, SDValue ARMcc,
4945 SDValue Flags, SelectionDAG &DAG) const {
4946 if (!Subtarget->hasFP64() && VT == MVT::f64) {
4947 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4948 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
4949 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4950 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
4951
4952 SDValue TrueLow = TrueVal.getValue(0);
4953 SDValue TrueHigh = TrueVal.getValue(1);
4954 SDValue FalseLow = FalseVal.getValue(0);
4955 SDValue FalseHigh = FalseVal.getValue(1);
4956
4957 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
4958 ARMcc, Flags);
4959 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
4960 ARMcc, Flags);
4961
4962 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
4963 }
4964 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, Flags);
4965}
4966
4967static bool isGTorGE(ISD::CondCode CC) {
4968 return CC == ISD::SETGT || CC == ISD::SETGE;
4969}
4970
4971static bool isLTorLE(ISD::CondCode CC) {
4972 return CC == ISD::SETLT || CC == ISD::SETLE;
4973}
4974
4975// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
4976// All of these conditions (and their <= and >= counterparts) will do:
4977// x < k ? k : x
4978// x > k ? x : k
4979// k < x ? x : k
4980// k > x ? k : x
4981static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
4982 const SDValue TrueVal, const SDValue FalseVal,
4983 const ISD::CondCode CC, const SDValue K) {
4984 return (isGTorGE(CC) &&
4985 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
4986 (isLTorLE(CC) &&
4987 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
4988}
4989
4990// Check if two chained conditionals could be converted into SSAT or USAT.
4991//
4992// SSAT can replace a set of two conditional selectors that bound a number to an
4993// interval of the form [~k, k] when k + 1 is a power of 2. Here are some examples:
4994//
4995// x < -k ? -k : (x > k ? k : x)
4996// x < -k ? -k : (x < k ? x : k)
4997// x > -k ? (x > k ? k : x) : -k
4998// x < k ? (x < -k ? -k : x) : k
4999// etc.
5000//
5001// LLVM canonicalizes these to either a min(max()) or a max(min())
5002// pattern. This function tries to match one of these and will return a SSAT
5003// node if successful.
5004//
5005 // USAT works similarly to SSAT, but bounds the value to the interval [0, k], where k + 1
5006// is a power of 2.
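// For instance, with k = 127 (k + 1 = 128 = 2^7) the nested selects
//   x < -128 ? -128 : (x > 127 ? 127 : x)
// clamp x to [-128, 127] and are matched below as a single ARMISD::SSAT; the
// unsigned variant that clamps to [0, 127] becomes ARMISD::USAT instead.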
5007static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
5008 EVT VT = Op.getValueType();
5009 SDValue V1 = Op.getOperand(0);
5010 SDValue K1 = Op.getOperand(1);
5011 SDValue TrueVal1 = Op.getOperand(2);
5012 SDValue FalseVal1 = Op.getOperand(3);
5013 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5014
5015 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
5016 if (Op2.getOpcode() != ISD::SELECT_CC)
5017 return SDValue();
5018
5019 SDValue V2 = Op2.getOperand(0);
5020 SDValue K2 = Op2.getOperand(1);
5021 SDValue TrueVal2 = Op2.getOperand(2);
5022 SDValue FalseVal2 = Op2.getOperand(3);
5023 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
5024
5025 SDValue V1Tmp = V1;
5026 SDValue V2Tmp = V2;
5027
5028 // Check that the registers and the constants match a max(min()) or min(max())
5029 // pattern
5030 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5031 K2 != FalseVal2 ||
5032 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5033 return SDValue();
5034
5035 // Check that the constant in the lower-bound check is
5036 // the opposite of the constant in the upper-bound check
5037 // in 1's complement.
5038 if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
5039 return SDValue();
5040
5041 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5042 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5043 int64_t PosVal = std::max(Val1, Val2);
5044 int64_t NegVal = std::min(Val1, Val2);
5045
5046 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5047 !isPowerOf2_64(PosVal + 1))
5048 return SDValue();
5049
5050 // Handle the difference between USAT (unsigned) and SSAT (signed)
5051 // saturation
5052 // At this point, PosVal is guaranteed to be positive
5053 uint64_t K = PosVal;
5054 SDLoc dl(Op);
5055 if (Val1 == ~Val2)
5056 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5057 DAG.getConstant(llvm::countr_one(K), dl, VT));
5058 if (NegVal == 0)
5059 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5060 DAG.getConstant(llvm::countr_one(K), dl, VT));
5061
5062 return SDValue();
5063}
5064
5065// Check if a condition of the type x < k ? k : x can be converted into a
5066// bit operation instead of conditional moves.
5067// Currently this is allowed given:
5068// - The conditions and values match up
5069// - k is 0 or -1 (all ones)
5070// This function will not check the last condition; that's up to the caller.
5071// It returns true if the transformation can be made, and in that case
5072// returns x in V, and k in SatK.
5073static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
5074 SDValue &SatK)
5075{
5076 SDValue LHS = Op.getOperand(0);
5077 SDValue RHS = Op.getOperand(1);
5078 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5079 SDValue TrueVal = Op.getOperand(2);
5080 SDValue FalseVal = Op.getOperand(3);
5081
5082 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
5083 ? &RHS
5084 : nullptr;
5085
5086 // No constant operation in comparison, early out
5087 if (!K)
5088 return false;
5089
5090 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5091 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5092 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5093
5094 // If the constant in the compare does not match the constant in the select,
5095 // or the variable in the compare does not match the one in the select, early out.
5096 if (*K != KTmp || V != VTmp)
5097 return false;
5098
5099 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5100 SatK = *K;
5101 return true;
5102 }
5103
5104 return false;
5105}
5106
5107bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5108 if (VT == MVT::f32)
5109 return !Subtarget->hasVFP2Base();
5110 if (VT == MVT::f64)
5111 return !Subtarget->hasFP64();
5112 if (VT == MVT::f16)
5113 return !Subtarget->hasFullFP16();
5114 return false;
5115}
5116
5117SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5118 EVT VT = Op.getValueType();
5119 SDLoc dl(Op);
5120
5121 // Try to convert two saturating conditional selects into a single SSAT
5122 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5123 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5124 return SatValue;
5125
5126 // Try to convert expressions of the form x < k ? k : x (and similar forms)
5127 // into more efficient bit operations, which is possible when k is 0 or -1
5128 // On ARM and Thumb-2 which have flexible operand 2 this will result in
5129 // single instructions. On Thumb the shift and the bit operation will be two
5130 // instructions.
5131 // Only allow this transformation on full-width (32-bit) operations
5132 SDValue LowerSatConstant;
5133 SDValue SatValue;
5134 if (VT == MVT::i32 &&
5135 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
5136 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5137 DAG.getConstant(31, dl, VT));
5138 if (isNullConstant(LowerSatConstant)) {
5139 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5140 DAG.getAllOnesConstant(dl, VT));
5141 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5142 } else if (isAllOnesConstant(LowerSatConstant))
5143 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5144 }
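// For example, with k == 0 the select 'x < 0 ? 0 : x' becomes x & ~(x >> 31):
// the arithmetic shift yields all ones for negative x, so the AND clears the
// result exactly when x is negative; on ARM/Thumb-2 this is a single BIC with
// an ASR-shifted operand.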
5145
5146 SDValue LHS = Op.getOperand(0);
5147 SDValue RHS = Op.getOperand(1);
5148 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5149 SDValue TrueVal = Op.getOperand(2);
5150 SDValue FalseVal = Op.getOperand(3);
5151 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5152 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5153 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
5154 if (Op.getValueType().isInteger()) {
5155
5156 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
5157 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
5158 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
5159 // Both require fewer instructions than a compare and conditional select.
5160 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TrueVal && RHSC &&
5161 RHSC->isZero() && CFVal && CFVal->isZero() &&
5162 LHS.getValueType() == RHS.getValueType()) {
5163 EVT VT = LHS.getValueType();
5164 SDValue Shift =
5165 DAG.getNode(ISD::SRA, dl, VT, LHS,
5166 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
5167
5168 if (CC == ISD::SETGT)
5169 Shift = DAG.getNOT(dl, Shift, VT);
5170
5171 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
5172 }
5173 }
5174
5175 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
5176 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
5177 unsigned TVal = CTVal->getZExtValue();
5178 unsigned FVal = CFVal->getZExtValue();
5179 unsigned Opcode = 0;
5180
5181 if (TVal == ~FVal) {
5182 Opcode = ARMISD::CSINV;
5183 } else if (TVal == ~FVal + 1) {
5184 Opcode = ARMISD::CSNEG;
5185 } else if (TVal + 1 == FVal) {
5186 Opcode = ARMISD::CSINC;
5187 } else if (TVal == FVal + 1) {
5188 Opcode = ARMISD::CSINC;
5189 std::swap(TrueVal, FalseVal);
5190 std::swap(TVal, FVal);
5191 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5192 }
5193
5194 if (Opcode) {
5195 // If one of the constants is cheaper than another, materialise the
5196 // cheaper one and let the csel generate the other.
5197 if (Opcode != ARMISD::CSINC &&
5198 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5199 std::swap(TrueVal, FalseVal);
5200 std::swap(TVal, FVal);
5201 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5202 }
5203
5204 // Attempt to use ZR, checking whether TVal is 0 and possibly inverting the
5205 // condition to get there. CSINC is not invertible like the other two
5206 // (~(~a) == a and -(-a) == a, but (a+1)+1 != a).
5207 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5208 std::swap(TrueVal, FalseVal);
5209 std::swap(TVal, FVal);
5210 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5211 }
5212
5213 // Drops F's value because we can get it by inverting/negating TVal.
5214 FalseVal = TrueVal;
5215
5216 SDValue ARMcc;
5217 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5218 EVT VT = TrueVal.getValueType();
5219 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5220 }
5221 }
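// Example of the CSEL-family selection above: 'cond ? 5 : 6' uses CSINC, since
// 5 + 1 == 6; only 5 is materialised, and the instruction produces 5 when the
// condition holds and 5 + 1 otherwise. CSINV covers value pairs related by
// bitwise NOT, and CSNEG pairs related by negation.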
5222
5223 if (isUnsupportedFloatingType(LHS.getValueType())) {
5224 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5225
5226 // If softenSetCCOperands only returned one value, we should compare it to
5227 // zero.
5228 if (!RHS.getNode()) {
5229 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5230 CC = ISD::SETNE;
5231 }
5232 }
5233
5234 if (LHS.getValueType() == MVT::i32) {
5235 // Try to generate VSEL on ARMv8.
5236 // The VSEL instruction can't use all the usual ARM condition
5237 // codes: it only has two bits to select the condition code, so it's
5238 // constrained to use only GE, GT, VS and EQ.
5239 //
5240 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5241 // swap the operands of the previous compare instruction (effectively
5242 // inverting the compare condition, swapping 'less' and 'greater') and
5243 // sometimes need to swap the operands to the VSEL (which inverts the
5244 // condition in the sense of firing whenever the previous condition didn't)
5245 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5246 TrueVal.getValueType() == MVT::f32 ||
5247 TrueVal.getValueType() == MVT::f64)) {
5248 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5249 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5250 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5251 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5252 std::swap(TrueVal, FalseVal);
5253 }
5254 }
5255
5256 SDValue ARMcc;
5257 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5258 // Choose GE over PL, which vsel does not support
5259 if (ARMcc->getAsZExtVal() == ARMCC::PL)
5260 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5261 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5262 }
5263
5264 ARMCC::CondCodes CondCode, CondCode2;
5265 FPCCToARMCC(CC, CondCode, CondCode2);
5266
5267 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5268 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5269 // must use VSEL (limited condition codes), due to not having conditional f16
5270 // moves.
5271 if (Subtarget->hasFPARMv8Base() &&
5272 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5273 (TrueVal.getValueType() == MVT::f16 ||
5274 TrueVal.getValueType() == MVT::f32 ||
5275 TrueVal.getValueType() == MVT::f64)) {
5276 bool swpCmpOps = false;
5277 bool swpVselOps = false;
5278 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5279
5280 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5281 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5282 if (swpCmpOps)
5283 std::swap(LHS, RHS);
5284 if (swpVselOps)
5285 std::swap(TrueVal, FalseVal);
5286 }
5287 }
5288
5289 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5290 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5291 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5292 if (CondCode2 != ARMCC::AL) {
5293 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5294 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, Cmp, DAG);
5295 }
5296 return Result;
5297}
5298
5299/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5300/// to morph to an integer compare sequence.
5301static bool canChangeToInt(SDValue Op, bool &SeenZero,
5302 const ARMSubtarget *Subtarget) {
5303 SDNode *N = Op.getNode();
5304 if (!N->hasOneUse())
5305 // Otherwise it requires moving the value from fp to integer registers.
5306 return false;
5307 if (!N->getNumValues())
5308 return false;
5309 EVT VT = Op.getValueType();
5310 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5311 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5312 // vmrs are very slow, e.g. cortex-a8.
5313 return false;
5314
5315 if (isFloatingPointZero(Op)) {
5316 SeenZero = true;
5317 return true;
5318 }
5319 return ISD::isNormalLoad(N);
5320}
5321
5322static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
5323 if (isFloatingPointZero(Op))
5324 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5325
5326 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
5327 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5328 Ld->getPointerInfo(), Ld->getAlign(),
5329 Ld->getMemOperand()->getFlags());
5330
5331 llvm_unreachable("Unknown VFP cmp argument!");
5332}
5333
5334static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
5335 SDValue &RetVal1, SDValue &RetVal2) {
5336 SDLoc dl(Op);
5337
5338 if (isFloatingPointZero(Op)) {
5339 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5340 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5341 return;
5342 }
5343
5344 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5345 SDValue Ptr = Ld->getBasePtr();
5346 RetVal1 =
5347 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5348 Ld->getAlign(), Ld->getMemOperand()->getFlags());
5349
5350 EVT PtrType = Ptr.getValueType();
5351 SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5352 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5353 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5354 Ld->getPointerInfo().getWithOffset(4),
5355 commonAlignment(Ld->getAlign(), 4),
5356 Ld->getMemOperand()->getFlags());
5357 return;
5358 }
5359
5360 llvm_unreachable("Unknown VFP cmp argument!");
5361}
5362
5363/// OptimizeVFPBrcond - With nnan and without daz, it's legal to optimize some
5364/// f32 and even f64 comparisons to integer ones.
5365SDValue
5366ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5367 SDValue Chain = Op.getOperand(0);
5368 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5369 SDValue LHS = Op.getOperand(2);
5370 SDValue RHS = Op.getOperand(3);
5371 SDValue Dest = Op.getOperand(4);
5372 SDLoc dl(Op);
5373
5374 bool LHSSeenZero = false;
5375 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5376 bool RHSSeenZero = false;
5377 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5378 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5379 // If the comparison is marked nnan and there are no other uses of
5380 // the CMP operands, and the condition code is EQ or NE, we can optimize it
5381 // to an integer comparison.
5382 if (CC == ISD::SETOEQ)
5383 CC = ISD::SETEQ;
5384 else if (CC == ISD::SETUNE)
5385 CC = ISD::SETNE;
5386
5387 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5388 SDValue ARMcc;
5389 if (LHS.getValueType() == MVT::f32) {
5390 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5391 bitcastf32Toi32(LHS, DAG), Mask);
5392 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5393 bitcastf32Toi32(RHS, DAG), Mask);
5394 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5395 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5396 Cmp);
5397 }
5398
5399 SDValue LHS1, LHS2;
5400 SDValue RHS1, RHS2;
5401 expandf64Toi32(LHS, DAG, LHS1, LHS2);
5402 expandf64Toi32(RHS, DAG, RHS1, RHS2);
5403 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5404 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5405 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5406 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5407 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5408 return DAG.getNode(ARMISD::BCC_i64, dl, MVT::Other, Ops);
5409 }
5410
5411 return SDValue();
5412}
5413
5414// Generate CMP + CMOV for integer abs.
5415SDValue ARMTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5416 SDLoc DL(Op);
5417
5418 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, MVT::i32);
5419
5420 // Generate CMP & CMOV.
5421 SDValue Cmp = DAG.getNode(ARMISD::CMP, DL, FlagsVT, Op.getOperand(0),
5422 DAG.getConstant(0, DL, MVT::i32));
5423 return DAG.getNode(ARMISD::CMOV, DL, MVT::i32, Op.getOperand(0), Neg,
5424 DAG.getConstant(ARMCC::MI, DL, MVT::i32), Cmp);
5425}
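// i.e. abs(x) = (x < 0) ? 0 - x : x, with the choice made on the MI (negative)
// flag produced by CMP x, #0, so no branch is needed.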
5426
5427SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5428 SDValue Chain = Op.getOperand(0);
5429 SDValue Cond = Op.getOperand(1);
5430 SDValue Dest = Op.getOperand(2);
5431 SDLoc dl(Op);
5432
5433 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5434 // instruction.
5435 unsigned Opc = Cond.getOpcode();
5436 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5437 !Subtarget->isThumb1Only();
5438 if (Cond.getResNo() == 1 &&
5439 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5440 Opc == ISD::USUBO || OptimizeMul)) {
5441 // Only lower legal XALUO ops.
5442 if (!isTypeLegal(Cond->getValueType(0)))
5443 return SDValue();
5444
5445 // The actual operation with overflow check.
5446 SDValue Value, OverflowCmp;
5447 SDValue ARMcc;
5448 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5449
5450 // Reverse the condition code.
5451 ARMCC::CondCodes CondCode =
5452 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5453 CondCode = ARMCC::getOppositeCondition(CondCode);
5454 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5455
5456 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5457 OverflowCmp);
5458 }
5459
5460 return SDValue();
5461}
5462
5463SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5464 SDValue Chain = Op.getOperand(0);
5465 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5466 SDValue LHS = Op.getOperand(2);
5467 SDValue RHS = Op.getOperand(3);
5468 SDValue Dest = Op.getOperand(4);
5469 SDLoc dl(Op);
5470
5471 if (isUnsupportedFloatingType(LHS.getValueType())) {
5472 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5473
5474 // If softenSetCCOperands only returned one value, we should compare it to
5475 // zero.
5476 if (!RHS.getNode()) {
5477 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5478 CC = ISD::SETNE;
5479 }
5480 }
5481
5482 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5483 // instruction.
5484 unsigned Opc = LHS.getOpcode();
5485 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5486 !Subtarget->isThumb1Only();
5487 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5488 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5489 Opc == ISD::USUBO || OptimizeMul) &&
5490 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5491 // Only lower legal XALUO ops.
5492 if (!isTypeLegal(LHS->getValueType(0)))
5493 return SDValue();
5494
5495 // The actual operation with overflow check.
5496 SDValue Value, OverflowCmp;
5497 SDValue ARMcc;
5498 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5499
5500 if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5501 // Reverse the condition code.
5502 ARMCC::CondCodes CondCode =
5503 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5504 CondCode = ARMCC::getOppositeCondition(CondCode);
5505 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5506 }
5507
5508 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5509 OverflowCmp);
5510 }
5511
5512 if (LHS.getValueType() == MVT::i32) {
5513 SDValue ARMcc;
5514 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5515 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
5516 }
5517
5518 SDNodeFlags Flags = Op->getFlags();
5519 if (Flags.hasNoNaNs() &&
5520 DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
5521 DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE() &&
5522 (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
5523 CC == ISD::SETUNE)) {
5524 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5525 return Result;
5526 }
5527
5528 ARMCC::CondCodes CondCode, CondCode2;
5529 FPCCToARMCC(CC, CondCode, CondCode2);
5530
5531 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5532 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5533 SDValue Ops[] = {Chain, Dest, ARMcc, Cmp};
5534 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5535 if (CondCode2 != ARMCC::AL) {
5536 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5537 SDValue Ops[] = {Res, Dest, ARMcc, Cmp};
5538 Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5539 }
5540 return Res;
5541}
5542
5543SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5544 SDValue Chain = Op.getOperand(0);
5545 SDValue Table = Op.getOperand(1);
5546 SDValue Index = Op.getOperand(2);
5547 SDLoc dl(Op);
5548
5549 EVT PTy = getPointerTy(DAG.getDataLayout());
5550 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5551 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5552 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5553 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5554 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5555 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5556 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
5557 // which does another jump to the destination. This also makes it easier
5558 // to translate it to TBB / TBH later (Thumb2 only).
5559 // FIXME: This might not work if the function is extremely large.
5560 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5561 Addr, Op.getOperand(2), JTI);
5562 }
5563 if (isPositionIndependent() || Subtarget->isROPI()) {
5564 Addr =
5565 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5566 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5567 Chain = Addr.getValue(1);
5568 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5569 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5570 } else {
5571 Addr =
5572 DAG.getLoad(PTy, dl, Chain, Addr,
5573 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5574 Chain = Addr.getValue(1);
5575 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5576 }
5577}
5578
5579static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
5580 EVT VT = Op.getValueType();
5581 SDLoc dl(Op);
5582
5583 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5584 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5585 return Op;
5586 return DAG.UnrollVectorOp(Op.getNode());
5587 }
5588
5589 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5590
5591 EVT NewTy;
5592 const EVT OpTy = Op.getOperand(0).getValueType();
5593 if (OpTy == MVT::v4f32)
5594 NewTy = MVT::v4i32;
5595 else if (OpTy == MVT::v4f16 && HasFullFP16)
5596 NewTy = MVT::v4i16;
5597 else if (OpTy == MVT::v8f16 && HasFullFP16)
5598 NewTy = MVT::v8i16;
5599 else
5600 llvm_unreachable("Invalid type for custom lowering!");
5601
5602 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5603 return DAG.UnrollVectorOp(Op.getNode());
5604
5605 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5606 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5607}
5608
5609SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5610 EVT VT = Op.getValueType();
5611 if (VT.isVector())
5612 return LowerVectorFP_TO_INT(Op, DAG);
5613
5614 bool IsStrict = Op->isStrictFPOpcode();
5615 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5616
5617 if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5618 RTLIB::Libcall LC;
5619 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5620 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5621 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5622 Op.getValueType());
5623 else
5624 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5625 Op.getValueType());
5626 SDLoc Loc(Op);
5627 MakeLibCallOptions CallOptions;
5628 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
5629 SDValue Result;
5630 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5631 CallOptions, Loc, Chain);
5632 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5633 }
5634
5635 // FIXME: Remove this when we have strict fp instruction selection patterns
5636 if (IsStrict) {
5637 SDLoc Loc(Op);
5638 SDValue Result =
5639 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
5640 : ISD::FP_TO_UINT,
5641 Loc, Op.getValueType(), SrcVal);
5642 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5643 }
5644
5645 return Op;
5646}
5647
5648static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
5649 const ARMSubtarget *Subtarget) {
5650 EVT VT = Op.getValueType();
5651 EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5652 EVT FromVT = Op.getOperand(0).getValueType();
5653
5654 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
5655 return Op;
5656 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
5657 Subtarget->hasFP64())
5658 return Op;
5659 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
5660 Subtarget->hasFullFP16())
5661 return Op;
5662 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
5663 Subtarget->hasMVEFloatOps())
5664 return Op;
5665 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
5666 Subtarget->hasMVEFloatOps())
5667 return Op;
5668
5669 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
5670 return SDValue();
5671
5672 SDLoc DL(Op);
5673 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
5674 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
5675 SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
5676 DAG.getValueType(VT.getScalarType()));
5677 SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
5678 DAG.getConstant((1 << BW) - 1, DL, VT));
5679 if (IsSigned)
5680 Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
5681 DAG.getSignedConstant(-(1 << BW), DL, VT));
5682 return Max;
5683}
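// Example for the clamping above: a signed saturate to 8 bits uses BW = 7, so
// the converted vector is clamped with SMIN/SMAX to [-(1 << 7), (1 << 7) - 1],
// i.e. [-128, 127]; the unsigned case uses BW = 8 and only the UMIN bound.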
5684
5685static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5686 EVT VT = Op.getValueType();
5687 SDLoc dl(Op);
5688
5689 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5690 if (VT.getVectorElementType() == MVT::f32)
5691 return Op;
5692 return DAG.UnrollVectorOp(Op.getNode());
5693 }
5694
5695 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5696 Op.getOperand(0).getValueType() == MVT::v8i16) &&
5697 "Invalid type for custom lowering!");
5698
5699 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5700
5701 EVT DestVecType;
5702 if (VT == MVT::v4f32)
5703 DestVecType = MVT::v4i32;
5704 else if (VT == MVT::v4f16 && HasFullFP16)
5705 DestVecType = MVT::v4i16;
5706 else if (VT == MVT::v8f16 && HasFullFP16)
5707 DestVecType = MVT::v8i16;
5708 else
5709 return DAG.UnrollVectorOp(Op.getNode());
5710
5711 unsigned CastOpc;
5712 unsigned Opc;
5713 switch (Op.getOpcode()) {
5714 default: llvm_unreachable("Invalid opcode!");
5715 case ISD::SINT_TO_FP:
5716 CastOpc = ISD::SIGN_EXTEND;
5717 Opc = ISD::SINT_TO_FP;
5718 break;
5719 case ISD::UINT_TO_FP:
5720 CastOpc = ISD::ZERO_EXTEND;
5721 Opc = ISD::UINT_TO_FP;
5722 break;
5723 }
5724
5725 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5726 return DAG.getNode(Opc, dl, VT, Op);
5727}
5728
5729SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5730 EVT VT = Op.getValueType();
5731 if (VT.isVector())
5732 return LowerVectorINT_TO_FP(Op, DAG);
5733 if (isUnsupportedFloatingType(VT)) {
5734 RTLIB::Libcall LC;
5735 if (Op.getOpcode() == ISD::SINT_TO_FP)
5736 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5737 Op.getValueType());
5738 else
5739 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5740 Op.getValueType());
5741 MakeLibCallOptions CallOptions;
5742 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5743 CallOptions, SDLoc(Op)).first;
5744 }
5745
5746 return Op;
5747}
5748
5749SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5750 // Implement fcopysign with a fabs and a conditional fneg.
5751 SDValue Tmp0 = Op.getOperand(0);
5752 SDValue Tmp1 = Op.getOperand(1);
5753 SDLoc dl(Op);
5754 EVT VT = Op.getValueType();
5755 EVT SrcVT = Tmp1.getValueType();
5756 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
5757 Tmp0.getOpcode() == ARMISD::VMOVDRR;
5758 bool UseNEON = !InGPR && Subtarget->hasNEON();
5759
5760 if (UseNEON) {
5761 // Use VBSL to copy the sign bit.
5762 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
5763 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
5764 DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
5765 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
5766 if (VT == MVT::f64)
5767 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5768 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
5769 DAG.getConstant(32, dl, MVT::i32));
5770 else /*if (VT == MVT::f32)*/
5771 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
5772 if (SrcVT == MVT::f32) {
5773 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
5774 if (VT == MVT::f64)
5775 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5776 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
5777 DAG.getConstant(32, dl, MVT::i32));
5778 } else if (VT == MVT::f32)
5779 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
5780 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
5781 DAG.getConstant(32, dl, MVT::i32));
5782 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
5783 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
5784
5785 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
5786 dl, MVT::i32);
5787 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
5788 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
5789 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
5790
5791 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
5792 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
5793 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
5794 if (VT == MVT::f32) {
5795 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
5796 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
5797 DAG.getConstant(0, dl, MVT::i32));
5798 } else {
5799 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
5800 }
5801
5802 return Res;
5803 }
5804
5805 // Bitcast operand 1 to i32.
5806 if (SrcVT == MVT::f64)
5807 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5808 Tmp1).getValue(1);
5809 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
5810
5811 // Or in the signbit with integer operations.
5812 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
5813 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5814 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
5815 if (VT == MVT::f32) {
5816 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
5817 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
5818 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
5819 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
5820 }
5821
5822 // f64: Or the high part with signbit and then combine two parts.
5823 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5824 Tmp0);
5825 SDValue Lo = Tmp0.getValue(0);
5826 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
5827 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
5828 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
5829}
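// Summary of the scalar path above: copysign(x, y) for f32 is computed as
// (bitcast(x) & 0x7fffffff) | (bitcast(y) & 0x80000000); for f64 the same
// masking is applied to the high word only, and the low word is passed through.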
5830
5831SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
5832 MachineFunction &MF = DAG.getMachineFunction();
5833 MachineFrameInfo &MFI = MF.getFrameInfo();
5834 MFI.setReturnAddressIsTaken(true);
5835
5836 EVT VT = Op.getValueType();
5837 SDLoc dl(Op);
5838 unsigned Depth = Op.getConstantOperandVal(0);
5839 if (Depth) {
5840 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
5841 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
5842 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
5843 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
5844 MachinePointerInfo());
5845 }
5846
5847 // Return LR, which contains the return address. Mark it an implicit live-in.
5848 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
5849 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
5850}
5851
5852SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
5853 const ARMBaseRegisterInfo &ARI =
5854 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
5855 MachineFunction &MF = DAG.getMachineFunction();
5856 MachineFrameInfo &MFI = MF.getFrameInfo();
5857 MFI.setFrameAddressIsTaken(true);
5858
5859 EVT VT = Op.getValueType();
5860 SDLoc dl(Op); // FIXME probably not meaningful
5861 unsigned Depth = Op.getConstantOperandVal(0);
5862 Register FrameReg = ARI.getFrameRegister(MF);
5863 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
5864 while (Depth--)
5865 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
5866 MachinePointerInfo());
5867 return FrameAddr;
5868}
5869
5870// FIXME? Maybe this could be a TableGen attribute on some registers and
5871// this table could be generated automatically from RegInfo.
5872Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
5873 const MachineFunction &MF) const {
5874 return StringSwitch<Register>(RegName)
5875 .Case("sp", ARM::SP)
5876 .Default(Register());
5877}
5878
5879// The result is a 64-bit value, so split it into two 32-bit values and return
5880// them as a pair of values.
5881static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
5882 SelectionDAG &DAG) {
5883 SDLoc DL(N);
5884
5885 // This function is only supposed to be called for i64 type destination.
5886 assert(N->getValueType(0) == MVT::i64
5887 && "ExpandREAD_REGISTER called for non-i64 type result.");
5888
5889 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
5890 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
5891 N->getOperand(0),
5892 N->getOperand(1));
5893
5894 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
5895 Read.getValue(1)));
5896 Results.push_back(Read.getValue(2)); // Chain
5897}
5898
5899/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
5900/// When \p DstVT, the destination type of \p BC, is on the vector
5901/// register bank and the source of bitcast, \p Op, operates on the same bank,
5902/// it might be possible to combine them, such that everything stays on the
5903/// vector register bank.
5904/// \return The node that would replace \p BC, if the combine
5905/// is possible.
5906static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
5907 SelectionDAG &DAG) {
5908 SDValue Op = BC->getOperand(0);
5909 EVT DstVT = BC->getValueType(0);
5910
5911 // The only vector instruction that can produce a scalar (remember,
5912 // since the bitcast was about to be turned into VMOVDRR, the source
5913 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
5914 // Moreover, we can do this combine only if there is one use.
5915 // Finally, if the destination type is not a vector, there is not
5916 // much point in forcing everything onto the vector bank.
5917 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5918 !Op.hasOneUse())
5919 return SDValue();
5920
5921 // If the index is not constant, we will introduce an additional
5922 // multiply that will stick.
5923 // Give up in that case.
5924 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
5925 if (!Index)
5926 return SDValue();
5927 unsigned DstNumElt = DstVT.getVectorNumElements();
5928
5929 // Compute the new index.
5930 const APInt &APIntIndex = Index->getAPIntValue();
5931 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
5932 NewIndex *= APIntIndex;
5933 // Check if the new constant index fits into i32.
5934 if (NewIndex.getBitWidth() > 32)
5935 return SDValue();
5936
5937 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
5938 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
5939 SDLoc dl(Op);
5940 SDValue ExtractSrc = Op.getOperand(0);
5941 EVT VecVT = EVT::getVectorVT(
5942 *DAG.getContext(), DstVT.getScalarType(),
5943 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
5944 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
5945 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
5946 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
5947}
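// Example of the combine above: (v2f32 bitcast (i64 extractelt v2i64 %src, 1))
// becomes (v2f32 extract_subvector (v4f32 bitcast %src), 2), keeping the value
// on the vector register bank instead of bouncing through a GPR pair.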
5948
5949/// ExpandBITCAST - If the target supports VFP, this function is called to
5950/// expand a bit convert where either the source or destination type is i64 to
5951/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
5952/// operand type is illegal (e.g., v2f32 for a target that doesn't support
5953/// vectors), since the legalizer won't know what to do with that.
5954SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
5955 const ARMSubtarget *Subtarget) const {
5956 SDLoc dl(N);
5957 SDValue Op = N->getOperand(0);
5958
5959 // This function is only supposed to be called for i16 and i64 types, either
5960 // as the source or destination of the bit convert.
5961 EVT SrcVT = Op.getValueType();
5962 EVT DstVT = N->getValueType(0);
5963
5964 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
5965 (DstVT == MVT::f16 || DstVT == MVT::bf16))
5966 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
5967 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
5968
5969 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
5970 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
5971 if (Subtarget->hasFullFP16() && !Subtarget->hasBF16())
5972 Op = DAG.getBitcast(MVT::f16, Op);
5973 return DAG.getNode(
5974 ISD::TRUNCATE, SDLoc(N), DstVT,
5975 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
5976 }
5977
5978 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
5979 return SDValue();
5980
5981 // Turn i64->f64 into VMOVDRR.
5982 if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
5983 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
5984 // if we can combine the bitcast with its source.
5985 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
5986 return Val;
5987 SDValue Lo, Hi;
5988 std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
5989 return DAG.getNode(ISD::BITCAST, dl, DstVT,
5990 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
5991 }
5992
5993 // Turn f64->i64 into VMOVRRD.
5994 if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
5995 SDValue Cvt;
5996 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
5997 SrcVT.getVectorNumElements() > 1)
5998 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
5999 DAG.getVTList(MVT::i32, MVT::i32),
6000 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6001 else
6002 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6003 DAG.getVTList(MVT::i32, MVT::i32), Op);
6004 // Merge the pieces into a single i64 value.
6005 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6006 }
6007
6008 return SDValue();
6009}
6010
6011/// getZeroVector - Returns a vector of specified type with all zero elements.
6012/// Zero vectors are used to represent vector negation and in those cases
6013/// will be implemented with the NEON VNEG instruction. However, VNEG does
6014/// not support i64 elements, so sometimes the zero vectors will need to be
6015/// explicitly constructed. Regardless, use a canonical VMOV to create the
6016/// zero vector.
6017static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6018 assert(VT.isVector() && "Expected a vector type");
6019 // The canonical modified immediate encoding of a zero vector is....0!
6020 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6021 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6022 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6023 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6024}
6025
6026/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
6027/// i32 values and takes a 2 x i32 value to shift plus a shift amount.
6028SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6029 SelectionDAG &DAG) const {
6030 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6031 EVT VT = Op.getValueType();
6032 unsigned VTBits = VT.getSizeInBits();
6033 SDLoc dl(Op);
6034 SDValue ShOpLo = Op.getOperand(0);
6035 SDValue ShOpHi = Op.getOperand(1);
6036 SDValue ShAmt = Op.getOperand(2);
6037 SDValue ARMcc;
6038 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6039
6040 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6041
6042 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6043 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6044 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6045 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6046 DAG.getConstant(VTBits, dl, MVT::i32));
6047 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6048 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6049 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6050 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6051 ISD::SETGE, ARMcc, DAG, dl);
6052 SDValue Lo =
6053 DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CmpLo);
6054
6055 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6056 SDValue HiBigShift = Opc == ISD::SRA
6057 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6058 DAG.getConstant(VTBits - 1, dl, VT))
6059 : DAG.getConstant(0, dl, VT);
6060 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6061 ISD::SETGE, ARMcc, DAG, dl);
6062 SDValue Hi =
6063 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6064
6065 SDValue Ops[2] = { Lo, Hi };
6066 return DAG.getMergeValues(Ops, dl);
6067}
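// Worked example of the selection above (i64 logical shift right, 32-bit
// parts): for a shift of 5, ExtraShAmt = -27 < 0, so Lo = (Lo >> 5) | (Hi << 27)
// and Hi = Hi >> 5; for a shift of 40, ExtraShAmt = 8 >= 0, so Lo = Hi >> 8 and
// Hi = 0 (or Hi >> 31 for an arithmetic shift).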
6068
6069/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6070/// i32 values and takes a 2 x i32 value to shift plus a shift amount.
6071SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6072 SelectionDAG &DAG) const {
6073 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6074 EVT VT = Op.getValueType();
6075 unsigned VTBits = VT.getSizeInBits();
6076 SDLoc dl(Op);
6077 SDValue ShOpLo = Op.getOperand(0);
6078 SDValue ShOpHi = Op.getOperand(1);
6079 SDValue ShAmt = Op.getOperand(2);
6080 SDValue ARMcc;
6081
6082 assert(Op.getOpcode() == ISD::SHL_PARTS);
6083 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6084 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6085 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6086 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6087 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6088
6089 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6090 DAG.getConstant(VTBits, dl, MVT::i32));
6091 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6092 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6093 ISD::SETGE, ARMcc, DAG, dl);
6094 SDValue Hi =
6095 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6096
6097 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6098 ISD::SETGE, ARMcc, DAG, dl);
6099 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6100 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6101 DAG.getConstant(0, dl, VT), ARMcc, CmpLo);
6102
6103 SDValue Ops[2] = { Lo, Hi };
6104 return DAG.getMergeValues(Ops, dl);
6105}
6106
6107SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6108 SelectionDAG &DAG) const {
6109 // The rounding mode is in bits 23:22 of the FPSCR.
6110 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
6111 // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3,
6112 // so that the shift and the AND get folded into a bitfield extract.
6113 SDLoc dl(Op);
6114 SDValue Chain = Op.getOperand(0);
6115 SDValue Ops[] = {Chain,
6116 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6117
6118 SDValue FPSCR =
6119 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6120 Chain = FPSCR.getValue(1);
6121 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6122 DAG.getConstant(1U << 22, dl, MVT::i32));
6123 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6124 DAG.getConstant(22, dl, MVT::i32));
6125 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6126 DAG.getConstant(3, dl, MVT::i32));
6127 return DAG.getMergeValues({And, Chain}, dl);
6128}
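// Example of the formula above: if FPSCR[23:22] == 2 (round towards minus
// infinity), ((FPSCR + (1 << 22)) >> 22) & 3 == 3, the FLT_ROUNDS value for
// 'downward'; mode 3 (round towards zero) wraps around to 0, matching the 3->0
// entry of the mapping.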
6129
6130SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6131 SelectionDAG &DAG) const {
6132 SDLoc DL(Op);
6133 SDValue Chain = Op->getOperand(0);
6134 SDValue RMValue = Op->getOperand(1);
6135
6136 // The rounding mode is in bits 23:22 of the FPSCR.
6137 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6138 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6139 // (((arg - 1) & 3) << 22).
6140 //
6141 // It is expected that the argument of llvm.set.rounding is within the
6142 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is the
6143 // responsibility of the code that generates llvm.set.rounding to ensure this
6144 // condition.
6145
6146 // Calculate new value of FPSCR[23:22].
6147 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6148 DAG.getConstant(1, DL, MVT::i32));
6149 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6150 DAG.getConstant(0x3, DL, MVT::i32));
6151 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6152 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6153
6154 // Get current value of FPSCR.
6155 SDValue Ops[] = {Chain,
6156 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6157 SDValue FPSCR =
6158 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6159 Chain = FPSCR.getValue(1);
6160 FPSCR = FPSCR.getValue(0);
6161
6162 // Put new rounding mode into FPSCR[23:22].
6163 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6164 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6165 DAG.getConstant(RMMask, DL, MVT::i32));
6166 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6167 SDValue Ops2[] = {
6168 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6169 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6170}
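// Example of the inverse mapping above: llvm.set.rounding(0) ('toward zero')
// computes ((0 - 1) & 3) == 3, the ARM RZ encoding, which is then shifted into
// FPSCR[23:22]; llvm.set.rounding(1) ('to nearest') yields 0, the RN encoding.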
6171
6172SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6173 SelectionDAG &DAG) const {
6174 SDLoc DL(Op);
6175 SDValue Chain = Op->getOperand(0);
6176 SDValue Mode = Op->getOperand(1);
6177
6178 // Generate nodes to build:
6179 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6180 SDValue Ops[] = {Chain,
6181 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6182 SDValue FPSCR =
6183 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6184 Chain = FPSCR.getValue(1);
6185 FPSCR = FPSCR.getValue(0);
6186
6187 SDValue FPSCRMasked =
6188 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6189 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6190 SDValue InputMasked =
6191 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6192 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6193 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6194
6195 SDValue Ops2[] = {
6196 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6197 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6198}
6199
6200SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
6201 SelectionDAG &DAG) const {
6202 SDLoc DL(Op);
6203 SDValue Chain = Op->getOperand(0);
6204
6205 // To get the default FP mode all control bits are cleared:
6206 // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
6207 SDValue Ops[] = {Chain,
6208 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6209 SDValue FPSCR =
6210 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6211 Chain = FPSCR.getValue(1);
6212 FPSCR = FPSCR.getValue(0);
6213
6214 SDValue FPSCRMasked = DAG.getNode(
6215 ISD::AND, DL, MVT::i32, FPSCR,
6216 DAG.getConstant(ARM::FPStatusBits | ARM::FPReservedBits, DL, MVT::i32));
6217 SDValue Ops2[] = {Chain,
6218 DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
6219 FPSCRMasked};
6220 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6221}
6222
6223static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
6224 const ARMSubtarget *ST) {
6225 SDLoc dl(N);
6226 EVT VT = N->getValueType(0);
6227 if (VT.isVector() && ST->hasNEON()) {
6228
6229 // Compute the least significant set bit: LSB = X & -X
6230 SDValue X = N->getOperand(0);
6231 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6232 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
6233
6234 EVT ElemTy = VT.getVectorElementType();
6235
6236 if (ElemTy == MVT::i8) {
6237 // Compute with: cttz(x) = ctpop(lsb - 1)
6238 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6239 DAG.getTargetConstant(1, dl, ElemTy));
6240 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6241 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6242 }
6243
6244 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6245 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6246 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
6247 unsigned NumBits = ElemTy.getSizeInBits();
6248 SDValue WidthMinus1 =
6249 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6250 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6251 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6252 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6253 }
6254
6255 // Compute with: cttz(x) = ctpop(lsb - 1)
6256
6257 // Compute LSB - 1.
6258 SDValue Bits;
6259 if (ElemTy == MVT::i64) {
6260 // Load constant 0xffff'ffff'ffff'ffff to register.
6261 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6262 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6263 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6264 } else {
6265 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6266 DAG.getTargetConstant(1, dl, ElemTy));
6267 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6268 }
6269 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6270 }
6271
6272 if (!ST->hasV6T2Ops())
6273 return SDValue();
6274
6275 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6276 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6277}
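// Worked example of the ctpop-based expansion above: for an i8 lane holding
// x = 0b00010100, LSB = x & -x = 0b00000100, LSB - 1 = 0b00000011, and
// ctpop(0b00000011) = 2 = cttz(x).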
6278
6279static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
6280 const ARMSubtarget *ST) {
6281 EVT VT = N->getValueType(0);
6282 SDLoc DL(N);
6283
6284 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6285 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6286 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6287 "Unexpected type for custom ctpop lowering");
6288
6289 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6290 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6291 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6292 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6293
6294 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
6295 unsigned EltSize = 8;
6296 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6297 while (EltSize != VT.getScalarSizeInBits()) {
6298 SmallVector<SDValue, 8> Ops;
6299 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6300 TLI.getPointerTy(DAG.getDataLayout())));
6301 Ops.push_back(Res);
6302
6303 EltSize *= 2;
6304 NumElts /= 2;
6305 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6306 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6307 }
6308
6309 return Res;
6310}
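
// Illustration (standalone sketch, not used by the lowering above; assumes
// C++20 <bit>): the same pairwise-widening idea on a scalar. The per-byte
// popcount plays the role of the v8i8/v16i8 CTPOP, and each summing step
// mirrors one vpaddl.u8 / vpaddl.u16 round.
#include <bit>
#include <cstdint>
static unsigned popcount32ViaByteWidening(uint32_t X) {
  unsigned Cnt8[4];                                            // i8 lane counts
  for (int I = 0; I < 4; ++I)
    Cnt8[I] = std::popcount((X >> (8 * I)) & 0xffu);
  unsigned Cnt16[2] = {Cnt8[0] + Cnt8[1], Cnt8[2] + Cnt8[3]};  // vpaddl.u8
  return Cnt16[0] + Cnt16[1];                                  // vpaddl.u16
}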
6311
6312/// getVShiftImm - Check if this is a valid build_vector for the immediate
6313/// operand of a vector shift operation, where all the elements of the
6314/// build_vector must have the same constant integer value.
6315static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6316 // Ignore bit_converts.
6317 while (Op.getOpcode() == ISD::BITCAST)
6318 Op = Op.getOperand(0);
6319 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
6320 APInt SplatBits, SplatUndef;
6321 unsigned SplatBitSize;
6322 bool HasAnyUndefs;
6323 if (!BVN ||
6324 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6325 ElementBits) ||
6326 SplatBitSize > ElementBits)
6327 return false;
6328 Cnt = SplatBits.getSExtValue();
6329 return true;
6330}
6331
6332/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6333/// operand of a vector shift left operation. That value must be in the range:
6334/// 0 <= Value < ElementBits for a left shift; or
6335/// 0 <= Value <= ElementBits for a long left shift.
6336static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6337 assert(VT.isVector() && "vector shift count is not a vector type");
6338 int64_t ElementBits = VT.getScalarSizeInBits();
6339 if (!getVShiftImm(Op, ElementBits, Cnt))
6340 return false;
6341 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6342}
6343
6344/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6345/// operand of a vector shift right operation. For a shift opcode, the value
6346/// is positive, but for an intrinsic the value count must be negative. The
6347/// absolute value must be in the range:
6348/// 1 <= |Value| <= ElementBits for a right shift; or
6349/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6350static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6351 int64_t &Cnt) {
6352 assert(VT.isVector() && "vector shift count is not a vector type");
6353 int64_t ElementBits = VT.getScalarSizeInBits();
6354 if (!getVShiftImm(Op, ElementBits, Cnt))
6355 return false;
6356 if (!isIntrinsic)
6357 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6358 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6359 Cnt = -Cnt;
6360 return true;
6361 }
6362 return false;
6363}
6364
6365static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
6366 const ARMSubtarget *ST) {
6367 EVT VT = N->getValueType(0);
6368 SDLoc dl(N);
6369 int64_t Cnt;
6370
6371 if (!VT.isVector())
6372 return SDValue();
6373
6374 // We essentially have two forms here: shift by an immediate and shift by a
6375 // vector register (there is also a shift by a GPR, but that is handled by a
6376 // tablegen pattern). We cannot easily match shift-by-immediate in tablegen,
6377 // so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6378 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6379 // signed or unsigned, and a negative shift indicates a shift right).
6380 if (N->getOpcode() == ISD::SHL) {
6381 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6382 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6383 DAG.getConstant(Cnt, dl, MVT::i32));
6384 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6385 N->getOperand(1));
6386 }
6387
6388 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6389 "unexpected vector shift opcode");
6390
6391 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6392 unsigned VShiftOpc =
6393 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6394 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6395 DAG.getConstant(Cnt, dl, MVT::i32));
6396 }
6397
6398 // Other right shifts we don't have operations for (we use a shift left by a
6399 // negative number).
6400 EVT ShiftVT = N->getOperand(1).getValueType();
6401 SDValue NegatedCount = DAG.getNode(
6402 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6403 unsigned VShiftOpc =
6404 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6405 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6406}
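
// Illustration (standalone sketch): the per-lane behaviour relied on above.
// NEON's register-form VSHL shifts left for positive counts and right for
// negative ones, so a right shift by register is emitted as a VSHL by the
// negated amount. Roughly, per unsigned lane (counts of magnitude >= 32 and
// the signed/arithmetic variant omitted for brevity):
#include <cstdint>
static uint32_t vshlU32Lane(uint32_t X, int8_t Count) {
  return Count >= 0 ? X << Count : X >> -Count;
}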
6407
6408static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
6409 const ARMSubtarget *ST) {
6410 EVT VT = N->getValueType(0);
6411 SDLoc dl(N);
6412
6413 // We can get here for a node like i32 = ISD::SHL i32, i64
6414 if (VT != MVT::i64)
6415 return SDValue();
6416
6417 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6418 N->getOpcode() == ISD::SHL) &&
6419 "Unknown shift to lower!");
6420
6421 unsigned ShOpc = N->getOpcode();
6422 if (ST->hasMVEIntegerOps()) {
6423 SDValue ShAmt = N->getOperand(1);
6424 unsigned ShPartsOpc = ARMISD::LSLL;
6425 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
6426
6427 // If the shift amount is not a constant and is wider than 64 bits, or is a
6428 // constant that is zero or at least 32, fall back to the default expansion.
6429 if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
6430 (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
6431 return SDValue();
6432
6433 // Extract the lower 32 bits of the shift amount if it's not an i32
6434 if (ShAmt->getValueType(0) != MVT::i32)
6435 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6436
6437 if (ShOpc == ISD::SRL) {
6438 if (!Con)
6439 // There is no t2LSRLr instruction so negate and perform an lsll if the
6440 // shift amount is in a register, emulating a right shift.
6441 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6442 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6443 else
6444 // Else generate an lsrl on the immediate shift amount
6445 ShPartsOpc = ARMISD::LSRL;
6446 } else if (ShOpc == ISD::SRA)
6447 ShPartsOpc = ARMISD::ASRL;
6448
6449 // Split Lower/Upper 32 bits of the destination/source
6450 SDValue Lo, Hi;
6451 std::tie(Lo, Hi) =
6452 DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6453 // Generate the shift operation as computed above
6454 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6455 ShAmt);
6456 // The upper 32 bits come from the second return value of lsll
6457 Hi = SDValue(Lo.getNode(), 1);
6458 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6459 }
6460
6461 // We only lower SRA and SRL by 1 here; all others use the generic lowering.
6462 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6463 return SDValue();
6464
6465 // If we are in thumb mode, we don't have RRX.
6466 if (ST->isThumb1Only())
6467 return SDValue();
6468
6469 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
6470 SDValue Lo, Hi;
6471 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6472
6473 // First, build a LSRS1/ASRS1 op, which shifts the top part by one and
6474 // captures the shifted out bit into a carry flag.
6475 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::LSRS1 : ARMISD::ASRS1;
6476 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, FlagsVT), Hi);
6477
6478 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6479 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6480
6481 // Merge the pieces into a single i64 value.
6482 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6483}
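
// Illustration (standalone sketch): the effect of the LSRS1 + RRX pair built
// above on a 64-bit value split into 32-bit halves, for a logical shift right
// by one. The SRA case is identical except that Hi is shifted arithmetically.
#include <cstdint>
static void lsr64ByOne(uint32_t &Lo, uint32_t &Hi) {
  uint32_t Carry = Hi & 1;          // bit shifted out by LSRS #1 lands in the carry flag
  Hi >>= 1;                         // LSRS Hi, Hi, #1
  Lo = (Lo >> 1) | (Carry << 31);   // RRX Lo: shift right, rotating the carry into bit 31
}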
6484
6485static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
6486 const ARMSubtarget *ST) {
6487 bool Invert = false;
6488 bool Swap = false;
6489 unsigned Opc = ARMCC::AL;
6490
6491 SDValue Op0 = Op.getOperand(0);
6492 SDValue Op1 = Op.getOperand(1);
6493 SDValue CC = Op.getOperand(2);
6494 EVT VT = Op.getValueType();
6495 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6496 SDLoc dl(Op);
6497
6498 EVT CmpVT;
6499 if (ST->hasNEON())
6500 CmpVT = VT.changeVectorElementTypeToInteger();
6501 else {
6502 assert(ST->hasMVEIntegerOps() &&
6503 "No hardware support for integer vector comparison!");
6504
6505 if (Op.getValueType().getVectorElementType() != MVT::i1)
6506 return SDValue();
6507
6508 // Make sure we expand floating point setcc to scalar if we do not have
6509 // mve.fp, so that we can handle them from there.
6510 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6511 return SDValue();
6512
6513 CmpVT = VT;
6514 }
6515
6516 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6517 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6518 // Special-case integer 64-bit equality comparisons. They aren't legal,
6519 // but they can be lowered with a few vector instructions.
6520 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6521 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6522 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6523 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6524 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6525 DAG.getCondCode(ISD::SETEQ));
6526 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6527 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6528 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6529 if (SetCCOpcode == ISD::SETNE)
6530 Merged = DAG.getNOT(dl, Merged, CmpVT);
6531 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6532 return Merged;
6533 }
6534
6535 if (CmpVT.getVectorElementType() == MVT::i64)
6536 // 64-bit comparisons are not legal in general.
6537 return SDValue();
6538
6539 if (Op1.getValueType().isFloatingPoint()) {
6540 switch (SetCCOpcode) {
6541 default: llvm_unreachable("Illegal FP comparison");
6542 case ISD::SETUNE:
6543 case ISD::SETNE:
6544 if (ST->hasMVEFloatOps()) {
6545 Opc = ARMCC::NE; break;
6546 } else {
6547 Invert = true; [[fallthrough]];
6548 }
6549 case ISD::SETOEQ:
6550 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6551 case ISD::SETOLT:
6552 case ISD::SETLT: Swap = true; [[fallthrough]];
6553 case ISD::SETOGT:
6554 case ISD::SETGT: Opc = ARMCC::GT; break;
6555 case ISD::SETOLE:
6556 case ISD::SETLE: Swap = true; [[fallthrough]];
6557 case ISD::SETOGE:
6558 case ISD::SETGE: Opc = ARMCC::GE; break;
6559 case ISD::SETUGE: Swap = true; [[fallthrough]];
6560 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6561 case ISD::SETUGT: Swap = true; [[fallthrough]];
6562 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6563 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6564 case ISD::SETONE: {
6565 // Expand this to (OLT | OGT).
6566 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6567 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6568 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6569 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6570 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6571 if (Invert)
6572 Result = DAG.getNOT(dl, Result, VT);
6573 return Result;
6574 }
6575 case ISD::SETUO: Invert = true; [[fallthrough]];
6576 case ISD::SETO: {
6577 // Expand this to (OLT | OGE).
6578 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6579 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6580 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6581 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6582 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6583 if (Invert)
6584 Result = DAG.getNOT(dl, Result, VT);
6585 return Result;
6586 }
6587 }
6588 } else {
6589 // Integer comparisons.
6590 switch (SetCCOpcode) {
6591 default: llvm_unreachable("Illegal integer comparison");
6592 case ISD::SETNE:
6593 if (ST->hasMVEIntegerOps()) {
6594 Opc = ARMCC::NE; break;
6595 } else {
6596 Invert = true; [[fallthrough]];
6597 }
6598 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6599 case ISD::SETLT: Swap = true; [[fallthrough]];
6600 case ISD::SETGT: Opc = ARMCC::GT; break;
6601 case ISD::SETLE: Swap = true; [[fallthrough]];
6602 case ISD::SETGE: Opc = ARMCC::GE; break;
6603 case ISD::SETULT: Swap = true; [[fallthrough]];
6604 case ISD::SETUGT: Opc = ARMCC::HI; break;
6605 case ISD::SETULE: Swap = true; [[fallthrough]];
6606 case ISD::SETUGE: Opc = ARMCC::HS; break;
6607 }
6608
6609 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6610 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6611 SDValue AndOp;
6612 if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6613 AndOp = Op0;
6614 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6615 AndOp = Op1;
6616
6617 // Ignore bitconvert.
6618 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6619 AndOp = AndOp.getOperand(0);
6620
6621 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6622 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6623 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6624 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6625 if (!Invert)
6626 Result = DAG.getNOT(dl, Result, VT);
6627 return Result;
6628 }
6629 }
6630 }
6631
6632 if (Swap)
6633 std::swap(Op0, Op1);
6634
6635 // If one of the operands is a constant vector zero, attempt to fold the
6636 // comparison to a specialized compare-against-zero form.
6637 if (ISD::isBuildVectorAllZeros(Op0.getNode()) &&
6638 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6639 Opc == ARMCC::NE)) {
6640 if (Opc == ARMCC::GE)
6641 Opc = ARMCC::LE;
6642 else if (Opc == ARMCC::GT)
6643 Opc = ARMCC::LT;
6644 std::swap(Op0, Op1);
6645 }
6646
6647 SDValue Result;
6648 if (ISD::isBuildVectorAllZeros(Op1.getNode()) &&
6649 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6650 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6651 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6652 DAG.getConstant(Opc, dl, MVT::i32));
6653 else
6654 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6655 DAG.getConstant(Opc, dl, MVT::i32));
6656
6657 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6658
6659 if (Invert)
6660 Result = DAG.getNOT(dl, Result, VT);
6661
6662 return Result;
6663}
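
// Illustration (standalone sketch): what the SETCC + VREV64 + AND sequence in
// the 64-bit equality special case computes, scalarised for one 64-bit lane.
// Each 32-bit half compares to all-ones or zero, VREV64 swaps the two halves,
// and the AND combines them so the lane is all-ones only if both halves match.
#include <cstdint>
static uint64_t eq64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t LoEq = (uint32_t)A == (uint32_t)B ? 0xffffffffu : 0u;
  uint32_t HiEq = (uint32_t)(A >> 32) == (uint32_t)(B >> 32) ? 0xffffffffu : 0u;
  uint32_t Both = LoEq & HiEq;
  return ((uint64_t)Both << 32) | Both;
}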
6664
6665static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
6666 SDValue LHS = Op.getOperand(0);
6667 SDValue RHS = Op.getOperand(1);
6668 SDValue Carry = Op.getOperand(2);
6669 SDValue Cond = Op.getOperand(3);
6670 SDLoc DL(Op);
6671
6672 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6673
6674 // ARMISD::SUBE expects a carry rather than the borrow that ISD::USUBO_CARRY
6675 // provides, so we have to invert the carry first.
6676 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6677 DAG.getConstant(1, DL, MVT::i32), Carry);
6678 // This converts the boolean value carry into the carry flag.
6679 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6680
6681 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6682 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6683
6684 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6685 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6686 SDValue ARMcc = DAG.getConstant(
6687 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6688 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6689 Cmp.getValue(1));
6690}
6691
6692/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6693/// valid vector constant for a NEON or MVE instruction with a "modified
6694/// immediate" operand (e.g., VMOV). If so, return the encoded value.
6695static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6696 unsigned SplatBitSize, SelectionDAG &DAG,
6697 const SDLoc &dl, EVT &VT, EVT VectorVT,
6698 VMOVModImmType type) {
6699 unsigned OpCmode, Imm;
6700 bool is128Bits = VectorVT.is128BitVector();
6701
6702 // SplatBitSize is set to the smallest size that splats the vector, so a
6703 // zero vector will always have SplatBitSize == 8. However, NEON modified
6704 // immediate instructions other than VMOV do not support the 8-bit encoding
6705 // of a zero vector, and the default encoding of zero is supposed to be the
6706 // 32-bit version.
6707 if (SplatBits == 0)
6708 SplatBitSize = 32;
6709
6710 switch (SplatBitSize) {
6711 case 8:
6712 if (type != VMOVModImm)
6713 return SDValue();
6714 // Any 1-byte value is OK. Op=0, Cmode=1110.
6715 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6716 OpCmode = 0xe;
6717 Imm = SplatBits;
6718 VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6719 break;
6720
6721 case 16:
6722 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6723 VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6724 if ((SplatBits & ~0xff) == 0) {
6725 // Value = 0x00nn: Op=x, Cmode=100x.
6726 OpCmode = 0x8;
6727 Imm = SplatBits;
6728 break;
6729 }
6730 if ((SplatBits & ~0xff00) == 0) {
6731 // Value = 0xnn00: Op=x, Cmode=101x.
6732 OpCmode = 0xa;
6733 Imm = SplatBits >> 8;
6734 break;
6735 }
6736 return SDValue();
6737
6738 case 32:
6739 // NEON's 32-bit VMOV supports splat values where:
6740 // * only one byte is nonzero, or
6741 // * the least significant byte is 0xff and the second byte is nonzero, or
6742 // * the least significant 2 bytes are 0xff and the third is nonzero.
6743 VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
6744 if ((SplatBits & ~0xff) == 0) {
6745 // Value = 0x000000nn: Op=x, Cmode=000x.
6746 OpCmode = 0;
6747 Imm = SplatBits;
6748 break;
6749 }
6750 if ((SplatBits & ~0xff00) == 0) {
6751 // Value = 0x0000nn00: Op=x, Cmode=001x.
6752 OpCmode = 0x2;
6753 Imm = SplatBits >> 8;
6754 break;
6755 }
6756 if ((SplatBits & ~0xff0000) == 0) {
6757 // Value = 0x00nn0000: Op=x, Cmode=010x.
6758 OpCmode = 0x4;
6759 Imm = SplatBits >> 16;
6760 break;
6761 }
6762 if ((SplatBits & ~0xff000000) == 0) {
6763 // Value = 0xnn000000: Op=x, Cmode=011x.
6764 OpCmode = 0x6;
6765 Imm = SplatBits >> 24;
6766 break;
6767 }
6768
6769 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6770 if (type == OtherModImm) return SDValue();
6771
6772 if ((SplatBits & ~0xffff) == 0 &&
6773 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
6774 // Value = 0x0000nnff: Op=x, Cmode=1100.
6775 OpCmode = 0xc;
6776 Imm = SplatBits >> 8;
6777 break;
6778 }
6779
6780 // cmode == 0b1101 is not supported for MVE VMVN
6781 if (type == MVEVMVNModImm)
6782 return SDValue();
6783
6784 if ((SplatBits & ~0xffffff) == 0 &&
6785 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
6786 // Value = 0x00nnffff: Op=x, Cmode=1101.
6787 OpCmode = 0xd;
6788 Imm = SplatBits >> 16;
6789 break;
6790 }
6791
6792 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
6793 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
6794 // VMOV.I32. A (very) minor optimization would be to replicate the value
6795 // and fall through here to test for a valid 64-bit splat. But, then the
6796 // caller would also need to check and handle the change in size.
6797 return SDValue();
6798
6799 case 64: {
6800 if (type != VMOVModImm)
6801 return SDValue();
6802 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
6803 uint64_t BitMask = 0xff;
6804 unsigned ImmMask = 1;
6805 Imm = 0;
6806 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
6807 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
6808 Imm |= ImmMask;
6809 } else if ((SplatBits & BitMask) != 0) {
6810 return SDValue();
6811 }
6812 BitMask <<= 8;
6813 ImmMask <<= 1;
6814 }
6815
6816 // Op=1, Cmode=1110.
6817 OpCmode = 0x1e;
6818 VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
6819 break;
6820 }
6821
6822 default:
6823 llvm_unreachable("unexpected size for isVMOVModifiedImm");
6824 }
6825
6826 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
6827 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
6828}
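
// Worked examples (derived from the cases above, before ARM_AM encoding):
//   8-bit splat 0xab -> Op=0, Cmode=1110, Imm=0xab
//   16-bit splat 0x3c00 -> Op=x, Cmode=101x, Imm=0x3c
//   32-bit splat 0x0000ab00 -> Op=x, Cmode=001x, Imm=0xab
//   64-bit splat 0x00ff00ff00ff00ff -> Op=1, Cmode=1110, Imm=0b01010101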
6829
6830SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
6831 const ARMSubtarget *ST) const {
6832 EVT VT = Op.getValueType();
6833 bool IsDouble = (VT == MVT::f64);
6834 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
6835 const APFloat &FPVal = CFP->getValueAPF();
6836
6837 // Prevent floating-point constants from using literal loads
6838 // when execute-only is enabled.
6839 if (ST->genExecuteOnly()) {
6840 // We shouldn't trigger this for v6m execute-only
6841 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
6842 "Unexpected architecture");
6843
6844 // If we can represent the constant as an immediate, don't lower it
6845 if (isFPImmLegal(FPVal, VT))
6846 return Op;
6847 // Otherwise, construct as integer, and move to float register
6848 APInt INTVal = FPVal.bitcastToAPInt();
6849 SDLoc DL(CFP);
6850 switch (VT.getSimpleVT().SimpleTy) {
6851 default:
6852 llvm_unreachable("Unknown floating point type!");
6853 break;
6854 case MVT::f64: {
6855 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
6856 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
6857 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
6858 }
6859 case MVT::f32:
6860 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
6861 DAG.getConstant(INTVal, DL, MVT::i32));
6862 }
6863 }
6864
6865 if (!ST->hasVFP3Base())
6866 return SDValue();
6867
6868 // Use the default (constant pool) lowering for double constants when we have
6869 // an SP-only FPU
6870 if (IsDouble && !Subtarget->hasFP64())
6871 return SDValue();
6872
6873 // Try splatting with a VMOV.f32...
6874 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
6875
6876 if (ImmVal != -1) {
6877 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
6878 // We have code in place to select a valid ConstantFP already, no need to
6879 // do any mangling.
6880 return Op;
6881 }
6882
6883 // It's a float and we are trying to use NEON operations where
6884 // possible. Lower it to a splat followed by an extract.
6885 SDLoc DL(Op);
6886 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
6887 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
6888 NewVal);
6889 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
6890 DAG.getConstant(0, DL, MVT::i32));
6891 }
6892
6893 // The rest of our options are NEON-only; make sure that's allowed before
6894 // proceeding.
6895 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
6896 return SDValue();
6897
6898 EVT VMovVT;
6899 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
6900
6901 // It wouldn't really be worth bothering for doubles except for one very
6902 // important value, which does happen to match: 0.0. So make sure we don't do
6903 // anything stupid.
6904 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
6905 return SDValue();
6906
6907 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
6908 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
6909 VMovVT, VT, VMOVModImm);
6910 if (NewVal != SDValue()) {
6911 SDLoc DL(Op);
6912 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
6913 NewVal);
6914 if (IsDouble)
6915 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6916
6917 // It's a float: cast and extract a vector element.
6918 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6919 VecConstant);
6920 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6921 DAG.getConstant(0, DL, MVT::i32));
6922 }
6923
6924 // Finally, try a VMVN.i32
6925 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
6926 VT, VMVNModImm);
6927 if (NewVal != SDValue()) {
6928 SDLoc DL(Op);
6929 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
6930
6931 if (IsDouble)
6932 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6933
6934 // It's a float: cast and extract a vector element.
6935 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6936 VecConstant);
6937 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6938 DAG.getConstant(0, DL, MVT::i32));
6939 }
6940
6941 return SDValue();
6942}
6943
6944// Check if a VEXT instruction can handle the shuffle mask when the
6945// vector sources of the shuffle are the same.
6946static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
6947 unsigned NumElts = VT.getVectorNumElements();
6948
6949 // Assume that the first shuffle index is not UNDEF. Fail if it is.
6950 if (M[0] < 0)
6951 return false;
6952
6953 Imm = M[0];
6954
6955 // If this is a VEXT shuffle, the immediate value is the index of the first
6956 // element. The other shuffle indices must be the successive elements after
6957 // the first one.
6958 unsigned ExpectedElt = Imm;
6959 for (unsigned i = 1; i < NumElts; ++i) {
6960 // Increment the expected index. If it wraps around, just follow it
6961 // back to index zero and keep going.
6962 ++ExpectedElt;
6963 if (ExpectedElt == NumElts)
6964 ExpectedElt = 0;
6965
6966 if (M[i] < 0) continue; // ignore UNDEF indices
6967 if (ExpectedElt != static_cast<unsigned>(M[i]))
6968 return false;
6969 }
6970
6971 return true;
6972}
6973
6974static bool isVEXTMask(ArrayRef<int> M, EVT VT,
6975 bool &ReverseVEXT, unsigned &Imm) {
6976 unsigned NumElts = VT.getVectorNumElements();
6977 ReverseVEXT = false;
6978
6979 // Assume that the first shuffle index is not UNDEF. Fail if it is.
6980 if (M[0] < 0)
6981 return false;
6982
6983 Imm = M[0];
6984
6985 // If this is a VEXT shuffle, the immediate value is the index of the first
6986 // element. The other shuffle indices must be the successive elements after
6987 // the first one.
6988 unsigned ExpectedElt = Imm;
6989 for (unsigned i = 1; i < NumElts; ++i) {
6990 // Increment the expected index. If it wraps around, it may still be
6991 // a VEXT but the source vectors must be swapped.
6992 ExpectedElt += 1;
6993 if (ExpectedElt == NumElts * 2) {
6994 ExpectedElt = 0;
6995 ReverseVEXT = true;
6996 }
6997
6998 if (M[i] < 0) continue; // ignore UNDEF indices
6999 if (ExpectedElt != static_cast<unsigned>(M[i]))
7000 return false;
7001 }
7002
7003 // Adjust the index value if the source operands will be swapped.
7004 if (ReverseVEXT)
7005 Imm -= NumElts;
7006
7007 return true;
7008}
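
// Worked examples (hypothetical masks): for v4i32 with V1 = {a,b,c,d} and
// V2 = {e,f,g,h} concatenated as {a,b,c,d,e,f,g,h}, the check above accepts:
//   <2,3,4,5> -> Imm = 2, ReverseVEXT = false (VEXT V1, V2, #2 = {c,d,e,f})
//   <6,7,0,1> -> wraps past 2*NumElts, so ReverseVEXT = true and Imm = 6-4 = 2
//               (VEXT V2, V1, #2 = {g,h,a,b})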
7009
7010static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7011 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7012 // range, then 0 is placed into the resulting vector. So pretty much any mask
7013 // of 8 elements can work here.
7014 return VT == MVT::v8i8 && M.size() == 8;
7015}
7016
7017static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7018 unsigned Index) {
7019 if (Mask.size() == Elements * 2)
7020 return Index / Elements;
7021 return Mask[Index] == 0 ? 0 : 1;
7022}
7023
7024// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7025// checking that pairs of elements in the shuffle mask represent the same index
7026// in each vector, incrementing the expected index by 2 at each step.
7027// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7028// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7029// v2={e,f,g,h}
7030// WhichResult gives the offset for each element in the mask based on which
7031// of the two results it belongs to.
7032//
7033// The transpose can be represented either as:
7034// result1 = shufflevector v1, v2, result1_shuffle_mask
7035// result2 = shufflevector v1, v2, result2_shuffle_mask
7036// where v1/v2 and the shuffle masks have the same number of elements
7037// (here WhichResult (see below) indicates which result is being checked)
7038//
7039// or as:
7040// results = shufflevector v1, v2, shuffle_mask
7041// where both results are returned in one vector and the shuffle mask has twice
7042// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
7043// want to check the low half and high half of the shuffle mask as if it were
7044// the other case
7045static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7046 unsigned EltSz = VT.getScalarSizeInBits();
7047 if (EltSz == 64)
7048 return false;
7049
7050 unsigned NumElts = VT.getVectorNumElements();
7051 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7052 return false;
7053
7054 // If the mask is twice as long as the input vector then we need to check the
7055 // upper and lower parts of the mask with a matching value for WhichResult
7056 // FIXME: A mask with only even values will be rejected in case the first
7057 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7058 // M[0] is used to determine WhichResult
7059 for (unsigned i = 0; i < M.size(); i += NumElts) {
7060 WhichResult = SelectPairHalf(NumElts, M, i);
7061 for (unsigned j = 0; j < NumElts; j += 2) {
7062 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7063 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7064 return false;
7065 }
7066 }
7067
7068 if (M.size() == NumElts*2)
7069 WhichResult = 0;
7070
7071 return true;
7072}
7073
7074/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7075/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7076/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7077static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7078 unsigned EltSz = VT.getScalarSizeInBits();
7079 if (EltSz == 64)
7080 return false;
7081
7082 unsigned NumElts = VT.getVectorNumElements();
7083 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7084 return false;
7085
7086 for (unsigned i = 0; i < M.size(); i += NumElts) {
7087 WhichResult = SelectPairHalf(NumElts, M, i);
7088 for (unsigned j = 0; j < NumElts; j += 2) {
7089 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7090 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7091 return false;
7092 }
7093 }
7094
7095 if (M.size() == NumElts*2)
7096 WhichResult = 0;
7097
7098 return true;
7099}
7100
7101// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7102// that the mask elements are either all even and in steps of size 2 or all odd
7103// and in steps of size 2.
7104// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7105// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7106// v2={e,f,g,h}
7107// Requires checks similar to those of isVTRNMask with respect to how the
7108// results are returned.
7109static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7110 unsigned EltSz = VT.getScalarSizeInBits();
7111 if (EltSz == 64)
7112 return false;
7113
7114 unsigned NumElts = VT.getVectorNumElements();
7115 if (M.size() != NumElts && M.size() != NumElts*2)
7116 return false;
7117
7118 for (unsigned i = 0; i < M.size(); i += NumElts) {
7119 WhichResult = SelectPairHalf(NumElts, M, i);
7120 for (unsigned j = 0; j < NumElts; ++j) {
7121 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7122 return false;
7123 }
7124 }
7125
7126 if (M.size() == NumElts*2)
7127 WhichResult = 0;
7128
7129 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7130 if (VT.is64BitVector() && EltSz == 32)
7131 return false;
7132
7133 return true;
7134}
7135
7136/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7137/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7138/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
7139static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7140 unsigned EltSz = VT.getScalarSizeInBits();
7141 if (EltSz == 64)
7142 return false;
7143
7144 unsigned NumElts = VT.getVectorNumElements();
7145 if (M.size() != NumElts && M.size() != NumElts*2)
7146 return false;
7147
7148 unsigned Half = NumElts / 2;
7149 for (unsigned i = 0; i < M.size(); i += NumElts) {
7150 WhichResult = SelectPairHalf(NumElts, M, i);
7151 for (unsigned j = 0; j < NumElts; j += Half) {
7152 unsigned Idx = WhichResult;
7153 for (unsigned k = 0; k < Half; ++k) {
7154 int MIdx = M[i + j + k];
7155 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7156 return false;
7157 Idx += 2;
7158 }
7159 }
7160 }
7161
7162 if (M.size() == NumElts*2)
7163 WhichResult = 0;
7164
7165 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7166 if (VT.is64BitVector() && EltSz == 32)
7167 return false;
7168
7169 return true;
7170}
7171
7172// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7173// that pairs of elements of the shufflemask represent the same index in each
7174// vector incrementing sequentially through the vectors.
7175// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7176// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7177// v2={e,f,g,h}
7178// Requires checks similar to those of isVTRNMask with respect to how the
7179// results are returned.
7180static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7181 unsigned EltSz = VT.getScalarSizeInBits();
7182 if (EltSz == 64)
7183 return false;
7184
7185 unsigned NumElts = VT.getVectorNumElements();
7186 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7187 return false;
7188
7189 for (unsigned i = 0; i < M.size(); i += NumElts) {
7190 WhichResult = SelectPairHalf(NumElts, M, i);
7191 unsigned Idx = WhichResult * NumElts / 2;
7192 for (unsigned j = 0; j < NumElts; j += 2) {
7193 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7194 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7195 return false;
7196 Idx += 1;
7197 }
7198 }
7199
7200 if (M.size() == NumElts*2)
7201 WhichResult = 0;
7202
7203 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7204 if (VT.is64BitVector() && EltSz == 32)
7205 return false;
7206
7207 return true;
7208}
7209
7210/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7211/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7212/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7213static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7214 unsigned EltSz = VT.getScalarSizeInBits();
7215 if (EltSz == 64)
7216 return false;
7217
7218 unsigned NumElts = VT.getVectorNumElements();
7219 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7220 return false;
7221
7222 for (unsigned i = 0; i < M.size(); i += NumElts) {
7223 WhichResult = SelectPairHalf(NumElts, M, i);
7224 unsigned Idx = WhichResult * NumElts / 2;
7225 for (unsigned j = 0; j < NumElts; j += 2) {
7226 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7227 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7228 return false;
7229 Idx += 1;
7230 }
7231 }
7232
7233 if (M.size() == NumElts*2)
7234 WhichResult = 0;
7235
7236 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7237 if (VT.is64BitVector() && EltSz == 32)
7238 return false;
7239
7240 return true;
7241}
7242
7243/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7244/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7245static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7246 unsigned &WhichResult,
7247 bool &isV_UNDEF) {
7248 isV_UNDEF = false;
7249 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7250 return ARMISD::VTRN;
7251 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7252 return ARMISD::VUZP;
7253 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7254 return ARMISD::VZIP;
7255
7256 isV_UNDEF = true;
7257 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7258 return ARMISD::VTRN;
7259 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7260 return ARMISD::VUZP;
7261 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7262 return ARMISD::VZIP;
7263
7264 return 0;
7265}
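
// Worked examples (hypothetical masks): for v4i32 inputs V1 = {a,b,c,d} and
// V2 = {e,f,g,h}, the single-result masks recognised above are:
//   VTRN: <0,4,2,6> -> {a,e,c,g} and <1,5,3,7> -> {b,f,d,h}
//   VUZP: <0,2,4,6> -> {a,c,e,g} and <1,3,5,7> -> {b,d,f,h}
//   VZIP: <0,4,1,5> -> {a,e,b,f} and <2,6,3,7> -> {c,g,d,h}
// with WhichResult = 0 for the first mask of each pair and 1 for the second.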
7266
7267/// \return true if this is a reverse operation on a vector.
7268static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7269 unsigned NumElts = VT.getVectorNumElements();
7270 // Make sure the mask has the right size.
7271 if (NumElts != M.size())
7272 return false;
7273
7274 // Look for <15, ..., 3, -1, 1, 0>.
7275 for (unsigned i = 0; i != NumElts; ++i)
7276 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7277 return false;
7278
7279 return true;
7280}
7281
7282static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7283 unsigned NumElts = VT.getVectorNumElements();
7284 // Make sure the mask has the right size.
7285 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7286 return false;
7287
7288 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7289 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7290 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7291 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7292 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7293 int Ofs = Top ? 1 : 0;
7294 int Upper = SingleSource ? 0 : NumElts;
7295 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7296 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7297 return false;
7298 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7299 return false;
7300 }
7301 return true;
7302}
7303
7304static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7305 unsigned NumElts = VT.getVectorNumElements();
7306 // Make sure the mask has the right size.
7307 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7308 return false;
7309
7310 // If Top
7311 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7312 // This inserts Input2 into Input1
7313 // else if not Top
7314 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7315 // This inserts Input1 into Input2
7316 unsigned Offset = Top ? 0 : 1;
7317 unsigned N = SingleSource ? 0 : NumElts;
7318 for (unsigned i = 0; i < NumElts; i += 2) {
7319 if (M[i] >= 0 && M[i] != (int)i)
7320 return false;
7321 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7322 return false;
7323 }
7324
7325 return true;
7326}
7327
7328static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7329 unsigned NumElts = ToVT.getVectorNumElements();
7330 if (NumElts != M.size())
7331 return false;
7332
7333 // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
7334 // looking for patterns of:
7335 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7336 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7337
7338 unsigned Off0 = rev ? NumElts / 2 : 0;
7339 unsigned Off1 = rev ? 0 : NumElts / 2;
7340 for (unsigned i = 0; i < NumElts; i += 2) {
7341 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7342 return false;
7343 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7344 return false;
7345 }
7346
7347 return true;
7348}
7349
7350// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7351// from a pair of inputs. For example:
7352// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7353// FP_ROUND(EXTRACT_ELT(Y, 0),
7354// FP_ROUND(EXTRACT_ELT(X, 1),
7355// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
7356static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
7357 const ARMSubtarget *ST) {
7358 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7359 if (!ST->hasMVEFloatOps())
7360 return SDValue();
7361
7362 SDLoc dl(BV);
7363 EVT VT = BV.getValueType();
7364 if (VT != MVT::v8f16)
7365 return SDValue();
7366
7367 // We are looking for a buildvector of fptrunc elements, where the elements
7368 // are extracted alternately from two sources. Check that the first two items
7369 // are plausible and extract some info from them (they are checked properly
7370 // in the loop below).
7371 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7372 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7373 BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
7374 return SDValue();
7375 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7376 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7377 BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
7378 return SDValue();
7379 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7380 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7381 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7382 return SDValue();
7383
7384 // Check all the values in the BuildVector line up with our expectations.
7385 for (unsigned i = 1; i < 4; i++) {
7386 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7387 return Trunc.getOpcode() == ISD::FP_ROUND &&
7388 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7389 Trunc.getOperand(0).getOperand(0) == Op &&
7390 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7391 };
7392 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7393 return SDValue();
7394 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7395 return SDValue();
7396 }
7397
7398 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7399 DAG.getConstant(0, dl, MVT::i32));
7400 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7401 DAG.getConstant(1, dl, MVT::i32));
7402}
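
// Note (illustration): the two VCVTN nodes built above interleave the narrowed
// lanes of the two v4f32 sources into one v8f16 result,
//   result = { x0, y0, x1, y1, x2, y2, x3, y3 },
// matching the BUILD_VECTOR of interleaved FP_ROUNDs being replaced.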
7403
7404// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7405// from a single input on alternating lanes. For example:
7406// BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0),
7407// FP_EXTEND(EXTRACT_ELT(X, 2),
7408// FP_EXTEND(EXTRACT_ELT(X, 4), ...)
7409static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
7410 const ARMSubtarget *ST) {
7411 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7412 if (!ST->hasMVEFloatOps())
7413 return SDValue();
7414
7415 SDLoc dl(BV);
7416 EVT VT = BV.getValueType();
7417 if (VT != MVT::v4f32)
7418 return SDValue();
7419
7420 // We are looking for a buildvector of fpext elements, where all the
7421 // elements are alternating lanes from a single source, for example <0,2,4,6>
7422 // or <1,3,5,7>. Check that the first two items are plausible and extract some
7423 // info from them (they are checked properly in the loop below).
7424 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7425 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7426 return SDValue();
7427 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7428 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
7429 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7430 return SDValue();
7431
7432 // Check all the values in the BuildVector line up with our expectations.
7433 for (unsigned i = 1; i < 4; i++) {
7434 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7435 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7436 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7437 Trunc.getOperand(0).getOperand(0) == Op &&
7438 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7439 };
7440 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7441 return SDValue();
7442 }
7443
7444 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7445 DAG.getConstant(Offset, dl, MVT::i32));
7446}
7447
7448// If N is an integer constant that can be moved into a register in one
7449// instruction, return an SDValue of such a constant (will become a MOV
7450// instruction). Otherwise return null.
7451static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
7452 const ARMSubtarget *ST, const SDLoc &dl) {
7453 uint64_t Val;
7454 if (!isa<ConstantSDNode>(N))
7455 return SDValue();
7456 Val = N->getAsZExtVal();
7457
7458 if (ST->isThumb1Only()) {
7459 if (Val <= 255 || ~Val <= 255)
7460 return DAG.getConstant(Val, dl, MVT::i32);
7461 } else {
7462 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7463 return DAG.getConstant(Val, dl, MVT::i32);
7464 }
7465 return SDValue();
7466}
7467
7468static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
7469 const ARMSubtarget *ST) {
7470 SDLoc dl(Op);
7471 EVT VT = Op.getValueType();
7472
7473 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7474
7475 unsigned NumElts = VT.getVectorNumElements();
7476 unsigned BoolMask;
7477 unsigned BitsPerBool;
7478 if (NumElts == 2) {
7479 BitsPerBool = 8;
7480 BoolMask = 0xff;
7481 } else if (NumElts == 4) {
7482 BitsPerBool = 4;
7483 BoolMask = 0xf;
7484 } else if (NumElts == 8) {
7485 BitsPerBool = 2;
7486 BoolMask = 0x3;
7487 } else if (NumElts == 16) {
7488 BitsPerBool = 1;
7489 BoolMask = 0x1;
7490 } else
7491 return SDValue();
7492
7493 // If this is a single value copied into all lanes (a splat), we can just sign
7494 // extend that single value
7495 SDValue FirstOp = Op.getOperand(0);
7496 if (!isa<ConstantSDNode>(FirstOp) &&
7497 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7498 return U.get().isUndef() || U.get() == FirstOp;
7499 })) {
7500 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7501 DAG.getValueType(MVT::i1));
7502 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7503 }
7504
7505 // First create base with bits set where known
7506 unsigned Bits32 = 0;
7507 for (unsigned i = 0; i < NumElts; ++i) {
7508 SDValue V = Op.getOperand(i);
7509 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7510 continue;
7511 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7512 if (BitSet)
7513 Bits32 |= BoolMask << (i * BitsPerBool);
7514 }
7515
7516 // Add in unknown nodes
7517 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7518 DAG.getConstant(Bits32, dl, MVT::i32));
7519 for (unsigned i = 0; i < NumElts; ++i) {
7520 SDValue V = Op.getOperand(i);
7521 if (isa<ConstantSDNode>(V) || V.isUndef())
7522 continue;
7523 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7524 DAG.getConstant(i, dl, MVT::i32));
7525 }
7526
7527 return Base;
7528}
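
// Illustration (standalone sketch): how the known lanes of a v4i1 constant are
// packed into the predicate above, with BitsPerBool = 4 and BoolMask = 0xf.
#include <cstdint>
static uint32_t packV4I1KnownLanes(const bool (&Lane)[4]) {
  uint32_t Bits32 = 0;
  for (unsigned I = 0; I < 4; ++I)
    if (Lane[I])
      Bits32 |= 0xfu << (I * 4);   // each boolean lane occupies a 4-bit group
  return Bits32;                   // e.g. {1,0,1,1} -> 0xff0f
}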
7529
7530static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
7531 const ARMSubtarget *ST) {
7532 if (!ST->hasMVEIntegerOps())
7533 return SDValue();
7534
7535 // We are looking for a buildvector where each element is Op[0] + i*N
7536 EVT VT = Op.getValueType();
7537 SDValue Op0 = Op.getOperand(0);
7538 unsigned NumElts = VT.getVectorNumElements();
7539
7540 // Get the increment value from operand 1
7541 SDValue Op1 = Op.getOperand(1);
7542 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7543 !isa<ConstantSDNode>(Op1.getOperand(1)))
7544 return SDValue();
7545 unsigned N = Op1.getConstantOperandVal(1);
7546 if (N != 1 && N != 2 && N != 4 && N != 8)
7547 return SDValue();
7548
7549 // Check that each other operand matches
7550 for (unsigned I = 2; I < NumElts; I++) {
7551 SDValue OpI = Op.getOperand(I);
7552 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7553 !isa<ConstantSDNode>(OpI.getOperand(1)) ||
7554 OpI.getConstantOperandVal(1) != I * N)
7555 return SDValue();
7556 }
7557
7558 SDLoc DL(Op);
7559 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7560 DAG.getConstant(N, DL, MVT::i32));
7561}
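
// Illustration (standalone sketch): the lane pattern matched above, i.e. what
// a VIDUP with base Op0 and step N produces for an 8-lane vector.
#include <cstdint>
static void vidupLanes(uint16_t Base, unsigned Step /* 1, 2, 4 or 8 */,
                       uint16_t (&Out)[8]) {
  for (unsigned I = 0; I < 8; ++I)
    Out[I] = Base + I * Step;      // lane I = Op[0] + I * N
}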
7562
7563// Returns true if the operation N can be treated as a qr instruction variant
7564// at operand Op.
7565static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7566 switch (N->getOpcode()) {
7567 case ISD::ADD:
7568 case ISD::MUL:
7569 case ISD::SADDSAT:
7570 case ISD::UADDSAT:
7571 case ISD::AVGFLOORS:
7572 case ISD::AVGFLOORU:
7573 return true;
7574 case ISD::SUB:
7575 case ISD::SSUBSAT:
7576 case ISD::USUBSAT:
7577 return N->getOperand(1).getNode() == Op;
7578 case ISD::INTRINSIC_WO_CHAIN:
7579 switch (N->getConstantOperandVal(0)) {
7580 case Intrinsic::arm_mve_add_predicated:
7581 case Intrinsic::arm_mve_mul_predicated:
7582 case Intrinsic::arm_mve_qadd_predicated:
7583 case Intrinsic::arm_mve_vhadd:
7584 case Intrinsic::arm_mve_hadd_predicated:
7585 case Intrinsic::arm_mve_vqdmulh:
7586 case Intrinsic::arm_mve_qdmulh_predicated:
7587 case Intrinsic::arm_mve_vqrdmulh:
7588 case Intrinsic::arm_mve_qrdmulh_predicated:
7589 case Intrinsic::arm_mve_vqdmull:
7590 case Intrinsic::arm_mve_vqdmull_predicated:
7591 return true;
7592 case Intrinsic::arm_mve_sub_predicated:
7593 case Intrinsic::arm_mve_qsub_predicated:
7594 case Intrinsic::arm_mve_vhsub:
7595 case Intrinsic::arm_mve_hsub_predicated:
7596 return N->getOperand(2).getNode() == Op;
7597 default:
7598 return false;
7599 }
7600 default:
7601 return false;
7602 }
7603}
7604
7605// If this is a case we can't handle, return null and let the default
7606// expansion code take care of it.
7607SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7608 const ARMSubtarget *ST) const {
7609 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7610 SDLoc dl(Op);
7611 EVT VT = Op.getValueType();
7612
7613 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7614 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7615
7616 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7617 return R;
7618
7619 APInt SplatBits, SplatUndef;
7620 unsigned SplatBitSize;
7621 bool HasAnyUndefs;
7622 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7623 if (SplatUndef.isAllOnes())
7624 return DAG.getUNDEF(VT);
7625
7626 // If all the users of this constant splat are qr instruction variants,
7627 // generate a vdup of the constant.
7628 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7629 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7630 all_of(BVN->users(),
7631 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7632 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7633 : SplatBitSize == 16 ? MVT::v8i16
7634 : MVT::v16i8;
7635 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7636 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7637 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7638 }
7639
7640 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7641 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7642 // Check if an immediate VMOV works.
7643 EVT VmovVT;
7644 SDValue Val =
7645 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7646 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7647
7648 if (Val.getNode()) {
7649 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7650 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7651 }
7652
7653 // Try an immediate VMVN.
7654 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7655 Val = isVMOVModifiedImm(
7656 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7657 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7658 if (Val.getNode()) {
7659 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7660 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7661 }
7662
7663 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7664 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7665 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7666 if (ImmVal != -1) {
7667 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7668 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7669 }
7670 }
7671
7672 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7673 // type.
7674 if (ST->hasMVEIntegerOps() &&
7675 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7676 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7677 : SplatBitSize == 16 ? MVT::v8i16
7678 : MVT::v16i8;
7679 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7680 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7681 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7682 }
7683 }
7684 }
7685
7686 // Scan through the operands to see if only one value is used.
7687 //
7688 // As an optimisation, even if more than one value is used it may be more
7689 // profitable to splat with one value then change some lanes.
7690 //
7691 // Heuristically we decide to do this if the vector has a "dominant" value,
7692 // defined as splatted to more than half of the lanes.
7693 unsigned NumElts = VT.getVectorNumElements();
7694 bool isOnlyLowElement = true;
7695 bool usesOnlyOneValue = true;
7696 bool hasDominantValue = false;
7697 bool isConstant = true;
7698
7699 // Map of the number of times a particular SDValue appears in the
7700 // element list.
7701 DenseMap<SDValue, unsigned> ValueCounts;
7702 SDValue Value;
7703 for (unsigned i = 0; i < NumElts; ++i) {
7704 SDValue V = Op.getOperand(i);
7705 if (V.isUndef())
7706 continue;
7707 if (i > 0)
7708 isOnlyLowElement = false;
7709 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
7710 isConstant = false;
7711
7712 unsigned &Count = ValueCounts[V];
7713
7714 // Is this value dominant? (takes up more than half of the lanes)
7715 if (++Count > (NumElts / 2)) {
7716 hasDominantValue = true;
7717 Value = V;
7718 }
7719 }
7720 if (ValueCounts.size() != 1)
7721 usesOnlyOneValue = false;
7722 if (!Value.getNode() && !ValueCounts.empty())
7723 Value = ValueCounts.begin()->first;
7724
7725 if (ValueCounts.empty())
7726 return DAG.getUNDEF(VT);
7727
7728 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7729 // Keep going if we are hitting this case.
7730 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
7731 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7732
7733 unsigned EltSize = VT.getScalarSizeInBits();
7734
7735 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7736 // i32 and try again.
7737 if (hasDominantValue && EltSize <= 32) {
7738 if (!isConstant) {
7739 SDValue N;
7740
7741 // If we are VDUPing a value that comes directly from a vector, that will
7742 // cause an unnecessary move to and from a GPR, where instead we could
7743 // just use VDUPLANE. We can only do this if the lane being extracted
7744 // is at a constant index, as the VDUP from lane instructions only have
7745 // constant-index forms.
7746 ConstantSDNode *constIndex;
7747 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7748 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7749 // We need to create a new undef vector to use for the VDUPLANE if the
7750 // size of the vector from which we get the value is different than the
7751 // size of the vector that we need to create. We will insert the element
7752 // such that the register coalescer will remove unnecessary copies.
7753 if (VT != Value->getOperand(0).getValueType()) {
7754 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7755 VT.getVectorNumElements();
7756 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7757 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7758 Value, DAG.getConstant(index, dl, MVT::i32)),
7759 DAG.getConstant(index, dl, MVT::i32));
7760 } else
7761 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7762 Value->getOperand(0), Value->getOperand(1));
7763 } else
7764 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7765
7766 if (!usesOnlyOneValue) {
7767 // The dominant value was splatted as 'N', but we now have to insert
7768 // all differing elements.
7769 for (unsigned I = 0; I < NumElts; ++I) {
7770 if (Op.getOperand(I) == Value)
7771 continue;
7772 SmallVector<SDValue, 3> Ops;
7773 Ops.push_back(N);
7774 Ops.push_back(Op.getOperand(I));
7775 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7776 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7777 }
7778 }
7779 return N;
7780 }
7781 if (VT.getVectorElementType().isFloatingPoint()) {
7782 SmallVector<SDValue, 8> Ops;
7783 MVT FVT = VT.getVectorElementType().getSimpleVT();
7784 assert(FVT == MVT::f32 || FVT == MVT::f16);
7785 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
7786 for (unsigned i = 0; i < NumElts; ++i)
7787 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
7788 Op.getOperand(i)));
7789 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
7790 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
7791 Val = LowerBUILD_VECTOR(Val, DAG, ST);
7792 if (Val.getNode())
7793 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7794 }
7795 if (usesOnlyOneValue) {
7796 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
7797 if (isConstant && Val.getNode())
7798 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
7799 }
7800 }
7801
7802 // If all elements are constants and the case above didn't get hit, fall back
7803 // to the default expansion, which will generate a load from the constant
7804 // pool.
7805 if (isConstant)
7806 return SDValue();
7807
7808 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
7809 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
7810 // length <= 2.
7811 if (NumElts >= 4)
7812 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
7813 return shuffle;
7814
7815 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
7816 // VCVT's
7817 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
7818 return VCVT;
7819 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
7820 return VCVT;
7821
7822 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
7823 // If we haven't found an efficient lowering, try splitting a 128-bit vector
7824 // into two 64-bit vectors; we might discover a better way to lower it.
7825 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
7826 EVT ExtVT = VT.getVectorElementType();
7827 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
7828 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
7829 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
7830 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
7831 SDValue Upper =
7832 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
7833 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
7834 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
7835 if (Lower && Upper)
7836 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
7837 }
7838
7839 // Vectors with 32- or 64-bit elements can be built by directly assigning
7840 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
7841 // will be legalized.
7842 if (EltSize >= 32) {
7843 // Do the expansion with floating-point types, since that is what the VFP
7844 // registers are defined to use, and since i64 is not legal.
7845 EVT EltVT = EVT::getFloatingPointVT(EltSize);
7846 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
7847 SmallVector<SDValue, 8> Ops;
7848 for (unsigned i = 0; i < NumElts; ++i)
7849 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
7850 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
7851 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7852 }
7853
7854 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
7855 // know the default expansion would otherwise fall back on something even
7856 // worse. For a vector with one or two non-undef values, that default is
7857 // scalar_to_vector for the elements followed by a shuffle (provided the
7858 // shuffle is valid for the target); for everything else, it is
7859 // materialization element by element on the stack followed by a load.
7860 if (!isConstant && !usesOnlyOneValue) {
7861 SDValue Vec = DAG.getUNDEF(VT);
7862 for (unsigned i = 0 ; i < NumElts; ++i) {
7863 SDValue V = Op.getOperand(i);
7864 if (V.isUndef())
7865 continue;
7866 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
7867 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
7868 }
7869 return Vec;
7870 }
7871
7872 return SDValue();
7873}
7874
7875// Gather data to see if the operation can be modelled as a
7876// shuffle in combination with VEXTs.
7877SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
7878 SelectionDAG &DAG) const {
7879 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7880 SDLoc dl(Op);
7881 EVT VT = Op.getValueType();
7882 unsigned NumElts = VT.getVectorNumElements();
7883
7884 struct ShuffleSourceInfo {
7885 SDValue Vec;
7886 unsigned MinElt = std::numeric_limits<unsigned>::max();
7887 unsigned MaxElt = 0;
7888
7889 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
7890 // be compatible with the shuffle we intend to construct. As a result
7891 // ShuffleVec will be some sliding window into the original Vec.
7892 SDValue ShuffleVec;
7893
7894 // Code should guarantee that element i in Vec starts at element "WindowBase
7895 // + i * WindowScale" in ShuffleVec.
7896 int WindowBase = 0;
7897 int WindowScale = 1;
7898
7899 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
7900
7901 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
7902 };
7903
7904 // First gather all vectors used as an immediate source for this BUILD_VECTOR
7905 // node.
7906 SmallVector<ShuffleSourceInfo, 2> Sources;
7907 for (unsigned i = 0; i < NumElts; ++i) {
7908 SDValue V = Op.getOperand(i);
7909 if (V.isUndef())
7910 continue;
7911 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
7912 // A shuffle can only come from building a vector from various
7913 // elements of other vectors.
7914 return SDValue();
7915 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
7916 // Furthermore, shuffles require a constant mask, whereas extractelts
7917 // accept variable indices.
7918 return SDValue();
7919 }
7920
7921 // Add this element source to the list if it's not already there.
7922 SDValue SourceVec = V.getOperand(0);
7923 auto Source = llvm::find(Sources, SourceVec);
7924 if (Source == Sources.end())
7925 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
7926
7927 // Update the minimum and maximum lane number seen.
7928 unsigned EltNo = V.getConstantOperandVal(1);
7929 Source->MinElt = std::min(Source->MinElt, EltNo);
7930 Source->MaxElt = std::max(Source->MaxElt, EltNo);
7931 }
7932
7933 // Currently only do something sane when at most two source vectors
7934 // are involved.
7935 if (Sources.size() > 2)
7936 return SDValue();
7937
7938 // Find out the smallest element size among result and two sources, and use
7939 // it as element size to build the shuffle_vector.
7940 EVT SmallestEltTy = VT.getVectorElementType();
7941 for (auto &Source : Sources) {
7942 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
7943 if (SrcEltTy.bitsLT(SmallestEltTy))
7944 SmallestEltTy = SrcEltTy;
7945 }
7946 unsigned ResMultiplier =
7947 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
7948 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
7949 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
7950
7951 // If the source vector is too wide or too narrow, we may nevertheless be able
7952 // to construct a compatible shuffle either by concatenating it with UNDEF or
7953 // extracting a suitable range of elements.
7954 for (auto &Src : Sources) {
7955 EVT SrcVT = Src.ShuffleVec.getValueType();
7956
7957 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
7958 uint64_t VTSize = VT.getFixedSizeInBits();
7959 if (SrcVTSize == VTSize)
7960 continue;
7961
7962 // This stage of the search produces a source with the same element type as
7963 // the original, but with a total width matching the BUILD_VECTOR output.
7964 EVT EltVT = SrcVT.getVectorElementType();
7965 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
7966 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
7967
7968 if (SrcVTSize < VTSize) {
7969 if (2 * SrcVTSize != VTSize)
7970 return SDValue();
7971 // We can pad out the smaller vector for free, so if it's part of a
7972 // shuffle...
7973 Src.ShuffleVec =
7974 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
7975 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
7976 continue;
7977 }
7978
7979 if (SrcVTSize != 2 * VTSize)
7980 return SDValue();
7981
7982 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
7983 // Span too large for a VEXT to cope
7984 return SDValue();
7985 }
7986
7987 if (Src.MinElt >= NumSrcElts) {
7988 // The extraction can just take the second half
7989 Src.ShuffleVec =
7990 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7991 DAG.getConstant(NumSrcElts, dl, MVT::i32));
7992 Src.WindowBase = -NumSrcElts;
7993 } else if (Src.MaxElt < NumSrcElts) {
7994 // The extraction can just take the first half
7995 Src.ShuffleVec =
7996 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7997 DAG.getConstant(0, dl, MVT::i32));
7998 } else {
7999 // An actual VEXT is needed
8000 SDValue VEXTSrc1 =
8001 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8002 DAG.getConstant(0, dl, MVT::i32));
8003 SDValue VEXTSrc2 =
8004 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8005 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8006
8007 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8008 VEXTSrc2,
8009 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8010 Src.WindowBase = -Src.MinElt;
8011 }
8012 }
8013
8014 // Another possible incompatibility occurs from the vector element types. We
8015 // can fix this by bitcasting the source vectors to the same type we intend
8016 // for the shuffle.
8017 for (auto &Src : Sources) {
8018 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8019 if (SrcEltTy == SmallestEltTy)
8020 continue;
8021 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8022 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
8023 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8024 Src.WindowBase *= Src.WindowScale;
8025 }
8026
8027 // Final check before we try to actually produce a shuffle.
8028 LLVM_DEBUG({
8029 for (auto Src : Sources)
8030 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
8031 });
8032
8033 // The stars all align, our next step is to produce the mask for the shuffle.
8034 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8035 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8036 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8037 SDValue Entry = Op.getOperand(i);
8038 if (Entry.isUndef())
8039 continue;
8040
8041 auto Src = llvm::find(Sources, Entry.getOperand(0));
8042 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8043
8044 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8045 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8046 // segment.
8047 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8048 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8049 VT.getScalarSizeInBits());
8050 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8051
8052 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8053 // starting at the appropriate offset.
8054 int *LaneMask = &Mask[i * ResMultiplier];
8055
8056 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8057 ExtractBase += NumElts * (Src - Sources.begin());
8058 for (int j = 0; j < LanesDefined; ++j)
8059 LaneMask[j] = ExtractBase + j;
8060 }
8061
8062
8063 // We can't handle more than two sources. This should have already
8064 // been checked before this point.
8065 assert(Sources.size() <= 2 && "Too many sources!");
8066
8067 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8068 for (unsigned i = 0; i < Sources.size(); ++i)
8069 ShuffleOps[i] = Sources[i].ShuffleVec;
8070
8071 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8072 ShuffleOps[1], Mask, DAG);
8073 if (!Shuffle)
8074 return SDValue();
8075 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8076}
8077
8078 enum ShuffleOpCodes {
8079 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8080 OP_VREV,
8081 OP_VDUP0,
8082 OP_VDUP1,
8083 OP_VDUP2,
8084 OP_VDUP3,
8085 OP_VEXT1,
8086 OP_VEXT2,
8087 OP_VEXT3,
8088 OP_VUZPL, // VUZP, left result
8089 OP_VUZPR, // VUZP, right result
8090 OP_VZIPL, // VZIP, left result
8091 OP_VZIPR, // VZIP, right result
8092 OP_VTRNL, // VTRN, left result
8093 OP_VTRNR // VTRN, right result
8094};
8095
8096static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8097 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8098 switch (OpNum) {
8099 case OP_COPY:
8100 case OP_VREV:
8101 case OP_VDUP0:
8102 case OP_VDUP1:
8103 case OP_VDUP2:
8104 case OP_VDUP3:
8105 return true;
8106 }
8107 return false;
8108}
8109
8110/// isShuffleMaskLegal - Targets can use this to indicate that they only
8111/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8112/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8113/// are assumed to be legal.
8114 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
8115 if (VT.getVectorNumElements() == 4 &&
8116 (VT.is128BitVector() || VT.is64BitVector())) {
8117 unsigned PFIndexes[4];
8118 for (unsigned i = 0; i != 4; ++i) {
8119 if (M[i] < 0)
8120 PFIndexes[i] = 8;
8121 else
8122 PFIndexes[i] = M[i];
8123 }
8124
8125 // Compute the index in the perfect shuffle table.
8126 unsigned PFTableIndex =
8127 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8128 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8129 unsigned Cost = (PFEntry >> 30);
8130
8131 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8132 return true;
8133 }
8134
8135 bool ReverseVEXT, isV_UNDEF;
8136 unsigned Imm, WhichResult;
8137
8138 unsigned EltSize = VT.getScalarSizeInBits();
8139 if (EltSize >= 32 ||
8141 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8142 isVREVMask(M, VT, 64) ||
8143 isVREVMask(M, VT, 32) ||
8144 isVREVMask(M, VT, 16))
8145 return true;
8146 else if (Subtarget->hasNEON() &&
8147 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8148 isVTBLMask(M, VT) ||
8149 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8150 return true;
8151 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8152 isReverseMask(M, VT))
8153 return true;
8154 else if (Subtarget->hasMVEIntegerOps() &&
8155 (isVMOVNMask(M, VT, true, false) ||
8156 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8157 return true;
8158 else if (Subtarget->hasMVEIntegerOps() &&
8159 (isTruncMask(M, VT, false, false) ||
8160 isTruncMask(M, VT, false, true) ||
8161 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8162 return true;
8163 else
8164 return false;
8165}
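// Worked example of the perfect-shuffle lookup above (illustrative): for the
// 4-lane mask <0,4,1,5>, with undef lanes mapping to 8, the table index is
// 0*9*9*9 + 4*9*9 + 1*9 + 5 = 338, and the top two bits of
// PerfectShuffleTable[338] give the cost that is compared against 4.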
8166
8167/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8168/// the specified operations to build the shuffle.
8169 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
8170 SDValue RHS, SelectionDAG &DAG,
8171 const SDLoc &dl) {
8172 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8173 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8174 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8175
8176 if (OpNum == OP_COPY) {
8177 if (LHSID == (1*9+2)*9+3) return LHS;
8178 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8179 return RHS;
8180 }
8181
8182 SDValue OpLHS, OpRHS;
8183 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8184 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8185 EVT VT = OpLHS.getValueType();
8186
8187 switch (OpNum) {
8188 default: llvm_unreachable("Unknown shuffle opcode!");
8189 case OP_VREV:
8190 // VREV divides the vector in half and swaps within the half.
8191 if (VT.getScalarSizeInBits() == 32)
8192 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8193 // vrev <4 x i16> -> VREV32
8194 if (VT.getScalarSizeInBits() == 16)
8195 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8196 // vrev <4 x i8> -> VREV16
8197 assert(VT.getScalarSizeInBits() == 8);
8198 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8199 case OP_VDUP0:
8200 case OP_VDUP1:
8201 case OP_VDUP2:
8202 case OP_VDUP3:
8203 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8204 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8205 case OP_VEXT1:
8206 case OP_VEXT2:
8207 case OP_VEXT3:
8208 return DAG.getNode(ARMISD::VEXT, dl, VT,
8209 OpLHS, OpRHS,
8210 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8211 case OP_VUZPL:
8212 case OP_VUZPR:
8213 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8214 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8215 case OP_VZIPL:
8216 case OP_VZIPR:
8217 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8218 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8219 case OP_VTRNL:
8220 case OP_VTRNR:
8221 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8222 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8223 }
8224}
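// Note on the encoding decoded above (summarised from the shifts/masks): a
// perfect-shuffle entry packs the cost into bits [31:30], the opcode into
// bits [29:26], and the two 13-bit operand IDs into bits [25:13] and [12:0].
// Each operand ID encodes a 4-lane mask as base-9 digits (8 meaning undef),
// e.g. 102 = ((0*9+1)*9+2)*9+3 is <0,1,2,3> (the LHS copy) and 3382 is
// <4,5,6,7> (the RHS copy), matching the OP_COPY checks at the top.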
8225
8226 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
8227 ArrayRef<int> ShuffleMask,
8228 SelectionDAG &DAG) {
8229 // Check to see if we can use the VTBL instruction.
8230 SDValue V1 = Op.getOperand(0);
8231 SDValue V2 = Op.getOperand(1);
8232 SDLoc DL(Op);
8233
8234 SmallVector<SDValue, 8> VTBLMask;
8235 for (int I : ShuffleMask)
8236 VTBLMask.push_back(DAG.getSignedConstant(I, DL, MVT::i32));
8237
8238 if (V2.getNode()->isUndef())
8239 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8240 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8241
8242 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8243 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8244}
8245
8246 static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
8247 SDLoc DL(Op);
8248 EVT VT = Op.getValueType();
8249
8250 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8251 "Expected a v8i16/v8f16/v16i8 type");
8252 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8253 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8254 // extract the first 8 bytes into the top double word and the last 8 bytes
8255 // into the bottom double word, through a new vector shuffle that will be
8256 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
8257 std::vector<int> NewMask;
8258 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8259 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8260 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8261 NewMask.push_back(i);
8262 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8263}
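// Illustration for v16i8: the VREV64 above yields <7,...,0,15,...,8>, and the
// follow-up mask <8,...,15,0,...,7> swaps the two doublewords, so the final
// result is the fully reversed vector <15,...,0>.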
8264
8265 static EVT getVectorTyFromPredicateVector(EVT VT) {
8266 switch (VT.getSimpleVT().SimpleTy) {
8267 case MVT::v2i1:
8268 return MVT::v2f64;
8269 case MVT::v4i1:
8270 return MVT::v4i32;
8271 case MVT::v8i1:
8272 return MVT::v8i16;
8273 case MVT::v16i1:
8274 return MVT::v16i8;
8275 default:
8276 llvm_unreachable("Unexpected vector predicate type");
8277 }
8278}
8279
8280 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
8281 SelectionDAG &DAG) {
8282 // Converting from boolean predicates to integers involves creating a vector
8283 // of all ones or all zeroes and selecting the lanes based upon the real
8284 // predicate.
8285 SDValue AllOnes =
8286 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8287 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8288
8289 SDValue AllZeroes =
8290 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8291 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8292
8293 // Get full vector type from predicate type
8294 EVT NewVT = getVectorTyFromPredicateVector(VT);
8295
8296 SDValue RecastV1;
8297 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
8298 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8299 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8300 // since we know in hardware the sizes are really the same.
8301 if (VT != MVT::v16i1)
8302 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8303 else
8304 RecastV1 = Pred;
8305
8306 // Select either all ones or zeroes depending upon the real predicate bits.
8307 SDValue PredAsVector =
8308 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8309
8310 // Recast our new predicate-as-integer v16i8 vector into something
8311 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8312 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8313}
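// Example (illustrative): a v4i1 predicate <1,0,1,0> occupies four bits per
// lane of the 16-bit predicate value, so after the PREDICATE_CAST to v16i1
// the VSELECT produces a v16i8 of 0xff/0x00 bytes, and the final bitcast
// gives a v4i32 whose lanes are 0xffffffff, 0, 0xffffffff, 0, a form that
// ordinary vector shuffles can operate on.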
8314
8315 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
8316 const ARMSubtarget *ST) {
8317 EVT VT = Op.getValueType();
8318 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8319 ArrayRef<int> ShuffleMask = SVN->getMask();
8320
8321 assert(ST->hasMVEIntegerOps() &&
8322 "No support for vector shuffle of boolean predicates");
8323
8324 SDValue V1 = Op.getOperand(0);
8325 SDValue V2 = Op.getOperand(1);
8326 SDLoc dl(Op);
8327 if (isReverseMask(ShuffleMask, VT)) {
8328 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8329 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8330 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8331 DAG.getConstant(16, dl, MVT::i32));
8332 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8333 }
8334
8335 // Until we can come up with optimised cases for every single vector
8336 // shuffle in existence we have chosen the least painful strategy. This is
8337 // to essentially promote the boolean predicate to an 8-bit integer, where
8338 // each predicate represents a byte. Then we fall back on a normal integer
8339 // vector shuffle and convert the result back into a predicate vector. In
8340 // many cases the generated code might be even better than scalar code
8341 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8342 // fields in a register into 8 other arbitrary 2-bit fields!
8343 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8344 EVT NewVT = PredAsVector1.getValueType();
8345 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8346 : PromoteMVEPredVector(dl, V2, VT, DAG);
8347 assert(PredAsVector2.getValueType() == NewVT &&
8348 "Expected identical vector type in expanded i1 shuffle!");
8349
8350 // Do the shuffle!
8351 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8352 PredAsVector2, ShuffleMask);
8353
8354 // Now return the result of comparing the shuffled vector with zero,
8355 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8356 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8357 if (VT == MVT::v2i1) {
8358 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8359 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8360 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8361 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8362 }
8363 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8364 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8365}
8366
8367 static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
8368 ArrayRef<int> ShuffleMask,
8369 SelectionDAG &DAG) {
8370 // Attempt to lower the vector shuffle using as many whole register movs as
8371 // possible. This is useful for types smaller than 32 bits, which would
8372 // often otherwise become a series of GPR movs.
8373 SDLoc dl(Op);
8374 EVT VT = Op.getValueType();
8375 if (VT.getScalarSizeInBits() >= 32)
8376 return SDValue();
8377
8378 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8379 "Unexpected vector type");
8380 int NumElts = VT.getVectorNumElements();
8381 int QuarterSize = NumElts / 4;
8382 // The four final parts of the vector, as i32's
8383 SDValue Parts[4];
8384
8385 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8386 // <u,u,u,u>), returning the vmov lane index
8387 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8388 // Detect which mov lane this would be from the first non-undef element.
8389 int MovIdx = -1;
8390 for (int i = 0; i < Length; i++) {
8391 if (ShuffleMask[Start + i] >= 0) {
8392 if (ShuffleMask[Start + i] % Length != i)
8393 return -1;
8394 MovIdx = ShuffleMask[Start + i] / Length;
8395 break;
8396 }
8397 }
8398 // If all items are undef, leave this for other combines
8399 if (MovIdx == -1)
8400 return -1;
8401 // Check the remaining values are the correct part of the same mov
8402 for (int i = 1; i < Length; i++) {
8403 if (ShuffleMask[Start + i] >= 0 &&
8404 (ShuffleMask[Start + i] / Length != MovIdx ||
8405 ShuffleMask[Start + i] % Length != i))
8406 return -1;
8407 }
8408 return MovIdx;
8409 };
8410
8411 for (int Part = 0; Part < 4; ++Part) {
8412 // Does this part look like a mov
8413 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8414 if (Elt != -1) {
8415 SDValue Input = Op->getOperand(0);
8416 if (Elt >= 4) {
8417 Input = Op->getOperand(1);
8418 Elt -= 4;
8419 }
8420 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8421 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8422 DAG.getConstant(Elt, dl, MVT::i32));
8423 }
8424 }
8425
8426 // Nothing interesting found, just return
8427 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8428 return SDValue();
8429
8430 // The other parts need to be built with the old shuffle vector, cast to a
8431 // v4i32 and extract_vector_elts
8432 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8433 SmallVector<int, 16> NewShuffleMask;
8434 for (int Part = 0; Part < 4; ++Part)
8435 for (int i = 0; i < QuarterSize; i++)
8436 NewShuffleMask.push_back(
8437 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8438 SDValue NewShuffle = DAG.getVectorShuffle(
8439 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8440 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8441
8442 for (int Part = 0; Part < 4; ++Part)
8443 if (!Parts[Part])
8444 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8445 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8446 }
8447 // Build a vector out of the various parts and bitcast it back to the original
8448 // type.
8449 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8450 return DAG.getBitcast(VT, NewVec);
8451}
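// Worked example (illustrative): for a v8i16 shuffle with mask
// <0,1,10,11,4,5,14,15>, every quarter of the mask is a whole 32-bit lane of
// one input (getMovIdx returns 0, 5, 2 and 7), so the result becomes a v4f32
// ARMISD::BUILD_VECTOR of lanes 0 and 2 of operand 0 and lanes 1 and 3 of
// operand 1, bitcast back to v8i16, i.e. four whole-register moves rather
// than eight element-by-element inserts.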
8452
8453 static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
8454 ArrayRef<int> ShuffleMask,
8455 SelectionDAG &DAG) {
8456 SDValue V1 = Op.getOperand(0);
8457 SDValue V2 = Op.getOperand(1);
8458 EVT VT = Op.getValueType();
8459 unsigned NumElts = VT.getVectorNumElements();
8460
8461 // A one-off identity mask is one that is mostly an identity mask from a
8462 // single source but contains a single element out-of-place, either from a
8463 // different vector or from another position in the same vector. As opposed to
8464 // lowering this via an ARMISD::BUILD_VECTOR, we can generate an extract/insert
8465 // pair directly.
8466 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8467 int &OffElement) {
8468 OffElement = -1;
8469 int NonUndef = 0;
8470 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8471 if (Mask[i] == -1)
8472 continue;
8473 NonUndef++;
8474 if (Mask[i] != i + BaseOffset) {
8475 if (OffElement == -1)
8476 OffElement = i;
8477 else
8478 return false;
8479 }
8480 }
8481 return NonUndef > 2 && OffElement != -1;
8482 };
8483 int OffElement;
8484 SDValue VInput;
8485 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8486 VInput = V1;
8487 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8488 VInput = V2;
8489 else
8490 return SDValue();
8491
8492 SDLoc dl(Op);
8493 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8494 ? MVT::i32
8495 : VT.getScalarType();
8496 SDValue Elt = DAG.getNode(
8497 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8498 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8499 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8500 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8501 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8502}
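// Example (illustrative): a v4i32 shuffle with mask <0,1,2,7> is an identity
// of V1 except for lane 3, so it is lowered as an EXTRACT_VECTOR_ELT of
// element 3 of V2 followed by an INSERT_VECTOR_ELT into lane 3 of V1, rather
// than building the whole vector element by element.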
8503
8504 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
8505 const ARMSubtarget *ST) {
8506 SDValue V1 = Op.getOperand(0);
8507 SDValue V2 = Op.getOperand(1);
8508 SDLoc dl(Op);
8509 EVT VT = Op.getValueType();
8510 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8511 unsigned EltSize = VT.getScalarSizeInBits();
8512
8513 if (ST->hasMVEIntegerOps() && EltSize == 1)
8514 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8515
8516 // Convert shuffles that are directly supported on NEON to target-specific
8517 // DAG nodes, instead of keeping them as shuffles and matching them again
8518 // during code selection. This is more efficient and avoids the possibility
8519 // of inconsistencies between legalization and selection.
8520 // FIXME: floating-point vectors should be canonicalized to integer vectors
8521 // of the same size so that they get CSEd properly.
8522 ArrayRef<int> ShuffleMask = SVN->getMask();
8523
8524 if (EltSize <= 32) {
8525 if (SVN->isSplat()) {
8526 int Lane = SVN->getSplatIndex();
8527 // If this is an undef splat, generate it with a plain VDUP, if possible.
8528 if (Lane == -1) Lane = 0;
8529
8530 // Test if V1 is a SCALAR_TO_VECTOR.
8531 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8532 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8533 }
8534 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8535 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8536 // reaches it).
8537 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8538 !isa<ConstantSDNode>(V1.getOperand(0))) {
8539 bool IsScalarToVector = true;
8540 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8541 if (!V1.getOperand(i).isUndef()) {
8542 IsScalarToVector = false;
8543 break;
8544 }
8545 if (IsScalarToVector)
8546 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8547 }
8548 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8549 DAG.getConstant(Lane, dl, MVT::i32));
8550 }
8551
8552 bool ReverseVEXT = false;
8553 unsigned Imm = 0;
8554 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8555 if (ReverseVEXT)
8556 std::swap(V1, V2);
8557 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8558 DAG.getConstant(Imm, dl, MVT::i32));
8559 }
8560
8561 if (isVREVMask(ShuffleMask, VT, 64))
8562 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8563 if (isVREVMask(ShuffleMask, VT, 32))
8564 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8565 if (isVREVMask(ShuffleMask, VT, 16))
8566 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8567
8568 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8569 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8570 DAG.getConstant(Imm, dl, MVT::i32));
8571 }
8572
8573 // Check for Neon shuffles that modify both input vectors in place.
8574 // If both results are used, i.e., if there are two shuffles with the same
8575 // source operands and with masks corresponding to both results of one of
8576 // these operations, DAG memoization will ensure that a single node is
8577 // used for both shuffles.
8578 unsigned WhichResult = 0;
8579 bool isV_UNDEF = false;
8580 if (ST->hasNEON()) {
8581 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8582 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8583 if (isV_UNDEF)
8584 V2 = V1;
8585 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8586 .getValue(WhichResult);
8587 }
8588 }
8589 if (ST->hasMVEIntegerOps()) {
8590 if (isVMOVNMask(ShuffleMask, VT, false, false))
8591 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8592 DAG.getConstant(0, dl, MVT::i32));
8593 if (isVMOVNMask(ShuffleMask, VT, true, false))
8594 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8595 DAG.getConstant(1, dl, MVT::i32));
8596 if (isVMOVNMask(ShuffleMask, VT, true, true))
8597 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8598 DAG.getConstant(1, dl, MVT::i32));
8599 }
8600
8601 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8602 // shuffles that produce a result larger than their operands with:
8603 // shuffle(concat(v1, undef), concat(v2, undef))
8604 // ->
8605 // shuffle(concat(v1, v2), undef)
8606 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8607 //
8608 // This is useful in the general case, but there are special cases where
8609 // native shuffles produce larger results: the two-result ops.
8610 //
8611 // Look through the concat when lowering them:
8612 // shuffle(concat(v1, v2), undef)
8613 // ->
8614 // concat(VZIP(v1, v2):0, :1)
8615 //
8616 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8617 SDValue SubV1 = V1->getOperand(0);
8618 SDValue SubV2 = V1->getOperand(1);
8619 EVT SubVT = SubV1.getValueType();
8620
8621 // We expect these to have been canonicalized to -1.
8622 assert(llvm::all_of(ShuffleMask, [&](int i) {
8623 return i < (int)VT.getVectorNumElements();
8624 }) && "Unexpected shuffle index into UNDEF operand!");
8625
8626 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8627 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8628 if (isV_UNDEF)
8629 SubV2 = SubV1;
8630 assert((WhichResult == 0) &&
8631 "In-place shuffle of concat can only have one result!");
8632 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8633 SubV1, SubV2);
8634 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8635 Res.getValue(1));
8636 }
8637 }
8638 }
8639
8640 if (ST->hasMVEIntegerOps() && EltSize <= 32) {
8641 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8642 return V;
8643
8644 for (bool Top : {false, true}) {
8645 for (bool SingleSource : {false, true}) {
8646 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8647 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8648 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8649 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8650 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8651 SingleSource ? V1 : V2);
8652 if (Top) {
8653 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8654 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8655 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8656 }
8657 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8658 }
8659 }
8660 }
8661 }
8662
8663 // If the shuffle is not directly supported and it has 4 elements, use
8664 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8665 unsigned NumElts = VT.getVectorNumElements();
8666 if (NumElts == 4) {
8667 unsigned PFIndexes[4];
8668 for (unsigned i = 0; i != 4; ++i) {
8669 if (ShuffleMask[i] < 0)
8670 PFIndexes[i] = 8;
8671 else
8672 PFIndexes[i] = ShuffleMask[i];
8673 }
8674
8675 // Compute the index in the perfect shuffle table.
8676 unsigned PFTableIndex =
8677 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8678 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8679 unsigned Cost = (PFEntry >> 30);
8680
8681 if (Cost <= 4) {
8682 if (ST->hasNEON())
8683 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8684 else if (isLegalMVEShuffleOp(PFEntry)) {
8685 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8686 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8687 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8688 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8689 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8690 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8691 }
8692 }
8693 }
8694
8695 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8696 if (EltSize >= 32) {
8697 // Do the expansion with floating-point types, since that is what the VFP
8698 // registers are defined to use, and since i64 is not legal.
8699 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8700 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8701 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8702 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
8703 SmallVector<SDValue, 8> Ops;
8704 for (unsigned i = 0; i < NumElts; ++i) {
8705 if (ShuffleMask[i] < 0)
8706 Ops.push_back(DAG.getUNDEF(EltVT));
8707 else
8708 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8709 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8710 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8711 dl, MVT::i32)));
8712 }
8713 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8714 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8715 }
8716
8717 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8718 isReverseMask(ShuffleMask, VT))
8719 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
8720
8721 if (ST->hasNEON() && VT == MVT::v8i8)
8722 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8723 return NewOp;
8724
8725 if (ST->hasMVEIntegerOps())
8726 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8727 return NewOp;
8728
8729 return SDValue();
8730}
8731
8732 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8733 const ARMSubtarget *ST) {
8734 EVT VecVT = Op.getOperand(0).getValueType();
8735 SDLoc dl(Op);
8736
8737 assert(ST->hasMVEIntegerOps() &&
8738 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8739
8740 SDValue Conv =
8741 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8742 unsigned Lane = Op.getConstantOperandVal(2);
8743 unsigned LaneWidth =
8744 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8745 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
8746 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8747 Op.getOperand(1), DAG.getValueType(MVT::i1));
8748 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8749 DAG.getConstant(~Mask, dl, MVT::i32));
8750 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8751}
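// Example (illustrative): for a v8i1 predicate each lane occupies two bits of
// the 16-bit predicate value, so inserting into lane 3 sign-extends the new
// boolean to i32 and BFIs it into bits [7:6] of the predicate word
// (Mask = 0b11 << 6) before casting back to v8i1.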
8752
8753SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8754 SelectionDAG &DAG) const {
8755 // INSERT_VECTOR_ELT is legal only for immediate indexes.
8756 SDValue Lane = Op.getOperand(2);
8757 if (!isa<ConstantSDNode>(Lane))
8758 return SDValue();
8759
8760 SDValue Elt = Op.getOperand(1);
8761 EVT EltVT = Elt.getValueType();
8762
8763 if (Subtarget->hasMVEIntegerOps() &&
8764 Op.getValueType().getScalarSizeInBits() == 1)
8765 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
8766
8767 if (getTypeAction(*DAG.getContext(), EltVT) ==
8768 TargetLowering::TypePromoteInteger) {
8769 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
8770 // but the type system will try to do that if we don't intervene.
8771 // Reinterpret any such vector-element insertion as one with the
8772 // corresponding integer types.
8773
8774 SDLoc dl(Op);
8775
8776 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8777 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8778 TargetLowering::TypePromoteInteger);
8779
8780 SDValue VecIn = Op.getOperand(0);
8781 EVT VecVT = VecIn.getValueType();
8782 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8783 VecVT.getVectorNumElements());
8784
8785 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
8786 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
8787 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
8788 IVecIn, IElt, Lane);
8789 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
8790 }
8791
8792 return Op;
8793}
8794
8795 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8796 const ARMSubtarget *ST) {
8797 EVT VecVT = Op.getOperand(0).getValueType();
8798 SDLoc dl(Op);
8799
8800 assert(ST->hasMVEIntegerOps() &&
8801 "LowerEXTRACT_VECTOR_ELT_i1 called without MVE!");
8802
8803 SDValue Conv =
8804 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8805 unsigned Lane = Op.getConstantOperandVal(1);
8806 unsigned LaneWidth =
8807 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8808 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
8809 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
8810 return Shift;
8811}
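// Example (illustrative): for a v4i1 predicate each lane occupies four bits
// of the predicate word, so extracting lane 2 becomes a right shift of the
// i32 predicate value by 8, leaving the lane's value in bit 0 of the result.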
8812
8813 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
8814 const ARMSubtarget *ST) {
8815 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
8816 SDValue Lane = Op.getOperand(1);
8817 if (!isa<ConstantSDNode>(Lane))
8818 return SDValue();
8819
8820 SDValue Vec = Op.getOperand(0);
8821 EVT VT = Vec.getValueType();
8822
8823 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8824 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
8825
8826 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
8827 SDLoc dl(Op);
8828 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
8829 }
8830
8831 return Op;
8832}
8833
8834 static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
8835 const ARMSubtarget *ST) {
8836 SDLoc dl(Op);
8837 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
8838 "Unexpected custom CONCAT_VECTORS lowering");
8839 assert(isPowerOf2_32(Op.getNumOperands()) &&
8840 "Unexpected custom CONCAT_VECTORS lowering");
8841 assert(ST->hasMVEIntegerOps() &&
8842 "CONCAT_VECTORS lowering only supported for MVE");
8843
8844 auto ConcatPair = [&](SDValue V1, SDValue V2) {
8845 EVT Op1VT = V1.getValueType();
8846 EVT Op2VT = V2.getValueType();
8847 assert(Op1VT == Op2VT && "Operand types don't match!");
8848 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
8849 "Unexpected i1 concat operations!");
8850 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
8851
8852 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8853 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
8854
8855 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
8856 // promoted to v8i16, etc.
8857 MVT ElType =
8858 getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
8859 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
8860
8861 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
8862 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
8863 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
8864 // ConcatVT.
8865 SDValue ConVec =
8866 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
8867 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8868 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8869 }
8870
8871 // Extract the vector elements from Op1 and Op2 one by one and truncate them
8872 // to be the right size for the destination. For example, if Op1 is v4i1
8873 // then the promoted vector is v4i32. The result of concatenation gives a
8874 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
8875 // needs truncating to i16 and inserting in the result.
8876 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
8877 EVT NewVT = NewV.getValueType();
8878 EVT ConcatVT = ConVec.getValueType();
8879 unsigned ExtScale = 1;
8880 if (NewVT == MVT::v2f64) {
8881 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
8882 ExtScale = 2;
8883 }
8884 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
8885 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
8886 DAG.getIntPtrConstant(i * ExtScale, dl));
8887 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
8888 DAG.getConstant(j, dl, MVT::i32));
8889 }
8890 return ConVec;
8891 };
8892 unsigned j = 0;
8893 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
8894 ConVec = ExtractInto(NewV1, ConVec, j);
8895 ConVec = ExtractInto(NewV2, ConVec, j);
8896
8897 // Now return the result of comparing the subvector with zero, which will
8898 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8899 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8900 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8901 };
8902
8903 // Concat each pair of subvectors and pack into the lower half of the array.
8904 SmallVector<SDValue> ConcatOps(Op->ops());
8905 while (ConcatOps.size() > 1) {
8906 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
8907 SDValue V1 = ConcatOps[I];
8908 SDValue V2 = ConcatOps[I + 1];
8909 ConcatOps[I / 2] = ConcatPair(V1, V2);
8910 }
8911 ConcatOps.resize(ConcatOps.size() / 2);
8912 }
8913 return ConcatOps[0];
8914}
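// Example (illustrative): concatenating four v4i1 operands proceeds pairwise,
// first producing two v8i1 values via ConcatPair and then combining those
// into the final v16i1, halving ConcatOps on each iteration of the loop
// above.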
8915
8916 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
8917 const ARMSubtarget *ST) {
8918 EVT VT = Op->getValueType(0);
8919 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8920 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
8921
8922 // The only time a CONCAT_VECTORS operation can have legal types is when
8923 // two 64-bit vectors are concatenated to a 128-bit vector.
8924 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
8925 "unexpected CONCAT_VECTORS");
8926 SDLoc dl(Op);
8927 SDValue Val = DAG.getUNDEF(MVT::v2f64);
8928 SDValue Op0 = Op.getOperand(0);
8929 SDValue Op1 = Op.getOperand(1);
8930 if (!Op0.isUndef())
8931 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8932 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
8933 DAG.getIntPtrConstant(0, dl));
8934 if (!Op1.isUndef())
8935 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8936 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
8937 DAG.getIntPtrConstant(1, dl));
8938 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
8939}
8940
8941 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
8942 const ARMSubtarget *ST) {
8943 SDValue V1 = Op.getOperand(0);
8944 SDValue V2 = Op.getOperand(1);
8945 SDLoc dl(Op);
8946 EVT VT = Op.getValueType();
8947 EVT Op1VT = V1.getValueType();
8948 unsigned NumElts = VT.getVectorNumElements();
8949 unsigned Index = V2->getAsZExtVal();
8950
8951 assert(VT.getScalarSizeInBits() == 1 &&
8952 "Unexpected custom EXTRACT_SUBVECTOR lowering");
8953 assert(ST->hasMVEIntegerOps() &&
8954 "EXTRACT_SUBVECTOR lowering only supported for MVE");
8955
8956 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8957
8958 // We now have Op1 promoted to a vector of integers, where v8i1 gets
8959 // promoted to v8i16, etc.
8960
8961 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
8962
8963 if (NumElts == 2) {
8964 EVT SubVT = MVT::v4i32;
8965 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
8966 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
8967 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
8968 DAG.getIntPtrConstant(i, dl));
8969 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
8970 DAG.getConstant(j, dl, MVT::i32));
8971 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
8972 DAG.getConstant(j + 1, dl, MVT::i32));
8973 }
8974 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
8975 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8976 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8977 }
8978
8979 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
8980 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
8981 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
8982 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
8983 DAG.getIntPtrConstant(i, dl));
8984 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
8985 DAG.getConstant(j, dl, MVT::i32));
8986 }
8987
8988 // Now return the result of comparing the subvector with zero,
8989 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8990 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
8991 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8992}
8993
8994// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
8995 static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
8996 const ARMSubtarget *ST) {
8997 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
8998 EVT VT = N->getValueType(0);
8999 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9000 "Expected a vector i1 type!");
9001 SDValue Op = N->getOperand(0);
9002 EVT FromVT = Op.getValueType();
9003 SDLoc DL(N);
9004
9005 SDValue And =
9006 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9007 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9008 DAG.getCondCode(ISD::SETNE));
9009}
9010
9011 static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
9012 const ARMSubtarget *Subtarget) {
9013 if (!Subtarget->hasMVEIntegerOps())
9014 return SDValue();
9015
9016 EVT ToVT = N->getValueType(0);
9017 if (ToVT.getScalarType() == MVT::i1)
9018 return LowerTruncatei1(N, DAG, Subtarget);
9019
9020 // MVE does not have a single instruction to perform the truncation of a v4i32
9021 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9022 // Most of the instructions in MVE follow the 'Beats' system, where moving
9023 // values from different lanes is usually something that the instructions
9024 // avoid.
9025 //
9026 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9027 // which take the top/bottom half of a larger lane and extend it (or do the
9028 // opposite, truncating into the top/bottom lane from a larger lane). Note
9029 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9030 // bottom 16 bits from each vector lane. This works really well with T/B
9031 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9032 // to be reordered.
9033 //
9034 // But truncates and sext/zext are always going to be fairly common from llvm.
9035 // We have several options for how to deal with them:
9036 // - Wherever possible combine them into an instruction that makes them
9037 // "free". This includes loads/stores, which can perform the trunc as part
9038 // of the memory operation. Or certain shuffles that can be turned into
9039 // VMOVN/VMOVL.
9040 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9041 // trunc(mul(sext(a), sext(b))) may become
9042 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9043 // this case can use VMULL). This is performed in the
9044 // MVELaneInterleavingPass.
9045 // - Otherwise we have an option. By default we would expand the
9046 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9047 // registers. One for each vector lane in the vector. This can obviously be
9048 // very expensive.
9049 // - The other option is to use the fact that loads/store can extend/truncate
9050 // to turn a trunc into two truncating stack stores and a stack reload. This
9051 // becomes 3 back-to-back memory operations, but at least that is less than
9052 // all the insert/extracts.
9053 //
9054 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9055 // are either optimized where they can be, or eventually lowered into stack
9056 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9057 // too early, where other instructions would be better, and stops us from
9058 // having to reconstruct multiple buildvector shuffles into loads/stores.
9059 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9060 return SDValue();
9061 EVT FromVT = N->getOperand(0).getValueType();
9062 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9063 return SDValue();
9064
9065 SDValue Lo, Hi;
9066 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9067 SDLoc DL(N);
9068 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9069}
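// Example (illustrative): trunc v8i32 -> v8i16 is split into its two v4i32
// halves and emitted as a single ARMISD::MVETRUNC(Lo, Hi); later combines
// either fold it into neighbouring operations or lower it through the stack
// as described above.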
9070
9071 static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,
9072 const ARMSubtarget *Subtarget) {
9073 if (!Subtarget->hasMVEIntegerOps())
9074 return SDValue();
9075
9076 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9077
9078 EVT ToVT = N->getValueType(0);
9079 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9080 return SDValue();
9081 SDValue Op = N->getOperand(0);
9082 EVT FromVT = Op.getValueType();
9083 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9084 return SDValue();
9085
9086 SDLoc DL(N);
9087 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
9088 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9089 ExtVT = MVT::v8i16;
9090
9091 unsigned Opcode =
9092 N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
9093 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9094 SDValue Ext1 = Ext.getValue(1);
9095
9096 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9097 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9098 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9099 }
9100
9101 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9102}
9103
9104/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9105/// element has been zero/sign-extended, depending on the isSigned parameter,
9106/// from an integer type half its size.
9107 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
9108 bool isSigned) {
9109 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9110 EVT VT = N->getValueType(0);
9111 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9112 SDNode *BVN = N->getOperand(0).getNode();
9113 if (BVN->getValueType(0) != MVT::v4i32 ||
9114 BVN->getOpcode() != ISD::BUILD_VECTOR)
9115 return false;
9116 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9117 unsigned HiElt = 1 - LoElt;
9118 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
9119 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
9120 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
9121 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
9122 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9123 return false;
9124 if (isSigned) {
9125 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9126 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9127 return true;
9128 } else {
9129 if (Hi0->isZero() && Hi1->isZero())
9130 return true;
9131 }
9132 return false;
9133 }
9134
9135 if (N->getOpcode() != ISD::BUILD_VECTOR)
9136 return false;
9137
9138 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9139 SDNode *Elt = N->getOperand(i).getNode();
9140 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
9141 unsigned EltSize = VT.getScalarSizeInBits();
9142 unsigned HalfSize = EltSize / 2;
9143 if (isSigned) {
9144 if (!isIntN(HalfSize, C->getSExtValue()))
9145 return false;
9146 } else {
9147 if (!isUIntN(HalfSize, C->getZExtValue()))
9148 return false;
9149 }
9150 continue;
9151 }
9152 return false;
9153 }
9154
9155 return true;
9156}
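// Example (illustrative): a v4i16 BUILD_VECTOR of constants <100, -7, 3, 12>
// counts as sign-extended because every value fits in 8 signed bits, whereas
// <100, 200, 3, 12> only counts as zero-extended, since 200 fits in 8
// unsigned bits but not in 8 signed bits.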
9157
9158/// isSignExtended - Check if a node is a vector value that is sign-extended
9159/// or a constant BUILD_VECTOR with sign-extended elements.
9160 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
9161 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9162 return true;
9163 if (isExtendedBUILD_VECTOR(N, DAG, true))
9164 return true;
9165 return false;
9166}
9167
9168/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9169/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
9170 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
9171 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
9172 ISD::isZEXTLoad(N))
9173 return true;
9174 if (isExtendedBUILD_VECTOR(N, DAG, false))
9175 return true;
9176 return false;
9177}
9178
9179static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9180 if (OrigVT.getSizeInBits() >= 64)
9181 return OrigVT;
9182
9183 assert(OrigVT.isSimple() && "Expecting a simple value type");
9184
9185 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9186 switch (OrigSimpleTy) {
9187 default: llvm_unreachable("Unexpected Vector Type");
9188 case MVT::v2i8:
9189 case MVT::v2i16:
9190 return MVT::v2i32;
9191 case MVT::v4i8:
9192 return MVT::v4i16;
9193 }
9194}
9195
9196/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9197/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9198/// We insert the required extension here to get the vector to fill a D register.
9199 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
9200 const EVT &OrigTy,
9201 const EVT &ExtTy,
9202 unsigned ExtOpcode) {
9203 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9204 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9205 // 64-bits we need to insert a new extension so that it will be 64-bits.
9206 assert(ExtTy.is128BitVector() && "Unexpected extension size");
9207 if (OrigTy.getSizeInBits() >= 64)
9208 return N;
9209
9210 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9211 EVT NewVT = getExtensionTo64Bits(OrigTy);
9212
9213 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9214}
9215
9216/// SkipLoadExtensionForVMULL - return a load of the original vector size that
9217/// does not do any sign/zero extension. If the original vector is less
9218/// than 64 bits, an appropriate extension will be added after the load to
9219/// reach a total size of 64 bits. We have to add the extension separately
9220/// because ARM does not have a sign/zero extending load for vectors.
9221 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG) {
9222 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9223
9224 // The load already has the right type.
9225 if (ExtendedTy == LD->getMemoryVT())
9226 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9227 LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
9228 LD->getMemOperand()->getFlags());
9229
9230 // We need to create a zextload/sextload. We cannot just create a load
9231 // followed by a zext/sext node because LowerMUL is also run during normal
9232 // operation legalization where we can't create illegal types.
9233 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9234 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9235 LD->getMemoryVT(), LD->getAlign(),
9236 LD->getMemOperand()->getFlags());
9237}
9238
9239/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9240/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9241/// the unextended value. The unextended vector should be 64 bits so that it can
9242/// be used as an operand to a VMULL instruction. If the original vector size
9243 /// before extension is less than 64 bits, we add an extension to resize
9244/// the vector to 64 bits.
9245 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
9246 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9247 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9248 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9249 N->getOperand(0)->getValueType(0),
9250 N->getValueType(0),
9251 N->getOpcode());
9252
9253 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9254 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9255 "Expected extending load");
9256
9257 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9258 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9259 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9260 SDValue extLoad =
9261 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
9262 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9263
9264 return newLoad;
9265 }
9266
9267 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9268 // have been legalized as a BITCAST from v4i32.
9269 if (N->getOpcode() == ISD::BITCAST) {
9270 SDNode *BVN = N->getOperand(0).getNode();
9271 assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
9272 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
9273 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9274 return DAG.getBuildVector(
9275 MVT::v2i32, SDLoc(N),
9276 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9277 }
9278 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9279 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9280 EVT VT = N->getValueType(0);
9281 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9282 unsigned NumElts = VT.getVectorNumElements();
9283 MVT TruncVT = MVT::getIntegerVT(EltSize);
9284 SmallVector<SDValue, 8> Ops;
9285 SDLoc dl(N);
9286 for (unsigned i = 0; i != NumElts; ++i) {
9287 const APInt &CInt = N->getConstantOperandAPInt(i);
9288 // Element types smaller than 32 bits are not legal, so use i32 elements.
9289 // The values are implicitly truncated so sext vs. zext doesn't matter.
9290 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9291 }
9292 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
9293}
9294
9295static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9296 unsigned Opcode = N->getOpcode();
9297 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9298 SDNode *N0 = N->getOperand(0).getNode();
9299 SDNode *N1 = N->getOperand(1).getNode();
9300 return N0->hasOneUse() && N1->hasOneUse() &&
9301 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9302 }
9303 return false;
9304}
9305
9306static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9307 unsigned Opcode = N->getOpcode();
9308 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9309 SDNode *N0 = N->getOperand(0).getNode();
9310 SDNode *N1 = N->getOperand(1).getNode();
9311 return N0->hasOneUse() && N1->hasOneUse() &&
9312 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9313 }
9314 return false;
9315}
9316
9317 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
9318 // Multiplications are only custom-lowered for 128-bit vectors so that
9319 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
9320 EVT VT = Op.getValueType();
9321 assert(VT.is128BitVector() && VT.isInteger() &&
9322 "unexpected type for custom-lowering ISD::MUL");
9323 SDNode *N0 = Op.getOperand(0).getNode();
9324 SDNode *N1 = Op.getOperand(1).getNode();
9325 unsigned NewOpc = 0;
9326 bool isMLA = false;
9327 bool isN0SExt = isSignExtended(N0, DAG);
9328 bool isN1SExt = isSignExtended(N1, DAG);
9329 if (isN0SExt && isN1SExt)
9330 NewOpc = ARMISD::VMULLs;
9331 else {
9332 bool isN0ZExt = isZeroExtended(N0, DAG);
9333 bool isN1ZExt = isZeroExtended(N1, DAG);
9334 if (isN0ZExt && isN1ZExt)
9335 NewOpc = ARMISD::VMULLu;
9336 else if (isN1SExt || isN1ZExt) {
9337 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9338 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
9339 if (isN1SExt && isAddSubSExt(N0, DAG)) {
9340 NewOpc = ARMISD::VMULLs;
9341 isMLA = true;
9342 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
9343 NewOpc = ARMISD::VMULLu;
9344 isMLA = true;
9345 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
9346 std::swap(N0, N1);
9347 NewOpc = ARMISD::VMULLu;
9348 isMLA = true;
9349 }
9350 }
9351
9352 if (!NewOpc) {
9353 if (VT == MVT::v2i64)
9354 // Fall through to expand this. It is not legal.
9355 return SDValue();
9356 else
9357 // Other vector multiplications are legal.
9358 return Op;
9359 }
9360 }
9361
9362 // Legalize to a VMULL instruction.
9363 SDLoc DL(Op);
9364 SDValue Op0;
9365 SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
9366 if (!isMLA) {
9367 Op0 = SkipExtensionForVMULL(N0, DAG);
9368 assert(Op0.getValueType().is64BitVector() &&
9369 Op1.getValueType().is64BitVector() &&
9370 "unexpected types for extended operands to VMULL");
9371 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
9372 }
9373
9374 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
9375 // isel lowering to take advantage of no-stall back to back vmul + vmla.
9376 // vmull q0, d4, d6
9377 // vmlal q0, d5, d6
9378 // is faster than
9379 // vaddl q0, d4, d5
9380 // vmovl q1, d6
9381 // vmul q0, q0, q1
9382 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
9383 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
9384 EVT Op1VT = Op1.getValueType();
9385 return DAG.getNode(N0->getOpcode(), DL, VT,
9386 DAG.getNode(NewOpc, DL, VT,
9387 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
9388 DAG.getNode(NewOpc, DL, VT,
9389 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
9390}
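// Illustrative sketch (not part of this file; the helper name is invented): at
// the NEON-intrinsic level, the MLA form that the lowering above builds for
// (sext A + sext B) * sext C amounts to one widening multiply followed by one
// widening multiply-accumulate, assuming <arm_neon.h> is available.
#include <arm_neon.h>
static inline int32x4_t vmull_mla_sketch(int16x4_t A, int16x4_t B, int16x4_t C) {
  int32x4_t Acc = vmull_s16(A, C); // vmull: widening multiply of the first addend
  return vmlal_s16(Acc, B, C);     // vmlal: widening multiply-accumulate of the second
}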
9391
9392 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
9393 SelectionDAG &DAG) {
9394 // TODO: Should this propagate fast-math-flags?
9395
9396 // Convert to float
9397 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9398 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9399 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9400 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9401 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9402 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9403 // Get reciprocal estimate.
9404 // float4 recip = vrecpeq_f32(yf);
9405 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9406 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9407 Y);
9408 // Because char has a smaller range than uchar, we can actually get away
9409 // without any newton steps. This requires that we use a weird bias
9410 // of 0xb000, however (again, this has been exhaustively tested).
9411 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
9412 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
9413 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9414 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9415 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9416 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9417 // Convert back to short.
9418 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9419 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9420 return X;
9421}
9422
9423 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
9424 SelectionDAG &DAG) {
9425 // TODO: Should this propagate fast-math-flags?
9426
9427 SDValue N2;
9428 // Convert to float.
9429 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9430 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9431 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9432 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9433 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9434 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9435
9436 // Use reciprocal estimate and one refinement step.
9437 // float4 recip = vrecpeq_f32(yf);
9438 // recip *= vrecpsq_f32(yf, recip);
9439 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9440 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9441 N1);
9442 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9443 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9444 N1, N2);
9445 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9446 // Because short has a smaller range than ushort, we can actually get away
9447 // with only a single newton step. This requires that we use a weird bias
9448 // of 0x89, however (again, this has been exhaustively tested).
9449 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9450 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9451 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9452 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9453 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9454 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9455 // Convert back to integer and return.
9456 // return vmovn_s32(vcvt_s32_f32(result));
9457 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9458 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9459 return N0;
9460}
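// A sketch of the same recipe written with arm_neon.h intrinsics (assumed
// equivalent to the node sequence above; the helper name is invented, and the
// 0x89 bit-pattern bias and single refinement step are taken from the code
// above, not re-derived here).
#include <arm_neon.h>
static inline int16x4_t sdiv_v4i16_sketch(int16x4_t X, int16x4_t Y) {
  float32x4_t XF = vcvtq_f32_s32(vmovl_s16(X));
  float32x4_t YF = vcvtq_f32_s32(vmovl_s16(Y));
  float32x4_t Recip = vrecpeq_f32(YF);              // estimate of 1/yf
  Recip = vmulq_f32(vrecpsq_f32(YF, Recip), Recip); // one Newton-Raphson step
  int32x4_t Bits = vreinterpretq_s32_f32(vmulq_f32(XF, Recip));
  Bits = vaddq_s32(Bits, vdupq_n_s32(0x89));        // bias the float bit pattern
  return vmovn_s32(vcvtq_s32_f32(vreinterpretq_f32_s32(Bits)));
}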
9461
9462 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
9463 const ARMSubtarget *ST) {
9464 EVT VT = Op.getValueType();
9465 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9466 "unexpected type for custom-lowering ISD::SDIV");
9467
9468 SDLoc dl(Op);
9469 SDValue N0 = Op.getOperand(0);
9470 SDValue N1 = Op.getOperand(1);
9471 SDValue N2, N3;
9472
9473 if (VT == MVT::v8i8) {
9474 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9475 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9476
9477 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9478 DAG.getIntPtrConstant(4, dl));
9479 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9480 DAG.getIntPtrConstant(4, dl));
9481 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9482 DAG.getIntPtrConstant(0, dl));
9483 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9484 DAG.getIntPtrConstant(0, dl));
9485
9486 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9487 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9488
9489 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9490 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9491
9492 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9493 return N0;
9494 }
9495 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9496}
9497
9498 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
9499 const ARMSubtarget *ST) {
9500 // TODO: Should this propagate fast-math-flags?
9501 EVT VT = Op.getValueType();
9502 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9503 "unexpected type for custom-lowering ISD::UDIV");
9504
9505 SDLoc dl(Op);
9506 SDValue N0 = Op.getOperand(0);
9507 SDValue N1 = Op.getOperand(1);
9508 SDValue N2, N3;
9509
9510 if (VT == MVT::v8i8) {
9511 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9512 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9513
9514 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9515 DAG.getIntPtrConstant(4, dl));
9516 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9517 DAG.getIntPtrConstant(4, dl));
9518 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9519 DAG.getIntPtrConstant(0, dl));
9520 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9521 DAG.getIntPtrConstant(0, dl));
9522
9523 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9524 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9525
9526 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9527 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9528
9529 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9530 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9531 MVT::i32),
9532 N0);
9533 return N0;
9534 }
9535
9536 // v4i16 udiv ... Convert to float.
9537 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9538 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9539 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9540 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9541 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9542 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9543
9544 // Use reciprocal estimate and two refinement steps.
9545 // float4 recip = vrecpeq_f32(yf);
9546 // recip *= vrecpsq_f32(yf, recip);
9547 // recip *= vrecpsq_f32(yf, recip);
9548 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9549 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9550 BN1);
9551 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9552 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9553 BN1, N2);
9554 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9555 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9556 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9557 BN1, N2);
9558 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9559 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9560 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9561 // and that it will never cause us to return an answer too large).
9562 // float4 result = as_float4(as_int4(xf*recip) + 2);
9563 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9564 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9565 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9566 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9567 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9568 // Convert back to integer and return.
9569 // return vmovn_u32(vcvt_s32_f32(result));
9570 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9571 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9572 return N0;
9573}
9574
9575 static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
9576 SDNode *N = Op.getNode();
9577 EVT VT = N->getValueType(0);
9578 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9579
9580 SDValue Carry = Op.getOperand(2);
9581
9582 SDLoc DL(Op);
9583
9584 SDValue Result;
9585 if (Op.getOpcode() == ISD::UADDO_CARRY) {
9586 // This converts the boolean value carry into the carry flag.
9587 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9588
9589 // Do the addition proper using the carry flag we wanted.
9590 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
9591 Op.getOperand(1), Carry);
9592
9593 // Now convert the carry flag into a boolean value.
9594 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9595 } else {
9596 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
9597 // have to invert the carry first.
9598 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9599 DAG.getConstant(1, DL, MVT::i32), Carry);
9600 // This converts the boolean value carry into the carry flag.
9601 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9602
9603 // Do the subtraction proper using the carry flag we wanted.
9604 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
9605 Op.getOperand(1), Carry);
9606
9607 // Now convert the carry flag into a boolean value.
9608 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9609 // But the carry returned by ARMISD::SUBE is not a borrow as expected
9610 // by ISD::USUBO_CARRY, so compute 1 - C.
9611 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9612 DAG.getConstant(1, DL, MVT::i32), Carry);
9613 }
9614
9615 // Return both values.
9616 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
9617}
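// Scalar model of the borrow/carry conversion above (a sketch; names invented).
// ISD::USUBO_CARRY consumes and produces a borrow bit, while ARMISD::SUBE uses
// the ARM carry flag, where C = 1 means "no borrow"; that is why the lowering
// computes 1 - C both before and after the subtraction.
#include <cstdint>
static inline uint32_t usubo_carry_model(uint32_t A, uint32_t B, uint32_t BorrowIn,
                                         uint32_t &BorrowOut) {
  uint32_t CarryIn = 1u - BorrowIn;                 // ISD borrow -> ARM carry
  uint64_t Wide = (uint64_t)A - B - (1u - CarryIn); // ARMISD::SUBE computes A - B - !C
  uint32_t CarryOut = (Wide >> 32) ? 0u : 1u;       // ARM carry: 1 means no borrow
  BorrowOut = 1u - CarryOut;                        // ARM carry -> ISD borrow
  return (uint32_t)Wide;
}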
9618
9619SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
9620 bool Signed,
9621 SDValue &Chain) const {
9622 EVT VT = Op.getValueType();
9623 assert((VT == MVT::i32 || VT == MVT::i64) &&
9624 "unexpected type for custom lowering DIV");
9625 SDLoc dl(Op);
9626
9627 const auto &DL = DAG.getDataLayout();
9628 RTLIB::Libcall LC;
9629 if (Signed)
9630 LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
9631 else
9632 LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;
9633
9634 const char *Name = getLibcallName(LC);
9635 SDValue ES = DAG.getExternalSymbol(Name, getPointerTy(DL));
9636
9637 ARMTargetLowering::ArgListTy Args;
9638
9639 for (auto AI : {1, 0}) {
9640 SDValue Operand = Op.getOperand(AI);
9641 Args.emplace_back(Operand,
9642 Operand.getValueType().getTypeForEVT(*DAG.getContext()));
9643 }
9644
9645 CallLoweringInfo CLI(DAG);
9646 CLI.setDebugLoc(dl)
9647 .setChain(Chain)
9649 ES, std::move(Args));
9650
9651 return LowerCallTo(CLI).first;
9652}
9653
9654// This is a code size optimisation: return the original SDIV node to
9655// DAGCombiner when we don't want to expand SDIV into a sequence of
9656// instructions, and an empty node otherwise which will cause the
9657// SDIV to be expanded in DAGCombine.
9658SDValue
9659ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9660 SelectionDAG &DAG,
9661 SmallVectorImpl<SDNode *> &Created) const {
9662 // TODO: Support SREM
9663 if (N->getOpcode() != ISD::SDIV)
9664 return SDValue();
9665
9666 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
9667 const bool MinSize = ST.hasMinSize();
9668 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9669 : ST.hasDivideInARMMode();
9670
9671 // Don't touch vector types; rewriting this may lead to scalarizing
9672 // the int divs.
9673 if (N->getOperand(0).getValueType().isVector())
9674 return SDValue();
9675
9676 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
9677 // hwdiv support for this to be really profitable.
9678 if (!(MinSize && HasDivide))
9679 return SDValue();
9680
9681 // ARM mode is a bit simpler than Thumb: we can handle large power
9682 // of 2 immediates with 1 mov instruction; no further checks required,
9683 // just return the sdiv node.
9684 if (!ST.isThumb())
9685 return SDValue(N, 0);
9686
9687 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9688 // and thus lose the code size benefits of a MOVS that requires only 2 bytes.
9689 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9690 // but as it's doing exactly this, it's not worth the trouble to get TTI.
9691 if (Divisor.sgt(128))
9692 return SDValue();
9693
9694 return SDValue(N, 0);
9695}
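// For reference, when this hook returns an empty SDValue, DAGCombine expands a
// signed division by 2^K roughly as follows (a sketch of the standard strength
// reduction, not code from this file; assumes 1 <= K <= 31 and that right
// shifts of negative values are arithmetic, as on ARM).
#include <cstdint>
static inline int32_t sdiv_pow2_expanded(int32_t X, unsigned K) {
  // Bias is 2^K - 1 when X is negative, 0 otherwise, so the shift rounds
  // toward zero like an integer division does.
  int32_t Bias = (int32_t)((uint32_t)(X >> 31) >> (32 - K));
  return (X + Bias) >> K;
}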
9696
9697SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9698 bool Signed) const {
9699 assert(Op.getValueType() == MVT::i32 &&
9700 "unexpected type for custom lowering DIV");
9701 SDLoc dl(Op);
9702
9703 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9704 DAG.getEntryNode(), Op.getOperand(1));
9705
9706 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9707}
9708
9709 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
9710 SDLoc DL(N);
9711 SDValue Op = N->getOperand(1);
9712 if (N->getValueType(0) == MVT::i32)
9713 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
9714 SDValue Lo, Hi;
9715 std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
9716 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
9717 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
9718}
9719
9720void ARMTargetLowering::ExpandDIV_Windows(
9721 SDValue Op, SelectionDAG &DAG, bool Signed,
9722 SmallVectorImpl<SDValue> &Results) const {
9723 const auto &DL = DAG.getDataLayout();
9724
9725 assert(Op.getValueType() == MVT::i64 &&
9726 "unexpected type for custom lowering DIV");
9727 SDLoc dl(Op);
9728
9729 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
9730
9731 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9732
9733 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
9734 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
9735 DAG.getConstant(32, dl, getPointerTy(DL)));
9736 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
9737
9738 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
9739}
9740
9741 static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
9742 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
9743 EVT MemVT = LD->getMemoryVT();
9744 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9745 MemVT == MVT::v16i1) &&
9746 "Expected a predicate type!");
9747 assert(MemVT == Op.getValueType());
9748 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
9749 "Expected a non-extending load");
9750 assert(LD->isUnindexed() && "Expected an unindexed load");
9751
9752 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16-bit
9753 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
9754 // need to make sure that the 8/4/2 bits are actually loaded into the correct
9755 // place, which means loading the value and then shuffling the values into
9756 // the bottom bits of the predicate.
9757 // Equally, VLDR for a v16i1 will actually load 32 bits (so will be incorrect
9758 // for BE).
9759 // Speaking of BE, the rest of LLVM assumes a reverse order relative to a
9760 // natural VMSR(load), so the loaded value needs to be reversed.
9761
9762 SDLoc dl(Op);
9763 SDValue Load = DAG.getExtLoad(
9764 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
9765 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
9766 LD->getMemOperand());
9767 SDValue Val = Load;
9768 if (DAG.getDataLayout().isBigEndian())
9769 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
9770 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
9771 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
9772 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
9773 if (MemVT != MVT::v16i1)
9774 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
9775 DAG.getConstant(0, dl, MVT::i32));
9776 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
9777}
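// Scalar sketch of the bit layout the comment above refers to (spelled out as
// an assumption for exposition: each v4i1 lane occupies 4 consecutive bits of
// the 16-bit MVE predicate; the helper name is invented).
#include <cstdint>
static inline uint16_t expandV4I1ToPredicateBits(unsigned Mask4) {
  // Mask4 bits 0..3 correspond to lanes 0..3.
  uint16_t P = 0;
  for (unsigned Lane = 0; Lane < 4; ++Lane)
    if (Mask4 & (1u << Lane))
      P |= (uint16_t)(0xFu << (4 * Lane)); // lane i -> predicate bits [4i, 4i+3]
  return P;
}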
9778
9779void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
9780 SelectionDAG &DAG) const {
9781 LoadSDNode *LD = cast<LoadSDNode>(N);
9782 EVT MemVT = LD->getMemoryVT();
9783 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
9784
9785 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9786 !Subtarget->isThumb1Only() && LD->isVolatile() &&
9787 LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
9788 SDLoc dl(N);
9789 SDValue Result = DAG.getMemIntrinsicNode(
9790 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
9791 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
9792 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
9793 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
9794 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
9795 Results.append({Pair, Result.getValue(2)});
9796 }
9797}
9798
9799 static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
9800 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9801 EVT MemVT = ST->getMemoryVT();
9802 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9803 MemVT == MVT::v16i1) &&
9804 "Expected a predicate type!");
9805 assert(MemVT == ST->getValue().getValueType());
9806 assert(!ST->isTruncatingStore() && "Expected a non-extending store");
9807 assert(ST->isUnindexed() && "Expected an unindexed store");
9808
9809 // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with
9810 // top bits unset and a scalar store.
9811 SDLoc dl(Op);
9812 SDValue Build = ST->getValue();
9813 if (MemVT != MVT::v16i1) {
9814 SmallVector<SDValue, 16> Ops;
9815 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
9816 unsigned Elt = DAG.getDataLayout().isBigEndian()
9817 ? MemVT.getVectorNumElements() - I - 1
9818 : I;
9819 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
9820 DAG.getConstant(Elt, dl, MVT::i32)));
9821 }
9822 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
9823 Ops.push_back(DAG.getUNDEF(MVT::i32));
9824 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
9825 }
9826 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
9827 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
9828 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
9829 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
9830 DAG.getConstant(16, dl, MVT::i32));
9831 return DAG.getTruncStore(
9832 ST->getChain(), dl, GRP, ST->getBasePtr(),
9833 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
9834 ST->getMemOperand());
9835}
9836
9837 static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
9838 const ARMSubtarget *Subtarget) {
9839 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9840 EVT MemVT = ST->getMemoryVT();
9841 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
9842
9843 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9844 !Subtarget->isThumb1Only() && ST->isVolatile() &&
9845 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
9846 SDNode *N = Op.getNode();
9847 SDLoc dl(N);
9848
9849 SDValue Lo = DAG.getNode(
9850 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9851 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
9852 MVT::i32));
9853 SDValue Hi = DAG.getNode(
9854 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9855 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
9856 MVT::i32));
9857
9858 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
9859 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
9860 MemVT, ST->getMemOperand());
9861 } else if (Subtarget->hasMVEIntegerOps() &&
9862 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9863 MemVT == MVT::v16i1))) {
9864 return LowerPredicateStore(Op, DAG);
9865 }
9866
9867 return SDValue();
9868}
9869
9870static bool isZeroVector(SDValue N) {
9871 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
9872 (N->getOpcode() == ARMISD::VMOVIMM &&
9873 isNullConstant(N->getOperand(0))));
9874}
9875
9876 static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
9877 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
9878 MVT VT = Op.getSimpleValueType();
9879 SDValue Mask = N->getMask();
9880 SDValue PassThru = N->getPassThru();
9881 SDLoc dl(Op);
9882
9883 if (isZeroVector(PassThru))
9884 return Op;
9885
9886 // MVE Masked loads use zero as the passthru value. Here we convert undef to
9887 // zero too, and other values are lowered to a select.
9888 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
9889 DAG.getTargetConstant(0, dl, MVT::i32));
9890 SDValue NewLoad = DAG.getMaskedLoad(
9891 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
9892 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
9893 N->getExtensionType(), N->isExpandingLoad());
9894 SDValue Combo = NewLoad;
9895 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
9896 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
9897 isZeroVector(PassThru->getOperand(0));
9898 if (!PassThru.isUndef() && !PassThruIsCastZero)
9899 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
9900 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
9901}
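// Reference semantics for the passthru handling above, as a per-lane scalar
// sketch (names invented): the MVE masked load always produces zero in
// inactive lanes, so a non-zero, non-undef passthru is re-applied afterwards
// with a select.
#include <cstdint>
static inline int32_t mloadLaneModel(bool LaneActive, int32_t LoadedValue,
                                     int32_t PassThru) {
  int32_t HardwareLane = LaneActive ? LoadedValue : 0; // what the masked VLDR yields
  return LaneActive ? HardwareLane : PassThru;         // the VSELECT re-applies passthru
}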
9902
9903 static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
9904 const ARMSubtarget *ST) {
9905 if (!ST->hasMVEIntegerOps())
9906 return SDValue();
9907
9908 SDLoc dl(Op);
9909 unsigned BaseOpcode = 0;
9910 switch (Op->getOpcode()) {
9911 default: llvm_unreachable("Expected VECREDUCE opcode");
9912 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
9913 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
9914 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
9915 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
9916 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
9917 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
9918 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
9919 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
9920 }
9921
9922 SDValue Op0 = Op->getOperand(0);
9923 EVT VT = Op0.getValueType();
9924 EVT EltVT = VT.getVectorElementType();
9925 unsigned NumElts = VT.getVectorNumElements();
9926 unsigned NumActiveLanes = NumElts;
9927
9928 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
9929 NumActiveLanes == 2) &&
9930 "Only expected a power 2 vector size");
9931
9932 // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
9933 // allows us to easily extract vector elements from the lanes.
9934 while (NumActiveLanes > 4) {
9935 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
9936 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
9937 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
9938 NumActiveLanes /= 2;
9939 }
9940
9941 SDValue Res;
9942 if (NumActiveLanes == 4) {
9943 // The remaining 4 elements are reduced sequentially.
9944 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9945 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
9946 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9947 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
9948 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9949 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
9950 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9951 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
9952 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
9953 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
9954 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
9955 } else {
9956 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9957 DAG.getConstant(0, dl, MVT::i32));
9958 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9959 DAG.getConstant(1, dl, MVT::i32));
9960 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
9961 }
9962
9963 // Result type may be wider than element type.
9964 if (EltVT != Op->getValueType(0))
9965 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
9966 return Res;
9967}
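// Simplified scalar model of the reduction strategy above (a sketch; for an
// associative, commutative operation the VREV-based pairing combines the same
// set of lanes, so a plain pairwise fold gives the same result).
#include <cstddef>
#include <utility>
#include <vector>
static inline int vecreduceModel(std::vector<int> Lanes, int (*Op)(int, int)) {
  while (Lanes.size() > 4) {                        // Op(X, Rev(X)) halves the active lanes
    std::vector<int> Next(Lanes.size() / 2);
    for (size_t I = 0; I < Next.size(); ++I)
      Next[I] = Op(Lanes[2 * I], Lanes[2 * I + 1]);
    Lanes = std::move(Next);
  }
  int Res = Lanes[0];                               // fold the last 4 (or 2) lanes scalarly
  for (size_t I = 1; I < Lanes.size(); ++I)
    Res = Op(Res, Lanes[I]);
  return Res;
}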
9968
9969 static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
9970 const ARMSubtarget *ST) {
9971 if (!ST->hasMVEFloatOps())
9972 return SDValue();
9973 return LowerVecReduce(Op, DAG, ST);
9974}
9975
9976 static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG,
9977 const ARMSubtarget *ST) {
9978 if (!ST->hasNEON())
9979 return SDValue();
9980
9981 SDLoc dl(Op);
9982 SDValue Op0 = Op->getOperand(0);
9983 EVT VT = Op0.getValueType();
9984 EVT EltVT = VT.getVectorElementType();
9985
9986 unsigned PairwiseIntrinsic = 0;
9987 switch (Op->getOpcode()) {
9988 default:
9989 llvm_unreachable("Expected VECREDUCE opcode");
9990 case ISD::VECREDUCE_UMIN:
9991 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
9992 break;
9993 case ISD::VECREDUCE_UMAX:
9994 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
9995 break;
9996 case ISD::VECREDUCE_SMIN:
9997 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
9998 break;
9999 case ISD::VECREDUCE_SMAX:
10000 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10001 break;
10002 }
10003 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10004
10005 unsigned NumElts = VT.getVectorNumElements();
10006 unsigned NumActiveLanes = NumElts;
10007
10008 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10009 NumActiveLanes == 2) &&
10010 "Only expected a power 2 vector size");
10011
10012 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10013 if (VT.is128BitVector()) {
10014 SDValue Lo, Hi;
10015 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10016 VT = Lo.getValueType();
10017 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10018 NumActiveLanes /= 2;
10019 }
10020
10021 // Use pairwise reductions until one lane remains
10022 while (NumActiveLanes > 1) {
10023 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10024 NumActiveLanes /= 2;
10025 }
10026
10027 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10028 DAG.getConstant(0, dl, MVT::i32));
10029
10030 // Result type may be wider than element type.
10031 if (EltVT != Op.getValueType()) {
10032 unsigned Extend = 0;
10033 switch (Op->getOpcode()) {
10034 default:
10035 llvm_unreachable("Expected VECREDUCE opcode");
10036 case ISD::VECREDUCE_UMIN:
10037 case ISD::VECREDUCE_UMAX:
10038 Extend = ISD::ZERO_EXTEND;
10039 break;
10040 case ISD::VECREDUCE_SMIN:
10041 case ISD::VECREDUCE_SMAX:
10042 Extend = ISD::SIGN_EXTEND;
10043 break;
10044 }
10045 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10046 }
10047 return Res;
10048}
10049
10050 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
10051 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10052 // Acquire/Release load/store is not legal for targets without a dmb or
10053 // equivalent available.
10054 return SDValue();
10055
10056 // Monotonic load/store is legal for all targets.
10057 return Op;
10058}
10059
10060 static void ReplaceREADCYCLECOUNTER(SDNode *N,
10061 SmallVectorImpl<SDValue> &Results,
10062 SelectionDAG &DAG,
10063 const ARMSubtarget *Subtarget) {
10064 SDLoc DL(N);
10065 // Under Power Management extensions, the cycle-count is:
10066 // mrc p15, #0, <Rt>, c9, c13, #0
10067 SDValue Ops[] = { N->getOperand(0), // Chain
10068 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10069 DAG.getTargetConstant(15, DL, MVT::i32),
10070 DAG.getTargetConstant(0, DL, MVT::i32),
10071 DAG.getTargetConstant(9, DL, MVT::i32),
10072 DAG.getTargetConstant(13, DL, MVT::i32),
10073 DAG.getTargetConstant(0, DL, MVT::i32)
10074 };
10075
10076 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10077 DAG.getVTList(MVT::i32, MVT::Other), Ops);
10078 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10079 DAG.getConstant(0, DL, MVT::i32)));
10080 Results.push_back(Cycles32.getValue(1));
10081}
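// For reference, the same PMCCNTR read that the node sequence above emits can
// be written as GCC/Clang inline assembly (a sketch; it assumes user-mode
// access to the cycle counter has been enabled, e.g. via PMUSERENR).
#include <cstdint>
static inline uint32_t readCycleCounter32() {
  uint32_t Cycles;
  asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(Cycles)); // mrc p15, #0, Rt, c9, c13, #0
  return Cycles;
}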
10082
10083 static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0,
10084 SDValue V1) {
10085 SDLoc dl(V0.getNode());
10086 SDValue RegClass =
10087 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10088 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10089 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10090 const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
10091 return SDValue(
10092 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10093}
10094
10095 static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V) {
10096 SDLoc dl(V.getNode());
10097 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10098 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10099 if (isBigEndian)
10100 std::swap(VLo, VHi);
10101 return createGPRPairNode2xi32(DAG, VLo, VHi);
10102}
10103
10104 static void ReplaceCMP_SWAP_64Results(SDNode *N,
10105 SmallVectorImpl<SDValue> &Results,
10106 SelectionDAG &DAG) {
10107 assert(N->getValueType(0) == MVT::i64 &&
10108 "AtomicCmpSwap on types less than 64 should be legal");
10109 SDValue Ops[] = {
10110 createGPRPairNode2xi32(DAG, N->getOperand(1),
10111 DAG.getUNDEF(MVT::i32)), // pointer, temp
10112 createGPRPairNodei64(DAG, N->getOperand(2)), // expected
10113 createGPRPairNodei64(DAG, N->getOperand(3)), // new
10114 N->getOperand(0), // chain in
10115 };
10116 SDNode *CmpSwap = DAG.getMachineNode(
10117 ARM::CMP_SWAP_64, SDLoc(N),
10118 DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);
10119
10120 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10121 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10122
10123 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10124
10125 SDValue Lo =
10126 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10127 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10128 SDValue Hi =
10129 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10130 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10131 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10132 Results.push_back(SDValue(CmpSwap, 2));
10133}
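// Source-level usage that reaches the 64-bit compare-and-swap path above on
// 32-bit ARM (a sketch; plain C++, nothing ARM-specific in the source, and it
// only lowers inline on targets where LDREXD/STREXD make the atomic lock-free).
#include <atomic>
#include <cstdint>
static inline bool cas64(std::atomic<uint64_t> &Obj, uint64_t &Expected,
                         uint64_t Desired) {
  return Obj.compare_exchange_strong(Expected, Desired);
}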
10134
10135SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10136 SDLoc dl(Op);
10137 EVT VT = Op.getValueType();
10138 SDValue Chain = Op.getOperand(0);
10139 SDValue LHS = Op.getOperand(1);
10140 SDValue RHS = Op.getOperand(2);
10141 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10142 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10143
10144 // If we don't have instructions of this float type then soften to a libcall
10145 // and use SETCC instead.
10146 if (isUnsupportedFloatingType(LHS.getValueType())) {
10147 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
10148 Chain, IsSignaling);
10149 if (!RHS.getNode()) {
10150 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10151 CC = ISD::SETNE;
10152 }
10153 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10154 DAG.getCondCode(CC));
10155 return DAG.getMergeValues({Result, Chain}, dl);
10156 }
10157
10158 ARMCC::CondCodes CondCode, CondCode2;
10159 FPCCToARMCC(CC, CondCode, CondCode2);
10160
10161 SDValue True = DAG.getConstant(1, dl, VT);
10162 SDValue False = DAG.getConstant(0, dl, VT);
10163 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10164 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10165 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, Cmp, DAG);
10166 if (CondCode2 != ARMCC::AL) {
10167 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10168 Result = getCMOV(dl, VT, Result, True, ARMcc, Cmp, DAG);
10169 }
10170 return DAG.getMergeValues({Result, Chain}, dl);
10171}
10172
10173SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10174 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10175
10176 EVT VT = getPointerTy(DAG.getDataLayout());
10177 int FI = MFI.CreateFixedObject(4, 0, false);
10178 return DAG.getFrameIndex(FI, VT);
10179}
10180
10181SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
10182 SelectionDAG &DAG) const {
10183 SDLoc DL(Op);
10184 MakeLibCallOptions CallOptions;
10185 MVT SVT = Op.getOperand(0).getSimpleValueType();
10186 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
10187 SDValue Res =
10188 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
10189 return DAG.getBitcast(MVT::i32, Res);
10190}
10191
10192SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10193 SDLoc dl(Op);
10194 SDValue LHS = Op.getOperand(0);
10195 SDValue RHS = Op.getOperand(1);
10196
10197 // Determine if this is signed or unsigned comparison
10198 bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10199
10200 // Special case for Thumb1 UCMP only
10201 if (!IsSigned && Subtarget->isThumb1Only()) {
10202 // For Thumb unsigned comparison, use this sequence:
10203 // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10204 // sbc r2, r2 ; r2 = r2 - r2 - !carry
10205 // cmp r1, r0 ; compare RHS with LHS
10206 // sbc r1, r1 ; r1 = r1 - r1 - !carry
10207 // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
10208
10209 // First subtraction: LHS - RHS
10210 SDValue Sub1WithFlags = DAG.getNode(
10211 ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10212 SDValue Sub1Result = Sub1WithFlags.getValue(0);
10213 SDValue Flags1 = Sub1WithFlags.getValue(1);
10214
10215 // SUBE: Sub1Result - Sub1Result - !carry
10216 // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10217 SDValue Sbc1 =
10218 DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10219 Sub1Result, Sub1Result, Flags1);
10220 SDValue Sbc1Result = Sbc1.getValue(0);
10221
10222 // Second comparison: RHS vs LHS (reverse comparison)
10223 SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10224
10225 // SUBE: RHS - RHS - !carry
10226 // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned)
10227 SDValue Sbc2 = DAG.getNode(
10228 ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10229 SDValue Sbc2Result = Sbc2.getValue(0);
10230
10231 // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
10232 SDValue Result =
10233 DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
10234 if (Op.getValueType() != MVT::i32)
10235 Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10236
10237 return Result;
10238 }
10239
10240 // For the ARM assembly pattern:
10241 //   subs  r0, r0, r1 ; subtract RHS from LHS and set flags
10242 //   movgt r0, #1     ; if LHS > RHS, set result to 1  (GT for signed, HI for unsigned)
10243 //   mvnlt r0, #0     ; if LHS < RHS, set result to -1 (LT for signed, LO for unsigned)
10244 //                    ; if LHS == RHS, the result remains 0 from the subs
10245 //
10246
10247 // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
10248 unsigned Opcode = ARMISD::SUBC;
10249
10250 // Check if RHS is a subtraction against 0: (0 - X)
10251 if (RHS.getOpcode() == ISD::SUB) {
10252 SDValue SubLHS = RHS.getOperand(0);
10253 SDValue SubRHS = RHS.getOperand(1);
10254
10255 // Check if it's 0 - X
10256 if (isNullConstant(SubLHS)) {
10257 bool CanUseAdd = false;
10258 if (IsSigned) {
10259 // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
10260 if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10261 .getSignedMinValue()
10262 .isMinSignedValue()) {
10263 CanUseAdd = true;
10264 }
10265 } else {
10266 // For UCMP: only if X is known to never be zero
10267 if (DAG.isKnownNeverZero(SubRHS)) {
10268 CanUseAdd = true;
10269 }
10270 }
10271
10272 if (CanUseAdd) {
10273 Opcode = ARMISD::ADDC;
10274 RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10275 // LHS - (0 - X)
10276 }
10277 }
10278 }
10279
10280 // Generate the operation with flags
10281 SDValue OpWithFlags =
10282 DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10283
10284 SDValue OpResult = OpWithFlags.getValue(0);
10285 SDValue Flags = OpWithFlags.getValue(1);
10286
10287 // Constants for conditional moves
10288 SDValue One = DAG.getConstant(1, dl, MVT::i32);
10289 SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10290
10291 // Select condition codes based on signed vs unsigned
10292 ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10293 ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10294
10295 // First conditional move: if greater than, set to 1
10296 SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10297 SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10298 GTCondValue, Flags);
10299
10300 // Second conditional move: if less than, set to -1
10301 SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10302 SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10303 LTCondValue, Flags);
10304
10305 if (Op.getValueType() != MVT::i32)
10306 Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10307
10308 return Result2;
10309}
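// Reference semantics for ISD::SCMP/ISD::UCMP that the sequences above
// materialise: a three-way compare returning -1, 0, or 1 (scalar sketch;
// helper names invented).
#include <cstdint>
static inline int32_t scmpModel(int32_t A, int32_t B) { return (A > B) - (A < B); }
static inline int32_t ucmpModel(uint32_t A, uint32_t B) { return (A > B) - (A < B); }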
10310
10311 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
10312 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10313 switch (Op.getOpcode()) {
10314 default: llvm_unreachable("Don't know how to custom lower this!");
10315 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10316 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10317 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10318 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10319 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10320 case ISD::SELECT: return LowerSELECT(Op, DAG);
10321 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10322 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10323 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10324 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10325 case ISD::VASTART: return LowerVASTART(Op, DAG);
10326 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10327 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10328 case ISD::SINT_TO_FP:
10329 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10330 case ISD::STRICT_FP_TO_SINT:
10331 case ISD::STRICT_FP_TO_UINT:
10332 case ISD::FP_TO_SINT:
10333 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10334 case ISD::FP_TO_SINT_SAT:
10335 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10336 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10337 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10338 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10339 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10340 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10341 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10342 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10343 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10344 Subtarget);
10345 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10346 case ISD::SHL:
10347 case ISD::SRL:
10348 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10349 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10350 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10351 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10352 case ISD::SRL_PARTS:
10353 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10354 case ISD::CTTZ:
10355 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10356 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10357 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10358 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10359 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10360 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10361 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10362 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10363 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10364 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10365 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10366 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10367 case ISD::SIGN_EXTEND:
10368 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10369 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10370 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10371 case ISD::SET_FPMODE:
10372 return LowerSET_FPMODE(Op, DAG);
10373 case ISD::RESET_FPMODE:
10374 return LowerRESET_FPMODE(Op, DAG);
10375 case ISD::MUL: return LowerMUL(Op, DAG);
10376 case ISD::SDIV:
10377 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10378 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10379 return LowerSDIV(Op, DAG, Subtarget);
10380 case ISD::UDIV:
10381 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10382 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10383 return LowerUDIV(Op, DAG, Subtarget);
10384 case ISD::UADDO_CARRY:
10385 case ISD::USUBO_CARRY:
10386 return LowerUADDSUBO_CARRY(Op, DAG);
10387 case ISD::SADDO:
10388 case ISD::SSUBO:
10389 return LowerSignedALUO(Op, DAG);
10390 case ISD::UADDO:
10391 case ISD::USUBO:
10392 return LowerUnsignedALUO(Op, DAG);
10393 case ISD::SADDSAT:
10394 case ISD::SSUBSAT:
10395 case ISD::UADDSAT:
10396 case ISD::USUBSAT:
10397 return LowerADDSUBSAT(Op, DAG, Subtarget);
10398 case ISD::LOAD:
10399 return LowerPredicateLoad(Op, DAG);
10400 case ISD::STORE:
10401 return LowerSTORE(Op, DAG, Subtarget);
10402 case ISD::MLOAD:
10403 return LowerMLOAD(Op, DAG);
10404 case ISD::VECREDUCE_MUL:
10405 case ISD::VECREDUCE_AND:
10406 case ISD::VECREDUCE_OR:
10407 case ISD::VECREDUCE_XOR:
10408 return LowerVecReduce(Op, DAG, Subtarget);
10409 case ISD::VECREDUCE_FADD:
10410 case ISD::VECREDUCE_FMUL:
10411 case ISD::VECREDUCE_FMIN:
10412 case ISD::VECREDUCE_FMAX:
10413 return LowerVecReduceF(Op, DAG, Subtarget);
10414 case ISD::VECREDUCE_UMIN:
10415 case ISD::VECREDUCE_UMAX:
10416 case ISD::VECREDUCE_SMIN:
10417 case ISD::VECREDUCE_SMAX:
10418 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10419 case ISD::ATOMIC_LOAD:
10420 case ISD::ATOMIC_STORE:
10421 return LowerAtomicLoadStore(Op, DAG);
10422 case ISD::SDIVREM:
10423 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10424 case ISD::DYNAMIC_STACKALLOC:
10425 if (Subtarget->isTargetWindows())
10426 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10427 llvm_unreachable("Don't know how to custom lower this!");
10428 case ISD::STRICT_FP_ROUND:
10429 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10430 case ISD::STRICT_FP_EXTEND:
10431 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10432 case ISD::STRICT_FSETCC:
10433 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10434 case ISD::SPONENTRY:
10435 return LowerSPONENTRY(Op, DAG);
10436 case ISD::FP_TO_BF16:
10437 return LowerFP_TO_BF16(Op, DAG);
10438 case ARMISD::WIN__DBZCHK: return SDValue();
10439 case ISD::UCMP:
10440 case ISD::SCMP:
10441 return LowerCMP(Op, DAG);
10442 case ISD::ABS:
10443 return LowerABS(Op, DAG);
10444 case ISD::STRICT_LROUND:
10445 case ISD::STRICT_LLROUND:
10446 case ISD::STRICT_LRINT:
10447 case ISD::STRICT_LLRINT: {
10448 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
10449 Op.getOperand(1).getValueType() == MVT::bf16) &&
10450 "Expected custom lowering of rounding operations only for f16");
10451 SDLoc DL(Op);
10452 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
10453 {Op.getOperand(0), Op.getOperand(1)});
10454 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
10455 {Ext.getValue(1), Ext.getValue(0)});
10456 }
10457 }
10458}
10459
10460 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
10461 SelectionDAG &DAG) {
10462 unsigned IntNo = N->getConstantOperandVal(0);
10463 unsigned Opc = 0;
10464 if (IntNo == Intrinsic::arm_smlald)
10465 Opc = ARMISD::SMLALD;
10466 else if (IntNo == Intrinsic::arm_smlaldx)
10467 Opc = ARMISD::SMLALDX;
10468 else if (IntNo == Intrinsic::arm_smlsld)
10469 Opc = ARMISD::SMLSLD;
10470 else if (IntNo == Intrinsic::arm_smlsldx)
10471 Opc = ARMISD::SMLSLDX;
10472 else
10473 return;
10474
10475 SDLoc dl(N);
10476 SDValue Lo, Hi;
10477 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10478
10479 SDValue LongMul = DAG.getNode(Opc, dl,
10480 DAG.getVTList(MVT::i32, MVT::i32),
10481 N->getOperand(1), N->getOperand(2),
10482 Lo, Hi);
10483 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10484 LongMul.getValue(0), LongMul.getValue(1)));
10485}
10486
10487/// ReplaceNodeResults - Replace the results of node with an illegal result
10488/// type with new values built out of custom code.
10489 void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
10490 SmallVectorImpl<SDValue> &Results,
10491 SelectionDAG &DAG) const {
10492 SDValue Res;
10493 switch (N->getOpcode()) {
10494 default:
10495 llvm_unreachable("Don't know how to custom expand this!");
10496 case ISD::READ_REGISTER:
10497 ExpandREAD_REGISTER(N, Results, DAG);
10498 break;
10499 case ISD::BITCAST:
10500 Res = ExpandBITCAST(N, DAG, Subtarget);
10501 break;
10502 case ISD::SRL:
10503 case ISD::SRA:
10504 case ISD::SHL:
10505 Res = Expand64BitShift(N, DAG, Subtarget);
10506 break;
10507 case ISD::SREM:
10508 case ISD::UREM:
10509 Res = LowerREM(N, DAG);
10510 break;
10511 case ISD::SDIVREM:
10512 case ISD::UDIVREM:
10513 Res = LowerDivRem(SDValue(N, 0), DAG);
10514 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10515 Results.push_back(Res.getValue(0));
10516 Results.push_back(Res.getValue(1));
10517 return;
10518 case ISD::SADDSAT:
10519 case ISD::SSUBSAT:
10520 case ISD::UADDSAT:
10521 case ISD::USUBSAT:
10522 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10523 break;
10524 case ISD::READCYCLECOUNTER:
10525 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10526 return;
10527 case ISD::UDIV:
10528 case ISD::SDIV:
10529 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
10530 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10531 Results);
10532 case ISD::ATOMIC_CMP_SWAP:
10533 ReplaceCMP_SWAP_64Results(N, Results, DAG);
10534 return;
10535 case ISD::INTRINSIC_WO_CHAIN:
10536 return ReplaceLongIntrinsic(N, Results, DAG);
10537 case ISD::LOAD:
10538 LowerLOAD(N, Results, DAG);
10539 break;
10540 case ISD::TRUNCATE:
10541 Res = LowerTruncate(N, DAG, Subtarget);
10542 break;
10543 case ISD::SIGN_EXTEND:
10544 case ISD::ZERO_EXTEND:
10545 Res = LowerVectorExtend(N, DAG, Subtarget);
10546 break;
10547 case ISD::FP_TO_SINT_SAT:
10548 case ISD::FP_TO_UINT_SAT:
10549 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10550 break;
10551 }
10552 if (Res.getNode())
10553 Results.push_back(Res);
10554}
10555
10556//===----------------------------------------------------------------------===//
10557// ARM Scheduler Hooks
10558//===----------------------------------------------------------------------===//
10559
10560/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10561/// registers the function context.
10562void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10563 MachineBasicBlock *MBB,
10564 MachineBasicBlock *DispatchBB,
10565 int FI) const {
10566 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10567 "ROPI/RWPI not currently supported with SjLj");
10568 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10569 DebugLoc dl = MI.getDebugLoc();
10570 MachineFunction *MF = MBB->getParent();
10571 MachineRegisterInfo *MRI = &MF->getRegInfo();
10572 MachineConstantPool *MCP = MF->getConstantPool();
10573 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
10574 const Function &F = MF->getFunction();
10575
10576 bool isThumb = Subtarget->isThumb();
10577 bool isThumb2 = Subtarget->isThumb2();
10578
10579 unsigned PCLabelId = AFI->createPICLabelUId();
10580 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10581 ARMConstantPoolValue *CPV =
10582 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10583 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10584
10585 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10586 : &ARM::GPRRegClass;
10587
10588 // Grab constant pool and fixed stack memory operands.
10589 MachineMemOperand *CPMMO =
10590 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
10591 MachineMemOperand::MOLoad, 4, Align(4));
10592
10593 MachineMemOperand *FIMMOSt =
10594 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
10595 MachineMemOperand::MOStore, 4, Align(4));
10596
10597 // Load the address of the dispatch MBB into the jump buffer.
10598 if (isThumb2) {
10599 // Incoming value: jbuf
10600 // ldr.n r5, LCPI1_1
10601 // orr r5, r5, #1
10602 // add r5, pc
10603 // str r5, [$jbuf, #+4] ; &jbuf[1]
10604 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10605 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10607 .addMemOperand(CPMMO)
10609 // Set the low bit because of thumb mode.
10610 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10611 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10612 .addReg(NewVReg1, RegState::Kill)
10613 .addImm(0x01)
10615 .add(condCodeOp());
10616 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10617 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10618 .addReg(NewVReg2, RegState::Kill)
10619 .addImm(PCLabelId);
10620 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10621 .addReg(NewVReg3, RegState::Kill)
10622 .addFrameIndex(FI)
10623 .addImm(36) // &jbuf[1] :: pc
10624 .addMemOperand(FIMMOSt)
10626 } else if (isThumb) {
10627 // Incoming value: jbuf
10628 // ldr.n r1, LCPI1_4
10629 // add r1, pc
10630 // mov r2, #1
10631 // orrs r1, r2
10632 // add r2, $jbuf, #+4 ; &jbuf[1]
10633 // str r1, [r2]
10634 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10635 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10637 .addMemOperand(CPMMO)
10639 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10640 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10641 .addReg(NewVReg1, RegState::Kill)
10642 .addImm(PCLabelId);
10643 // Set the low bit because of thumb mode.
10644 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10645 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10646 .addReg(ARM::CPSR, RegState::Define)
10647 .addImm(1)
10649 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10650 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10651 .addReg(ARM::CPSR, RegState::Define)
10652 .addReg(NewVReg2, RegState::Kill)
10653 .addReg(NewVReg3, RegState::Kill)
10655 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10656 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10657 .addFrameIndex(FI)
10658 .addImm(36); // &jbuf[1] :: pc
10659 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10660 .addReg(NewVReg4, RegState::Kill)
10661 .addReg(NewVReg5, RegState::Kill)
10662 .addImm(0)
10663 .addMemOperand(FIMMOSt)
10665 } else {
10666 // Incoming value: jbuf
10667 // ldr r1, LCPI1_1
10668 // add r1, pc, r1
10669 // str r1, [$jbuf, #+4] ; &jbuf[1]
10670 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10671 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10673 .addImm(0)
10674 .addMemOperand(CPMMO)
10676 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10677 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10678 .addReg(NewVReg1, RegState::Kill)
10679 .addImm(PCLabelId)
10681 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10682 .addReg(NewVReg2, RegState::Kill)
10683 .addFrameIndex(FI)
10684 .addImm(36) // &jbuf[1] :: pc
10685 .addMemOperand(FIMMOSt)
10687 }
10688}
10689
10690void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10691 MachineBasicBlock *MBB) const {
10692 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10693 DebugLoc dl = MI.getDebugLoc();
10694 MachineFunction *MF = MBB->getParent();
10695 MachineRegisterInfo *MRI = &MF->getRegInfo();
10696 MachineFrameInfo &MFI = MF->getFrameInfo();
10697 int FI = MFI.getFunctionContextIndex();
10698
10699 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10700 : &ARM::GPRnopcRegClass;
10701
10702 // Get a mapping of the call site numbers to all of the landing pads they're
10703 // associated with.
10704 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10705 unsigned MaxCSNum = 0;
10706 for (MachineBasicBlock &BB : *MF) {
10707 if (!BB.isEHPad())
10708 continue;
10709
10710 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10711 // pad.
10712 for (MachineInstr &II : BB) {
10713 if (!II.isEHLabel())
10714 continue;
10715
10716 MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10717 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10718
10719 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10720 for (unsigned Idx : CallSiteIdxs) {
10721 CallSiteNumToLPad[Idx].push_back(&BB);
10722 MaxCSNum = std::max(MaxCSNum, Idx);
10723 }
10724 break;
10725 }
10726 }
10727
10728 // Get an ordered list of the machine basic blocks for the jump table.
10729 std::vector<MachineBasicBlock*> LPadList;
10730 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10731 LPadList.reserve(CallSiteNumToLPad.size());
10732 for (unsigned I = 1; I <= MaxCSNum; ++I) {
10733 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10734 for (MachineBasicBlock *MBB : MBBList) {
10735 LPadList.push_back(MBB);
10736 InvokeBBs.insert_range(MBB->predecessors());
10737 }
10738 }
10739
10740 assert(!LPadList.empty() &&
10741 "No landing pad destinations for the dispatch jump table!");
10742
10743 // Create the jump table and associated information.
10744 MachineJumpTableInfo *JTI =
10745 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
10746 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
10747
10748 // Create the MBBs for the dispatch code.
10749
10750 // Shove the dispatch's address into the return slot in the function context.
10751 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
10752 DispatchBB->setIsEHPad();
10753
10754 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
10755
10756 BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
10757 DispatchBB->addSuccessor(TrapBB);
10758
10759 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
10760 DispatchBB->addSuccessor(DispContBB);
10761
10762 // Insert the MBBs.
10763 MF->insert(MF->end(), DispatchBB);
10764 MF->insert(MF->end(), DispContBB);
10765 MF->insert(MF->end(), TrapBB);
10766
10767 // Insert code into the entry block that creates and registers the function
10768 // context.
10769 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
10770
10771 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
10772 MachinePointerInfo::getFixedStack(*MF, FI),
10773 MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4));
10774
10775 MachineInstrBuilder MIB;
10776 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
10777
10778 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
10779 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
10780
10781 // Add a register mask with no preserved registers. This results in all
10782 // registers being marked as clobbered. This can't work if the dispatch block
10783 // is in a Thumb1 function and is linked with ARM code which uses the FP
10784 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
10785 MIB.addRegMask(RI.getNoPreservedMask());
10786
10787 bool IsPositionIndependent = isPositionIndependent();
10788 unsigned NumLPads = LPadList.size();
10789 if (Subtarget->isThumb2()) {
10790 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10791 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
10792 .addFrameIndex(FI)
10793 .addImm(4)
10794 .addMemOperand(FIMMOLd)
10796
10797 if (NumLPads < 256) {
10798 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
10799 .addReg(NewVReg1)
10800 .addImm(LPadList.size())
10802 } else {
10803 Register VReg1 = MRI->createVirtualRegister(TRC);
10804 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
10805 .addImm(NumLPads & 0xFFFF)
10807
10808 unsigned VReg2 = VReg1;
10809 if ((NumLPads & 0xFFFF0000) != 0) {
10810 VReg2 = MRI->createVirtualRegister(TRC);
10811 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
10812 .addReg(VReg1)
10813 .addImm(NumLPads >> 16)
10815 }
10816
10817 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
10818 .addReg(NewVReg1)
10819 .addReg(VReg2)
10821 }
10822
10823 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
10824 .addMBB(TrapBB)
10826 .addReg(ARM::CPSR);
10827
10828 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10829 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
10830 .addJumpTableIndex(MJTI)
10832
10833 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10834 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
10835 .addReg(NewVReg3, RegState::Kill)
10836 .addReg(NewVReg1)
10839 .add(condCodeOp());
10840
10841 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
10842 .addReg(NewVReg4, RegState::Kill)
10843 .addReg(NewVReg1)
10844 .addJumpTableIndex(MJTI);
10845 } else if (Subtarget->isThumb()) {
10846 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10847 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
10848 .addFrameIndex(FI)
10849 .addImm(1)
10850 .addMemOperand(FIMMOLd)
10852
10853 if (NumLPads < 256) {
10854 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
10855 .addReg(NewVReg1)
10856 .addImm(NumLPads)
10858 } else {
10859 MachineConstantPool *ConstantPool = MF->getConstantPool();
10860 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
10861 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
10862
10863 // MachineConstantPool wants an explicit alignment.
10864 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
10865 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
10866
10867 Register VReg1 = MRI->createVirtualRegister(TRC);
10868 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
10869 .addReg(VReg1, RegState::Define)
10872 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
10873 .addReg(NewVReg1)
10874 .addReg(VReg1)
10876 }
10877
10878 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
10879 .addMBB(TrapBB)
10881 .addReg(ARM::CPSR);
10882
10883 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10884 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
10885 .addReg(ARM::CPSR, RegState::Define)
10886 .addReg(NewVReg1)
10887 .addImm(2)
10889
10890 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10891 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
10892 .addJumpTableIndex(MJTI)
10894
10895 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10896 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
10897 .addReg(ARM::CPSR, RegState::Define)
10898 .addReg(NewVReg2, RegState::Kill)
10899 .addReg(NewVReg3)
10901
10902 MachineMemOperand *JTMMOLd =
10903 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
10905
10906 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10907 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
10908 .addReg(NewVReg4, RegState::Kill)
10909 .addImm(0)
10910 .addMemOperand(JTMMOLd)
10912
10913 unsigned NewVReg6 = NewVReg5;
10914 if (IsPositionIndependent) {
10915 NewVReg6 = MRI->createVirtualRegister(TRC);
10916 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
10917 .addReg(ARM::CPSR, RegState::Define)
10918 .addReg(NewVReg5, RegState::Kill)
10919 .addReg(NewVReg3)
10921 }
10922
10923 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
10924 .addReg(NewVReg6, RegState::Kill)
10925 .addJumpTableIndex(MJTI);
10926 } else {
10927 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10928 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
10929 .addFrameIndex(FI)
10930 .addImm(4)
10931 .addMemOperand(FIMMOLd)
10933
10934 if (NumLPads < 256) {
10935 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
10936 .addReg(NewVReg1)
10937 .addImm(NumLPads)
10939 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
10940 Register VReg1 = MRI->createVirtualRegister(TRC);
10941 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
10942 .addImm(NumLPads & 0xFFFF)
10944
10945 unsigned VReg2 = VReg1;
10946 if ((NumLPads & 0xFFFF0000) != 0) {
10947 VReg2 = MRI->createVirtualRegister(TRC);
10948 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
10949 .addReg(VReg1)
10950 .addImm(NumLPads >> 16)
10952 }
10953
10954 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
10955 .addReg(NewVReg1)
10956 .addReg(VReg2)
10958 } else {
10959 MachineConstantPool *ConstantPool = MF->getConstantPool();
10960 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
10961 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
10962
10963 // MachineConstantPool wants an explicit alignment.
10964 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
10965 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
10966
10967 Register VReg1 = MRI->createVirtualRegister(TRC);
10968 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
10969 .addReg(VReg1, RegState::Define)
10971 .addImm(0)
10973 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
10974 .addReg(NewVReg1)
10975 .addReg(VReg1, RegState::Kill)
10977 }
10978
10979 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
10980 .addMBB(TrapBB)
10982 .addReg(ARM::CPSR);
10983
10984 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10985 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
10986 .addReg(NewVReg1)
10989 .add(condCodeOp());
10990 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10991 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
10992 .addJumpTableIndex(MJTI)
10994
10995 MachineMemOperand *JTMMOLd =
10996 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
10998 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10999 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11000 .addReg(NewVReg3, RegState::Kill)
11001 .addReg(NewVReg4)
11002 .addImm(0)
11003 .addMemOperand(JTMMOLd)
11005
11006 if (IsPositionIndependent) {
11007 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11008 .addReg(NewVReg5, RegState::Kill)
11009 .addReg(NewVReg4)
11010 .addJumpTableIndex(MJTI);
11011 } else {
11012 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11013 .addReg(NewVReg5, RegState::Kill)
11014 .addJumpTableIndex(MJTI);
11015 }
11016 }
11017
11018 // Add the jump table entries as successors to the MBB.
11019 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
11020 for (MachineBasicBlock *CurMBB : LPadList) {
11021 if (SeenMBBs.insert(CurMBB).second)
11022 DispContBB->addSuccessor(CurMBB);
11023 }
11024
11025 // N.B. the order the invoke BBs are processed in doesn't matter here.
11026 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11028 for (MachineBasicBlock *BB : InvokeBBs) {
11029
11030 // Remove the landing pad successor from the invoke block and replace it
11031 // with the new dispatch block.
11032 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11033 while (!Successors.empty()) {
11034 MachineBasicBlock *SMBB = Successors.pop_back_val();
11035 if (SMBB->isEHPad()) {
11036 BB->removeSuccessor(SMBB);
11037 MBBLPads.push_back(SMBB);
11038 }
11039 }
11040
11041 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11042 BB->normalizeSuccProbs();
11043
11044 // Find the invoke call and mark all of the callee-saved registers as
11045 // 'implicitly defined' so that they're spilled. This prevents code from
11046 // moving instructions to before the EH block, where they will never be
11047 // executed.
11049 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11050 if (!II->isCall()) continue;
11051
11052 DenseSet<unsigned> DefRegs;
11054 OI = II->operands_begin(), OE = II->operands_end();
11055 OI != OE; ++OI) {
11056 if (!OI->isReg()) continue;
11057 DefRegs.insert(OI->getReg());
11058 }
11059
11060 MachineInstrBuilder MIB(*MF, &*II);
11061
11062 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11063 unsigned Reg = SavedRegs[i];
11064 if (Subtarget->isThumb2() &&
11065 !ARM::tGPRRegClass.contains(Reg) &&
11066 !ARM::hGPRRegClass.contains(Reg))
11067 continue;
11068 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11069 continue;
11070 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11071 continue;
11072 if (!DefRegs.contains(Reg))
11074 }
11075
11076 break;
11077 }
11078 }
11079
11080 // Mark all former landing pads as non-landing pads. The dispatch is the only
11081 // landing pad now.
11082 for (MachineBasicBlock *MBBLPad : MBBLPads)
11083 MBBLPad->setIsEHPad(false);
11084
11085 // The instruction is gone now.
11086 MI.eraseFromParent();
11087}
11088
11089static
11091 for (MachineBasicBlock *S : MBB->successors())
11092 if (S != Succ)
11093 return S;
11094 llvm_unreachable("Expecting a BB with two successors!");
11095}
11096
11097 /// Return the load opcode for a given load size. If the load size is >= 8, a
11098 /// NEON opcode will be returned.
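/// (Illustrative note, derived from the mapping below: e.g. a 4-byte load in
/// Thumb-2 selects ARM::t2LDR_POST, while a 16-byte unit selects the NEON
/// ARM::VLD1q32wb_fixed with address write-back.)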
11099static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11100 if (LdSize >= 8)
11101 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11102 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11103 if (IsThumb1)
11104 return LdSize == 4 ? ARM::tLDRi
11105 : LdSize == 2 ? ARM::tLDRHi
11106 : LdSize == 1 ? ARM::tLDRBi : 0;
11107 if (IsThumb2)
11108 return LdSize == 4 ? ARM::t2LDR_POST
11109 : LdSize == 2 ? ARM::t2LDRH_POST
11110 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11111 return LdSize == 4 ? ARM::LDR_POST_IMM
11112 : LdSize == 2 ? ARM::LDRH_POST
11113 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11114}
11115
11116 /// Return the store opcode for a given store size. If the store size is >= 8, a
11117 /// NEON opcode will be returned.
11118static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11119 if (StSize >= 8)
11120 return StSize == 16 ? ARM::VST1q32wb_fixed
11121 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11122 if (IsThumb1)
11123 return StSize == 4 ? ARM::tSTRi
11124 : StSize == 2 ? ARM::tSTRHi
11125 : StSize == 1 ? ARM::tSTRBi : 0;
11126 if (IsThumb2)
11127 return StSize == 4 ? ARM::t2STR_POST
11128 : StSize == 2 ? ARM::t2STRH_POST
11129 : StSize == 1 ? ARM::t2STRB_POST : 0;
11130 return StSize == 4 ? ARM::STR_POST_IMM
11131 : StSize == 2 ? ARM::STRH_POST
11132 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11133}
11134
11135 /// Emit a post-increment load operation with the given size. The instructions
11136 /// will be added to BB at Pos.
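/// (Note: Thumb1 has no post-indexed addressing mode, so in that case the
/// address update is emitted as a separate tADDi8 after the plain load; see
/// the IsThumb1 path below.)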
11138 const TargetInstrInfo *TII, const DebugLoc &dl,
11139 unsigned LdSize, unsigned Data, unsigned AddrIn,
11140 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11141 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11142 assert(LdOpc != 0 && "Should have a load opcode");
11143 if (LdSize >= 8) {
11144 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11145 .addReg(AddrOut, RegState::Define)
11146 .addReg(AddrIn)
11147 .addImm(0)
11149 } else if (IsThumb1) {
11150 // load + update AddrIn
11151 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11152 .addReg(AddrIn)
11153 .addImm(0)
11155 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11156 .add(t1CondCodeOp())
11157 .addReg(AddrIn)
11158 .addImm(LdSize)
11160 } else if (IsThumb2) {
11161 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11162 .addReg(AddrOut, RegState::Define)
11163 .addReg(AddrIn)
11164 .addImm(LdSize)
11166 } else { // arm
11167 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11168 .addReg(AddrOut, RegState::Define)
11169 .addReg(AddrIn)
11170 .addReg(0)
11171 .addImm(LdSize)
11173 }
11174}
11175
11176 /// Emit a post-increment store operation with the given size. The instructions
11177 /// will be added to BB at Pos.
11179 const TargetInstrInfo *TII, const DebugLoc &dl,
11180 unsigned StSize, unsigned Data, unsigned AddrIn,
11181 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11182 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11183 assert(StOpc != 0 && "Should have a store opcode");
11184 if (StSize >= 8) {
11185 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11186 .addReg(AddrIn)
11187 .addImm(0)
11188 .addReg(Data)
11190 } else if (IsThumb1) {
11191 // store + update AddrIn
11192 BuildMI(*BB, Pos, dl, TII->get(StOpc))
11193 .addReg(Data)
11194 .addReg(AddrIn)
11195 .addImm(0)
11197 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11198 .add(t1CondCodeOp())
11199 .addReg(AddrIn)
11200 .addImm(StSize)
11202 } else if (IsThumb2) {
11203 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11204 .addReg(Data)
11205 .addReg(AddrIn)
11206 .addImm(StSize)
11208 } else { // arm
11209 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11210 .addReg(Data)
11211 .addReg(AddrIn)
11212 .addReg(0)
11213 .addImm(StSize)
11215 }
11216}
11217
11219ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11220 MachineBasicBlock *BB) const {
11221 // This pseudo instruction has 4 operands: dst, src, size, alignment.
11222 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11223 // Otherwise, we will generate unrolled scalar copies.
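// As a rough illustration: a 10-byte copy with 4-byte alignment and a size
// below the threshold unrolls into two word-sized post-increment load/store
// pairs followed by two single-byte copies for the remainder.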
11224 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11225 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11227
11228 Register dest = MI.getOperand(0).getReg();
11229 Register src = MI.getOperand(1).getReg();
11230 unsigned SizeVal = MI.getOperand(2).getImm();
11231 unsigned Alignment = MI.getOperand(3).getImm();
11232 DebugLoc dl = MI.getDebugLoc();
11233
11234 MachineFunction *MF = BB->getParent();
11235 MachineRegisterInfo &MRI = MF->getRegInfo();
11236 unsigned UnitSize = 0;
11237 const TargetRegisterClass *TRC = nullptr;
11238 const TargetRegisterClass *VecTRC = nullptr;
11239
11240 bool IsThumb1 = Subtarget->isThumb1Only();
11241 bool IsThumb2 = Subtarget->isThumb2();
11242 bool IsThumb = Subtarget->isThumb();
11243
11244 if (Alignment & 1) {
11245 UnitSize = 1;
11246 } else if (Alignment & 2) {
11247 UnitSize = 2;
11248 } else {
11249 // Check whether we can use NEON instructions.
11250 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11251 Subtarget->hasNEON()) {
11252 if ((Alignment % 16 == 0) && SizeVal >= 16)
11253 UnitSize = 16;
11254 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11255 UnitSize = 8;
11256 }
11257 // Can't use NEON instructions.
11258 if (UnitSize == 0)
11259 UnitSize = 4;
11260 }
11261
11262 // Select the correct opcode and register class for unit size load/store
11263 bool IsNeon = UnitSize >= 8;
11264 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11265 if (IsNeon)
11266 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11267 : UnitSize == 8 ? &ARM::DPRRegClass
11268 : nullptr;
11269
11270 unsigned BytesLeft = SizeVal % UnitSize;
11271 unsigned LoopSize = SizeVal - BytesLeft;
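// e.g. SizeVal = 35 with UnitSize = 16 gives LoopSize = 32 and BytesLeft = 3.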
11272
11273 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11274 // Use LDR and STR to copy.
11275 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11276 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11277 unsigned srcIn = src;
11278 unsigned destIn = dest;
11279 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11280 Register srcOut = MRI.createVirtualRegister(TRC);
11281 Register destOut = MRI.createVirtualRegister(TRC);
11282 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11283 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11284 IsThumb1, IsThumb2);
11285 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11286 IsThumb1, IsThumb2);
11287 srcIn = srcOut;
11288 destIn = destOut;
11289 }
11290
11291 // Handle the leftover bytes with LDRB and STRB.
11292 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11293 // [destOut] = STRB_POST(scratch, destIn, 1)
11294 for (unsigned i = 0; i < BytesLeft; i++) {
11295 Register srcOut = MRI.createVirtualRegister(TRC);
11296 Register destOut = MRI.createVirtualRegister(TRC);
11297 Register scratch = MRI.createVirtualRegister(TRC);
11298 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11299 IsThumb1, IsThumb2);
11300 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11301 IsThumb1, IsThumb2);
11302 srcIn = srcOut;
11303 destIn = destOut;
11304 }
11305 MI.eraseFromParent(); // The instruction is gone now.
11306 return BB;
11307 }
11308
11309 // Expand the pseudo op to a loop.
11310 // thisMBB:
11311 // ...
11312 // movw varEnd, # --> with thumb2
11313 // movt varEnd, #
11314 // ldrcp varEnd, idx --> without thumb2
11315 // fallthrough --> loopMBB
11316 // loopMBB:
11317 // PHI varPhi, varEnd, varLoop
11318 // PHI srcPhi, src, srcLoop
11319 // PHI destPhi, dst, destLoop
11320 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11321 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11322 // subs varLoop, varPhi, #UnitSize
11323 // bne loopMBB
11324 // fallthrough --> exitMBB
11325 // exitMBB:
11326 // epilogue to handle left-over bytes
11327 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11328 // [destOut] = STRB_POST(scratch, destLoop, 1)
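// (The loop below therefore executes LoopSize / UnitSize times: varEnd is
// initialised to LoopSize and the counter is decremented by UnitSize per
// iteration until it reaches zero.)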
11329 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11330 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11331 MF->insert(It, loopMBB);
11332 MF->insert(It, exitMBB);
11333
11334 // Set the call frame size on entry to the new basic blocks.
11335 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11336 loopMBB->setCallFrameSize(CallFrameSize);
11337 exitMBB->setCallFrameSize(CallFrameSize);
11338
11339 // Transfer the remainder of BB and its successor edges to exitMBB.
11340 exitMBB->splice(exitMBB->begin(), BB,
11341 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11343
11344 // Load an immediate to varEnd.
11345 Register varEnd = MRI.createVirtualRegister(TRC);
11346 if (Subtarget->useMovt()) {
11347 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11348 varEnd)
11349 .addImm(LoopSize);
11350 } else if (Subtarget->genExecuteOnly()) {
11351 assert(IsThumb && "Non-thumb expected to have used movt");
11352 BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11353 } else {
11354 MachineConstantPool *ConstantPool = MF->getConstantPool();
11356 const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11357
11358 // MachineConstantPool wants an explicit alignment.
11359 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11360 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11361 MachineMemOperand *CPMMO =
11364
11365 if (IsThumb)
11366 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11367 .addReg(varEnd, RegState::Define)
11370 .addMemOperand(CPMMO);
11371 else
11372 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11373 .addReg(varEnd, RegState::Define)
11375 .addImm(0)
11377 .addMemOperand(CPMMO);
11378 }
11379 BB->addSuccessor(loopMBB);
11380
11381 // Generate the loop body:
11382 // varPhi = PHI(varLoop, varEnd)
11383 // srcPhi = PHI(srcLoop, src)
11384 // destPhi = PHI(destLoop, dst)
11385 MachineBasicBlock *entryBB = BB;
11386 BB = loopMBB;
11387 Register varLoop = MRI.createVirtualRegister(TRC);
11388 Register varPhi = MRI.createVirtualRegister(TRC);
11389 Register srcLoop = MRI.createVirtualRegister(TRC);
11390 Register srcPhi = MRI.createVirtualRegister(TRC);
11391 Register destLoop = MRI.createVirtualRegister(TRC);
11392 Register destPhi = MRI.createVirtualRegister(TRC);
11393
11394 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11395 .addReg(varLoop).addMBB(loopMBB)
11396 .addReg(varEnd).addMBB(entryBB);
11397 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11398 .addReg(srcLoop).addMBB(loopMBB)
11399 .addReg(src).addMBB(entryBB);
11400 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11401 .addReg(destLoop).addMBB(loopMBB)
11402 .addReg(dest).addMBB(entryBB);
11403
11404 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11405 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11406 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11407 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11408 IsThumb1, IsThumb2);
11409 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11410 IsThumb1, IsThumb2);
11411
11412 // Decrement loop variable by UnitSize.
11413 if (IsThumb1) {
11414 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11415 .add(t1CondCodeOp())
11416 .addReg(varPhi)
11417 .addImm(UnitSize)
11419 } else {
11420 MachineInstrBuilder MIB =
11421 BuildMI(*BB, BB->end(), dl,
11422 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11423 MIB.addReg(varPhi)
11424 .addImm(UnitSize)
11426 .add(condCodeOp());
11427 MIB->getOperand(5).setReg(ARM::CPSR);
11428 MIB->getOperand(5).setIsDef(true);
11429 }
11430 BuildMI(*BB, BB->end(), dl,
11431 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11432 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11433
11434 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11435 BB->addSuccessor(loopMBB);
11436 BB->addSuccessor(exitMBB);
11437
11438 // Add epilogue to handle BytesLeft.
11439 BB = exitMBB;
11440 auto StartOfExit = exitMBB->begin();
11441
11442 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11443 // [destOut] = STRB_POST(scratch, destLoop, 1)
11444 unsigned srcIn = srcLoop;
11445 unsigned destIn = destLoop;
11446 for (unsigned i = 0; i < BytesLeft; i++) {
11447 Register srcOut = MRI.createVirtualRegister(TRC);
11448 Register destOut = MRI.createVirtualRegister(TRC);
11449 Register scratch = MRI.createVirtualRegister(TRC);
11450 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11451 IsThumb1, IsThumb2);
11452 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11453 IsThumb1, IsThumb2);
11454 srcIn = srcOut;
11455 destIn = destOut;
11456 }
11457
11458 MI.eraseFromParent(); // The instruction is gone now.
11459 return BB;
11460}
11461
11463ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11464 MachineBasicBlock *MBB) const {
11465 const TargetMachine &TM = getTargetMachine();
11466 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11467 DebugLoc DL = MI.getDebugLoc();
11468
11469 assert(Subtarget->isTargetWindows() &&
11470 "__chkstk is only supported on Windows");
11471 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11472
11473 // __chkstk takes the number of words to allocate on the stack in R4, and
11474 // returns the stack adjustment in number of bytes in R4. This will not
11475 // clobber any other registers (other than the obvious lr).
11476 //
11477 // Although, technically, IP should be considered a register which may be
11478 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11479 // thumb-2 environment, so there is no interworking required. As a result, we
11480 // do not expect a veneer to be emitted by the linker, clobbering IP.
11481 //
11482 // Each module receives its own copy of __chkstk, so no import thunk is
11483 // required, again, ensuring that IP is not clobbered.
11484 //
11485 // Finally, although some linkers may theoretically provide a trampoline for
11486 // out of range calls (which is quite common due to a 32M range limitation of
11487 // branches for Thumb), we can generate the long-call version via
11488 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11489 // IP.
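// In short: on entry R4 holds the allocation size in words; on return R4
// holds the adjustment in bytes, which the t2SUBrr emitted below subtracts
// from SP.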
11490
11491 switch (TM.getCodeModel()) {
11492 case CodeModel::Tiny:
11493 llvm_unreachable("Tiny code model not available on ARM.");
11494 case CodeModel::Small:
11495 case CodeModel::Medium:
11496 case CodeModel::Kernel:
11497 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11499 .addExternalSymbol("__chkstk")
11502 .addReg(ARM::R12,
11504 .addReg(ARM::CPSR,
11506 break;
11507 case CodeModel::Large: {
11508 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
11509 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11510
11511 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11512 .addExternalSymbol("__chkstk");
11518 .addReg(ARM::R12,
11520 .addReg(ARM::CPSR,
11522 break;
11523 }
11524 }
11525
11526 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11527 .addReg(ARM::SP, RegState::Kill)
11528 .addReg(ARM::R4, RegState::Kill)
11531 .add(condCodeOp());
11532
11533 MI.eraseFromParent();
11534 return MBB;
11535}
11536
11538ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11539 MachineBasicBlock *MBB) const {
11540 DebugLoc DL = MI.getDebugLoc();
11541 MachineFunction *MF = MBB->getParent();
11542 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11543
11544 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
11545 MF->insert(++MBB->getIterator(), ContBB);
11546 ContBB->splice(ContBB->begin(), MBB,
11547 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11549 MBB->addSuccessor(ContBB);
11550
11551 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11552 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11553 MF->push_back(TrapBB);
11554 MBB->addSuccessor(TrapBB);
11555
11556 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11557 .addReg(MI.getOperand(0).getReg())
11558 .addImm(0)
11560 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11561 .addMBB(TrapBB)
11563 .addReg(ARM::CPSR);
11564
11565 MI.eraseFromParent();
11566 return ContBB;
11567}
11568
11569// The CPSR operand of SelectItr might be missing a kill marker
11570// because there were multiple uses of CPSR, and ISel didn't know
11571// which to mark. Figure out whether SelectItr should have had a
11572// kill marker, and set it if it should. Returns the correct kill
11573// marker value.
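// (For example, when lowering tMOVCCr_pseudo below, a false return here means
// CPSR is still live across the select and must be added as a live-in to the
// newly created copy0MBB and sinkMBB blocks.)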
11576 const TargetRegisterInfo* TRI) {
11577 // Scan forward through BB for a use/def of CPSR.
11578 MachineBasicBlock::iterator miI(std::next(SelectItr));
11579 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11580 const MachineInstr& mi = *miI;
11581 if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11582 return false;
11583 if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11584 break; // Should have kill-flag - update below.
11585 }
11586
11587 // If we hit the end of the block, check whether CPSR is live into a
11588 // successor.
11589 if (miI == BB->end()) {
11590 for (MachineBasicBlock *Succ : BB->successors())
11591 if (Succ->isLiveIn(ARM::CPSR))
11592 return false;
11593 }
11594
11595 // We found a def, or hit the end of the basic block and CPSR wasn't live
11596 // out. SelectMI should have a kill flag on CPSR.
11597 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11598 return true;
11599}
11600
11601 /// Adds logic in the loop entry MBB to calculate the loop iteration count and
11602 /// adds t2WhileLoopSetup and t2WhileLoopStart to generate a WLS loop.
11604 MachineBasicBlock *TpLoopBody,
11605 MachineBasicBlock *TpExit, Register OpSizeReg,
11606 const TargetInstrInfo *TII, DebugLoc Dl,
11608 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
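// e.g. an element count of 100 gives (100 + 15) >> 4 = 7 iterations, with
// the final iteration tail-predicated to cover the remaining 4 elements.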
11609 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11610 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11611 .addUse(OpSizeReg)
11612 .addImm(15)
11614 .addReg(0);
11615
11616 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11617 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11618 .addUse(AddDestReg, RegState::Kill)
11619 .addImm(4)
11621 .addReg(0);
11622
11623 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11624 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11625 .addUse(LsrDestReg, RegState::Kill);
11626
11627 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11628 .addUse(TotalIterationsReg)
11629 .addMBB(TpExit);
11630
11631 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11632 .addMBB(TpLoopBody)
11634
11635 return TotalIterationsReg;
11636}
11637
11638 /// Adds logic in the loopBody MBB to generate MVE_VCTP, t2LoopDec and
11639 /// t2LoopEnd. These are used by later passes to generate tail-predicated
11640 /// loops.
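/// (Roughly, the emitted body consists of PHIs for the source/dest pointers
/// and the two counters, an MVE_VCTP8 that builds the predicate from the
/// remaining element count, predicated post-incrementing VLDRB/VSTRB, and
/// finally t2LoopDec/t2LoopEnd.)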
11641static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11642 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11643 const TargetInstrInfo *TII, DebugLoc Dl,
11644 MachineRegisterInfo &MRI, Register OpSrcReg,
11645 Register OpDestReg, Register ElementCountReg,
11646 Register TotalIterationsReg, bool IsMemcpy) {
11647 // First insert 4 PHI nodes: the current positions in the Src (if memcpy) and
11648 // Dest arrays, the loop iteration counter, and the predication counter.
11649
11650 Register SrcPhiReg, CurrSrcReg;
11651 if (IsMemcpy) {
11652 // Current position in the src array
11653 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11654 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11655 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11656 .addUse(OpSrcReg)
11657 .addMBB(TpEntry)
11658 .addUse(CurrSrcReg)
11659 .addMBB(TpLoopBody);
11660 }
11661
11662 // Current position in the dest array
11663 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11664 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11665 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11666 .addUse(OpDestReg)
11667 .addMBB(TpEntry)
11668 .addUse(CurrDestReg)
11669 .addMBB(TpLoopBody);
11670
11671 // Current loop counter
11672 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11673 Register RemainingLoopIterationsReg =
11674 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11675 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11676 .addUse(TotalIterationsReg)
11677 .addMBB(TpEntry)
11678 .addUse(RemainingLoopIterationsReg)
11679 .addMBB(TpLoopBody);
11680
11681 // Predication counter
11682 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11683 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11684 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11685 .addUse(ElementCountReg)
11686 .addMBB(TpEntry)
11687 .addUse(RemainingElementsReg)
11688 .addMBB(TpLoopBody);
11689
11690 // Pass predication counter to VCTP
11691 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11692 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11693 .addUse(PredCounterPhiReg)
11695 .addReg(0)
11696 .addReg(0);
11697
11698 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11699 .addUse(PredCounterPhiReg)
11700 .addImm(16)
11702 .addReg(0);
11703
11704 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11705 Register SrcValueReg;
11706 if (IsMemcpy) {
11707 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11708 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11709 .addDef(CurrSrcReg)
11710 .addDef(SrcValueReg)
11711 .addReg(SrcPhiReg)
11712 .addImm(16)
11714 .addUse(VccrReg)
11715 .addReg(0);
11716 } else
11717 SrcValueReg = OpSrcReg;
11718
11719 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11720 .addDef(CurrDestReg)
11721 .addUse(SrcValueReg)
11722 .addReg(DestPhiReg)
11723 .addImm(16)
11725 .addUse(VccrReg)
11726 .addReg(0);
11727
11728 // Add the pseudo instructions for decrementing the loop counter and marking
11729 // the end: t2LoopDec and t2LoopEnd.
11730 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11731 .addUse(LoopCounterPhiReg)
11732 .addImm(1);
11733
11734 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
11735 .addUse(RemainingLoopIterationsReg)
11736 .addMBB(TpLoopBody);
11737
11738 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
11739 .addMBB(TpExit)
11741}
11742
11744 // KCFI is supported in all ARM/Thumb modes
11745 return true;
11746}
11747
11751 const TargetInstrInfo *TII) const {
11752 assert(MBBI->isCall() && MBBI->getCFIType() &&
11753 "Invalid call instruction for a KCFI check");
11754
11755 MachineOperand *TargetOp = nullptr;
11756 switch (MBBI->getOpcode()) {
11757 // ARM mode opcodes
11758 case ARM::BLX:
11759 case ARM::BLX_pred:
11760 case ARM::BLX_noip:
11761 case ARM::BLX_pred_noip:
11762 case ARM::BX_CALL:
11763 TargetOp = &MBBI->getOperand(0);
11764 break;
11765 case ARM::TCRETURNri:
11766 case ARM::TCRETURNrinotr12:
11767 case ARM::TAILJMPr:
11768 case ARM::TAILJMPr4:
11769 TargetOp = &MBBI->getOperand(0);
11770 break;
11771 // Thumb mode opcodes (Thumb1 and Thumb2)
11772 // Note: Most Thumb call instructions have predicate operands before the
11773 // target register. Format: tBLXr pred, predreg, target_register, ...
11774 case ARM::tBLXr: // Thumb1/Thumb2: BLX register (requires V5T)
11775 case ARM::tBLXr_noip: // Thumb1/Thumb2: BLX register, no IP clobber
11776 case ARM::tBX_CALL: // Thumb1 only: BX call (push LR, BX)
11777 TargetOp = &MBBI->getOperand(2);
11778 break;
11779 // Tail call instructions don't have predicates, target is operand 0
11780 case ARM::tTAILJMPr: // Thumb1/Thumb2: Tail call via register
11781 TargetOp = &MBBI->getOperand(0);
11782 break;
11783 default:
11784 llvm_unreachable("Unexpected CFI call opcode");
11785 }
11786
11787 assert(TargetOp && TargetOp->isReg() && "Invalid target operand");
11788 TargetOp->setIsRenamable(false);
11789
11790 // Select the appropriate KCFI_CHECK variant based on the instruction set
11791 unsigned KCFICheckOpcode;
11792 if (Subtarget->isThumb()) {
11793 if (Subtarget->isThumb2()) {
11794 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb2;
11795 } else {
11796 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb1;
11797 }
11798 } else {
11799 KCFICheckOpcode = ARM::KCFI_CHECK_ARM;
11800 }
11801
11802 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(KCFICheckOpcode))
11803 .addReg(TargetOp->getReg())
11804 .addImm(MBBI->getCFIType())
11805 .getInstr();
11806}
11807
11810 MachineBasicBlock *BB) const {
11811 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11812 DebugLoc dl = MI.getDebugLoc();
11813 bool isThumb2 = Subtarget->isThumb2();
11814 switch (MI.getOpcode()) {
11815 default: {
11816 MI.print(errs());
11817 llvm_unreachable("Unexpected instr type to insert");
11818 }
11819
11820 // Thumb1 post-indexed loads are really just single-register LDMs.
11821 case ARM::tLDR_postidx: {
11822 MachineOperand Def(MI.getOperand(1));
11823 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
11824 .add(Def) // Rn_wb
11825 .add(MI.getOperand(2)) // Rn
11826 .add(MI.getOperand(3)) // PredImm
11827 .add(MI.getOperand(4)) // PredReg
11828 .add(MI.getOperand(0)) // Rt
11829 .cloneMemRefs(MI);
11830 MI.eraseFromParent();
11831 return BB;
11832 }
11833
11834 case ARM::MVE_MEMCPYLOOPINST:
11835 case ARM::MVE_MEMSETLOOPINST: {
11836
11837 // The transformation below expands the MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST
11838 // pseudo into a Tail Predicated (TP) loop. It adds the instructions to
11839 // calculate the iteration count = ceil(size_in_bytes / 16) in the TP entry
11840 // block and adds the relevant instructions in the TP loop body for the
11841 // generation of a WLSTP loop.
11842
11843 // Below is relevant portion of the CFG after the transformation.
11844 // The Machine Basic Blocks are shown along with branch conditions (in
11845 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
11846 // portion of the CFG and may not necessarily be the entry/exit of the
11847 // function.
11848
11849 // (Relevant) CFG after transformation:
11850 // TP entry MBB
11851 // |
11852 // |-----------------|
11853 // (n <= 0) (n > 0)
11854 // | |
11855 // | TP loop Body MBB<--|
11856 // | | |
11857 // \ |___________|
11858 // \ /
11859 // TP exit MBB
11860
11861 MachineFunction *MF = BB->getParent();
11862 MachineFunctionProperties &Properties = MF->getProperties();
11864
11865 Register OpDestReg = MI.getOperand(0).getReg();
11866 Register OpSrcReg = MI.getOperand(1).getReg();
11867 Register OpSizeReg = MI.getOperand(2).getReg();
11868
11869 // Allocate the required MBBs and add to parent function.
11870 MachineBasicBlock *TpEntry = BB;
11871 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
11872 MachineBasicBlock *TpExit;
11873
11874 MF->push_back(TpLoopBody);
11875
11876 // If any instructions are present in the current block after
11877 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
11878 // move the instructions into the newly created exit block. If there are no
11879 // instructions add an explicit branch to the FallThrough block and then
11880 // split.
11881 //
11882 // The split is required for two reasons:
11883 // 1) A terminator (t2WhileLoopStart) will be placed at that site.
11884 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
11885 // need to be updated. splitAt() already handles this.
11886 TpExit = BB->splitAt(MI, false);
11887 if (TpExit == BB) {
11888 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
11889 "block containing memcpy/memset Pseudo");
11890 TpExit = BB->getFallThrough();
11891 BuildMI(BB, dl, TII->get(ARM::t2B))
11892 .addMBB(TpExit)
11894 TpExit = BB->splitAt(MI, false);
11895 }
11896
11897 // Add logic for iteration count
11898 Register TotalIterationsReg =
11899 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
11900
11901 // Add the vectorized (and predicated) loads/store instructions
11902 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
11903 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
11904 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
11905
11906 // Required to avoid conflict with the MachineVerifier during testing.
11907 Properties.resetNoPHIs();
11908
11909 // Connect the blocks
11910 TpEntry->addSuccessor(TpLoopBody);
11911 TpLoopBody->addSuccessor(TpLoopBody);
11912 TpLoopBody->addSuccessor(TpExit);
11913
11914 // Reorder for a more natural layout
11915 TpLoopBody->moveAfter(TpEntry);
11916 TpExit->moveAfter(TpLoopBody);
11917
11918 // Finally, remove the memcpy Pseudo Instruction
11919 MI.eraseFromParent();
11920
11921 // Return the exit block as it may contain other instructions requiring a
11922 // custom inserter
11923 return TpExit;
11924 }
11925
11926 // The Thumb2 pre-indexed stores have the same MI operands; they are just
11927 // defined differently in the .td files than in the isel patterns, so
11928 // they need pseudos.
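// e.g. ARM::t2STR_preidx below is simply re-tagged as ARM::t2STR_PRE via
// setDesc(); the operand list is left untouched.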
11929 case ARM::t2STR_preidx:
11930 MI.setDesc(TII->get(ARM::t2STR_PRE));
11931 return BB;
11932 case ARM::t2STRB_preidx:
11933 MI.setDesc(TII->get(ARM::t2STRB_PRE));
11934 return BB;
11935 case ARM::t2STRH_preidx:
11936 MI.setDesc(TII->get(ARM::t2STRH_PRE));
11937 return BB;
11938
11939 case ARM::STRi_preidx:
11940 case ARM::STRBi_preidx: {
11941 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
11942 : ARM::STRB_PRE_IMM;
11943 // Decode the offset.
11944 unsigned Offset = MI.getOperand(4).getImm();
11945 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
11947 if (isSub)
11948 Offset = -Offset;
11949
11950 MachineMemOperand *MMO = *MI.memoperands_begin();
11951 BuildMI(*BB, MI, dl, TII->get(NewOpc))
11952 .add(MI.getOperand(0)) // Rn_wb
11953 .add(MI.getOperand(1)) // Rt
11954 .add(MI.getOperand(2)) // Rn
11955 .addImm(Offset) // offset (skip GPR==zero_reg)
11956 .add(MI.getOperand(5)) // pred
11957 .add(MI.getOperand(6))
11958 .addMemOperand(MMO);
11959 MI.eraseFromParent();
11960 return BB;
11961 }
11962 case ARM::STRr_preidx:
11963 case ARM::STRBr_preidx:
11964 case ARM::STRH_preidx: {
11965 unsigned NewOpc;
11966 switch (MI.getOpcode()) {
11967 default: llvm_unreachable("unexpected opcode!");
11968 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
11969 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
11970 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
11971 }
11972 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
11973 for (const MachineOperand &MO : MI.operands())
11974 MIB.add(MO);
11975 MI.eraseFromParent();
11976 return BB;
11977 }
11978
11979 case ARM::tMOVCCr_pseudo: {
11980 // To "insert" a SELECT_CC instruction, we actually have to insert the
11981 // diamond control-flow pattern. The incoming instruction knows the
11982 // destination vreg to set, the condition code register to branch on, the
11983 // true/false values to select between, and a branch opcode to use.
11984 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11986
11987 // thisMBB:
11988 // ...
11989 // TrueVal = ...
11990 // cmpTY ccX, r1, r2
11991 // bCC copy1MBB
11992 // fallthrough --> copy0MBB
11993 MachineBasicBlock *thisMBB = BB;
11994 MachineFunction *F = BB->getParent();
11995 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
11996 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
11997 F->insert(It, copy0MBB);
11998 F->insert(It, sinkMBB);
11999
12000 // Set the call frame size on entry to the new basic blocks.
12001 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12002 copy0MBB->setCallFrameSize(CallFrameSize);
12003 sinkMBB->setCallFrameSize(CallFrameSize);
12004
12005 // Check whether CPSR is live past the tMOVCCr_pseudo.
12006 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12007 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12008 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12009 copy0MBB->addLiveIn(ARM::CPSR);
12010 sinkMBB->addLiveIn(ARM::CPSR);
12011 }
12012
12013 // Transfer the remainder of BB and its successor edges to sinkMBB.
12014 sinkMBB->splice(sinkMBB->begin(), BB,
12015 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12017
12018 BB->addSuccessor(copy0MBB);
12019 BB->addSuccessor(sinkMBB);
12020
12021 BuildMI(BB, dl, TII->get(ARM::tBcc))
12022 .addMBB(sinkMBB)
12023 .addImm(MI.getOperand(3).getImm())
12024 .addReg(MI.getOperand(4).getReg());
12025
12026 // copy0MBB:
12027 // %FalseValue = ...
12028 // # fallthrough to sinkMBB
12029 BB = copy0MBB;
12030
12031 // Update machine-CFG edges
12032 BB->addSuccessor(sinkMBB);
12033
12034 // sinkMBB:
12035 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12036 // ...
12037 BB = sinkMBB;
12038 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12039 .addReg(MI.getOperand(1).getReg())
12040 .addMBB(copy0MBB)
12041 .addReg(MI.getOperand(2).getReg())
12042 .addMBB(thisMBB);
12043
12044 MI.eraseFromParent(); // The pseudo instruction is gone now.
12045 return BB;
12046 }
12047
12048 case ARM::BCCi64:
12049 case ARM::BCCZi64: {
12050 // If there is an unconditional branch to the other successor, remove it.
12051 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12052
12053 // Compare both parts that make up the double comparison separately for
12054 // equality.
12055 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12056
12057 Register LHS1 = MI.getOperand(1).getReg();
12058 Register LHS2 = MI.getOperand(2).getReg();
12059 if (RHSisZero) {
12060 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12061 .addReg(LHS1)
12062 .addImm(0)
12064 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12065 .addReg(LHS2).addImm(0)
12066 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12067 } else {
12068 Register RHS1 = MI.getOperand(3).getReg();
12069 Register RHS2 = MI.getOperand(4).getReg();
12070 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12071 .addReg(LHS1)
12072 .addReg(RHS1)
12074 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12075 .addReg(LHS2).addReg(RHS2)
12076 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12077 }
12078
12079 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12080 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
12081 if (MI.getOperand(0).getImm() == ARMCC::NE)
12082 std::swap(destMBB, exitMBB);
12083
12084 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12085 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12086 if (isThumb2)
12087 BuildMI(BB, dl, TII->get(ARM::t2B))
12088 .addMBB(exitMBB)
12090 else
12091 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12092
12093 MI.eraseFromParent(); // The pseudo instruction is gone now.
12094 return BB;
12095 }
12096
12097 case ARM::Int_eh_sjlj_setjmp:
12098 case ARM::Int_eh_sjlj_setjmp_nofp:
12099 case ARM::tInt_eh_sjlj_setjmp:
12100 case ARM::t2Int_eh_sjlj_setjmp:
12101 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12102 return BB;
12103
12104 case ARM::Int_eh_sjlj_setup_dispatch:
12105 EmitSjLjDispatchBlock(MI, BB);
12106 return BB;
12107 case ARM::COPY_STRUCT_BYVAL_I32:
12108 ++NumLoopByVals;
12109 return EmitStructByval(MI, BB);
12110 case ARM::WIN__CHKSTK:
12111 return EmitLowered__chkstk(MI, BB);
12112 case ARM::WIN__DBZCHK:
12113 return EmitLowered__dbzchk(MI, BB);
12114 }
12115}
12116
12117/// Attaches vregs to MEMCPY that it will use as scratch registers
12118/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12119/// instead of as a custom inserter because we need the use list from the SDNode.
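/// (Note: operand 4 of the MEMCPY pseudo holds the scratch-register count;
/// the loop below creates one virtual register for each.)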
12120static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12121 MachineInstr &MI, const SDNode *Node) {
12122 bool isThumb1 = Subtarget->isThumb1Only();
12123
12124 MachineFunction *MF = MI.getParent()->getParent();
12126 MachineInstrBuilder MIB(*MF, MI);
12127
12128 // If the new dst/src is unused mark it as dead.
12129 if (!Node->hasAnyUseOfValue(0)) {
12130 MI.getOperand(0).setIsDead(true);
12131 }
12132 if (!Node->hasAnyUseOfValue(1)) {
12133 MI.getOperand(1).setIsDead(true);
12134 }
12135
12136 // The MEMCPY both defines and kills the scratch registers.
12137 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12138 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12139 : &ARM::GPRRegClass);
12141 }
12142}
12143
12145 SDNode *Node) const {
12146 if (MI.getOpcode() == ARM::MEMCPY) {
12147 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12148 return;
12149 }
12150
12151 const MCInstrDesc *MCID = &MI.getDesc();
12152 // Adjust instructions that may set the 's' bit after isel, i.e. ADC, SBC, RSB,
12153 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12154 // operand is still set to noreg. If needed, set the optional operand's
12155 // register to CPSR, and remove the redundant implicit def.
12156 //
12157 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12158
12159 // Rename pseudo opcodes.
12160 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12161 unsigned ccOutIdx;
12162 if (NewOpc) {
12163 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12164 MCID = &TII->get(NewOpc);
12165
12166 assert(MCID->getNumOperands() ==
12167 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12168 && "converted opcode should be the same except for cc_out"
12169 " (and, on Thumb1, pred)");
12170
12171 MI.setDesc(*MCID);
12172
12173 // Add the optional cc_out operand
12174 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12175
12176 // On Thumb1, move all input operands to the end, then add the predicate
12177 if (Subtarget->isThumb1Only()) {
12178 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12179 MI.addOperand(MI.getOperand(1));
12180 MI.removeOperand(1);
12181 }
12182
12183 // Restore the ties
12184 for (unsigned i = MI.getNumOperands(); i--;) {
12185 const MachineOperand& op = MI.getOperand(i);
12186 if (op.isReg() && op.isUse()) {
12187 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12188 if (DefIdx != -1)
12189 MI.tieOperands(DefIdx, i);
12190 }
12191 }
12192
12194 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
12195 ccOutIdx = 1;
12196 } else
12197 ccOutIdx = MCID->getNumOperands() - 1;
12198 } else
12199 ccOutIdx = MCID->getNumOperands() - 1;
12200
12201 // Any ARM instruction that sets the 's' bit should specify an optional
12202 // "cc_out" operand in the last operand position.
12203 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12204 assert(!NewOpc && "Optional cc_out operand required");
12205 return;
12206 }
12207 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12208 // since we already have an optional CPSR def.
12209 bool definesCPSR = false;
12210 bool deadCPSR = false;
12211 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12212 ++i) {
12213 const MachineOperand &MO = MI.getOperand(i);
12214 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12215 definesCPSR = true;
12216 if (MO.isDead())
12217 deadCPSR = true;
12218 MI.removeOperand(i);
12219 break;
12220 }
12221 }
12222 if (!definesCPSR) {
12223 assert(!NewOpc && "Optional cc_out operand required");
12224 return;
12225 }
12226 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12227 if (deadCPSR) {
12228 assert(!MI.getOperand(ccOutIdx).getReg() &&
12229 "expect uninitialized optional cc_out operand");
12230 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12231 if (!Subtarget->isThumb1Only())
12232 return;
12233 }
12234
12235 // If this instruction was defined with an optional CPSR def and its dag node
12236 // had a live implicit CPSR def, then activate the optional CPSR def.
12237 MachineOperand &MO = MI.getOperand(ccOutIdx);
12238 MO.setReg(ARM::CPSR);
12239 MO.setIsDef(true);
12240}
12241
12242//===----------------------------------------------------------------------===//
12243// ARM Optimization Hooks
12244//===----------------------------------------------------------------------===//
12245
12246// Helper function that checks if N is a null or all ones constant.
12247static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12249}
12250
12251// Return true if N is conditionally 0 or all ones.
12252// Detects these expressions where cc is an i1 value:
12253//
12254// (select cc 0, y) [AllOnes=0]
12255// (select cc y, 0) [AllOnes=0]
12256// (zext cc) [AllOnes=0]
12257// (sext cc) [AllOnes=0/1]
12258// (select cc -1, y) [AllOnes=1]
12259// (select cc y, -1) [AllOnes=1]
12260//
12261// Invert is set when N is the null/all ones constant when CC is false.
12262// OtherOp is set to the alternative value of N.
12264 SDValue &CC, bool &Invert,
12265 SDValue &OtherOp,
12266 SelectionDAG &DAG) {
12267 switch (N->getOpcode()) {
12268 default: return false;
12269 case ISD::SELECT: {
12270 CC = N->getOperand(0);
12271 SDValue N1 = N->getOperand(1);
12272 SDValue N2 = N->getOperand(2);
12273 if (isZeroOrAllOnes(N1, AllOnes)) {
12274 Invert = false;
12275 OtherOp = N2;
12276 return true;
12277 }
12278 if (isZeroOrAllOnes(N2, AllOnes)) {
12279 Invert = true;
12280 OtherOp = N1;
12281 return true;
12282 }
12283 return false;
12284 }
12285 case ISD::ZERO_EXTEND:
12286 // (zext cc) can never be the all ones value.
12287 if (AllOnes)
12288 return false;
12289 [[fallthrough]];
12290 case ISD::SIGN_EXTEND: {
12291 SDLoc dl(N);
12292 EVT VT = N->getValueType(0);
12293 CC = N->getOperand(0);
12294 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12295 return false;
12296 Invert = !AllOnes;
12297 if (AllOnes)
12298 // When looking for an AllOnes constant, N is an sext, and the 'other'
12299 // value is 0.
12300 OtherOp = DAG.getConstant(0, dl, VT);
12301 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12302 // When looking for a 0 constant, N can be zext or sext.
12303 OtherOp = DAG.getConstant(1, dl, VT);
12304 else
12305 OtherOp = DAG.getAllOnesConstant(dl, VT);
12306 return true;
12307 }
12308 }
12309}
12310
12311// Combine a constant select operand into its use:
12312//
12313// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
12314// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
12315// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
12316// (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
12317// (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12318//
12319// The transform is rejected if the select doesn't have a constant operand that
12320// is null, or all ones when AllOnes is set.
12321//
12322// Also recognize sext/zext from i1:
12323//
12324// (add (zext cc), x) -> (select cc (add x, 1), x)
12325// (add (sext cc), x) -> (select cc (add x, -1), x)
12326//
12327// These transformations eventually create predicated instructions.
12328//
12329// @param N The node to transform.
12330// @param Slct The N operand that is a select.
12331// @param OtherOp The other N operand (x above).
12332// @param DCI Context.
12333// @param AllOnes Require the select constant to be all ones instead of null.
12334// @returns The new node, or SDValue() on failure.
12335static
12338 bool AllOnes = false) {
12339 SelectionDAG &DAG = DCI.DAG;
12340 EVT VT = N->getValueType(0);
12341 SDValue NonConstantVal;
12342 SDValue CCOp;
12343 bool SwapSelectOps;
12344 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12345 NonConstantVal, DAG))
12346 return SDValue();
12347
12348 // Slct is now known to be the desired identity constant when CC is true.
12349 SDValue TrueVal = OtherOp;
12350 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12351 OtherOp, NonConstantVal);
12352 // Unless SwapSelectOps says CC should be false.
12353 if (SwapSelectOps)
12354 std::swap(TrueVal, FalseVal);
12355
12356 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12357 CCOp, TrueVal, FalseVal);
12358}
12359
12360// Attempt combineSelectAndUse on each operand of a commutative operator N.
12361static
12364 SDValue N0 = N->getOperand(0);
12365 SDValue N1 = N->getOperand(1);
12366 if (N0.getNode()->hasOneUse())
12367 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12368 return Result;
12369 if (N1.getNode()->hasOneUse())
12370 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12371 return Result;
12372 return SDValue();
12373}
12374
12376 // VUZP shuffle node.
12377 if (N->getOpcode() == ARMISD::VUZP)
12378 return true;
12379
12380 // "VUZP" on i32 is an alias for VTRN.
12381 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12382 return true;
12383
12384 return false;
12385}
12386
12389 const ARMSubtarget *Subtarget) {
12390 // Look for ADD(VUZP.0, VUZP.1).
12391 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12392 N0 == N1)
12393 return SDValue();
12394
12395 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12396 if (!N->getValueType(0).is64BitVector())
12397 return SDValue();
12398
12399 // Generate vpadd.
12400 SelectionDAG &DAG = DCI.DAG;
12401 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12402 SDLoc dl(N);
12403 SDNode *Unzip = N0.getNode();
12404 EVT VT = N->getValueType(0);
12405
12407 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12408 TLI.getPointerTy(DAG.getDataLayout())));
12409 Ops.push_back(Unzip->getOperand(0));
12410 Ops.push_back(Unzip->getOperand(1));
12411
12412 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12413}
12414
12417 const ARMSubtarget *Subtarget) {
12418 // Check for two extended operands.
12419 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12420 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12421 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12422 N1.getOpcode() == ISD::ZERO_EXTEND))
12423 return SDValue();
12424
12425 SDValue N00 = N0.getOperand(0);
12426 SDValue N10 = N1.getOperand(0);
12427
12428 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
12429 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12430 N00 == N10)
12431 return SDValue();
12432
12433 // We only recognize Q register paddl here; this can't be reached until
12434 // after type legalization.
12435 if (!N00.getValueType().is64BitVector() ||
12437 return SDValue();
12438
12439 // Generate vpaddl.
12440 SelectionDAG &DAG = DCI.DAG;
12441 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12442 SDLoc dl(N);
12443 EVT VT = N->getValueType(0);
12444
12446 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12447 unsigned Opcode;
12448 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12449 Opcode = Intrinsic::arm_neon_vpaddls;
12450 else
12451 Opcode = Intrinsic::arm_neon_vpaddlu;
12452 Ops.push_back(DAG.getConstant(Opcode, dl,
12453 TLI.getPointerTy(DAG.getDataLayout())));
12454 EVT ElemTy = N00.getValueType().getVectorElementType();
12455 unsigned NumElts = VT.getVectorNumElements();
12456 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12457 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12458 N00.getOperand(0), N00.getOperand(1));
12459 Ops.push_back(Concat);
12460
12461 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12462}
12463
12464// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12465// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12466// much easier to match.
12467static SDValue
12470 const ARMSubtarget *Subtarget) {
12471 // Only perform the optimization after legalization and if NEON is available.
12472 // We also expect both operands to be BUILD_VECTORs.
12473 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12474 || N0.getOpcode() != ISD::BUILD_VECTOR
12475 || N1.getOpcode() != ISD::BUILD_VECTOR)
12476 return SDValue();
12477
12478 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12479 EVT VT = N->getValueType(0);
12480 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12481 return SDValue();
12482
12483 // Check that the vector operands are of the right form.
12484 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12485 // operands, where N is the size of the formed vector.
12486 // Each EXTRACT_VECTOR should have the same input vector and odd or even
12487 // index such that we have a pairwise add pattern.
12488
12489 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12491 return SDValue();
12492 SDValue Vec = N0->getOperand(0)->getOperand(0);
12493 SDNode *V = Vec.getNode();
12494 unsigned nextIndex = 0;
12495
12496 // For each operands to the ADD which are BUILD_VECTORs,
12497 // check to see if each of their operands are an EXTRACT_VECTOR with
12498 // the same vector and appropriate index.
12499 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12502
12503 SDValue ExtVec0 = N0->getOperand(i);
12504 SDValue ExtVec1 = N1->getOperand(i);
12505
12506 // The first operand is the vector; verify it's the same.
12507 if (V != ExtVec0->getOperand(0).getNode() ||
12508 V != ExtVec1->getOperand(0).getNode())
12509 return SDValue();
12510
12511 // The second is the constant index; verify it's correct.
12514
12515 // For the constant, we want to see all the even or all the odd.
12516 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12517 || C1->getZExtValue() != nextIndex+1)
12518 return SDValue();
12519
12520 // Increment index.
12521 nextIndex+=2;
12522 } else
12523 return SDValue();
12524 }
12525
12526 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12527 // we're using the entire input vector, otherwise there's a size/legality
12528 // mismatch somewhere.
12529 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12531 return SDValue();
12532
12533 // Create VPADDL node.
12534 SelectionDAG &DAG = DCI.DAG;
12535 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12536
12537 SDLoc dl(N);
12538
12539 // Build operand list.
12540 SmallVector<SDValue, 8> Ops;
12541 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12542 TLI.getPointerTy(DAG.getDataLayout())));
12543
12544 // Input is the vector.
12545 Ops.push_back(Vec);
12546
12547 // Get widened type and narrowed type.
12548 MVT widenType;
12549 unsigned numElem = VT.getVectorNumElements();
12550
12551 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12552 switch (inputLaneType.getSimpleVT().SimpleTy) {
12553 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12554 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12555 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12556 default:
12557 llvm_unreachable("Invalid vector element type for padd optimization.");
12558 }
12559
12560 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
12561 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12562 return DAG.getNode(ExtOp, dl, VT, tmp);
12563}
12564
12565 static SDValue findMUL_LOHI(SDValue V) {
12566 if (V->getOpcode() == ISD::UMUL_LOHI ||
12567 V->getOpcode() == ISD::SMUL_LOHI)
12568 return V;
12569 return SDValue();
12570}
12571
12572static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12573 TargetLowering::DAGCombinerInfo &DCI,
12574 const ARMSubtarget *Subtarget) {
12575 if (!Subtarget->hasBaseDSP())
12576 return SDValue();
12577
12578 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12579 // accumulate the product into a 64-bit value. The 16-bit values will
12580 // be sign extended somehow or SRA'd into 32-bit values
12581 // (addc (adde (mul 16bit, 16bit), lo), hi)
12582 SDValue Mul = AddcNode->getOperand(0);
12583 SDValue Lo = AddcNode->getOperand(1);
12584 if (Mul.getOpcode() != ISD::MUL) {
12585 Lo = AddcNode->getOperand(0);
12586 Mul = AddcNode->getOperand(1);
12587 if (Mul.getOpcode() != ISD::MUL)
12588 return SDValue();
12589 }
12590
12591 SDValue SRA = AddeNode->getOperand(0);
12592 SDValue Hi = AddeNode->getOperand(1);
12593 if (SRA.getOpcode() != ISD::SRA) {
12594 SRA = AddeNode->getOperand(1);
12595 Hi = AddeNode->getOperand(0);
12596 if (SRA.getOpcode() != ISD::SRA)
12597 return SDValue();
12598 }
12599 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12600 if (Const->getZExtValue() != 31)
12601 return SDValue();
12602 } else
12603 return SDValue();
12604
12605 if (SRA.getOperand(0) != Mul)
12606 return SDValue();
12607
12608 SelectionDAG &DAG = DCI.DAG;
12609 SDLoc dl(AddcNode);
12610 unsigned Opcode = 0;
12611 SDValue Op0;
12612 SDValue Op1;
12613
12614 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12615 Opcode = ARMISD::SMLALBB;
12616 Op0 = Mul.getOperand(0);
12617 Op1 = Mul.getOperand(1);
12618 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12619 Opcode = ARMISD::SMLALBT;
12620 Op0 = Mul.getOperand(0);
12621 Op1 = Mul.getOperand(1).getOperand(0);
12622 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12623 Opcode = ARMISD::SMLALTB;
12624 Op0 = Mul.getOperand(0).getOperand(0);
12625 Op1 = Mul.getOperand(1);
12626 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12627 Opcode = ARMISD::SMLALTT;
12628 Op0 = Mul->getOperand(0).getOperand(0);
12629 Op1 = Mul->getOperand(1).getOperand(0);
12630 }
12631
12632 if (!Op0 || !Op1)
12633 return SDValue();
12634
12635 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12636 Op0, Op1, Lo, Hi);
12637 // Replace the ADD nodes' uses with the SMLAL node's values.
12638 SDValue HiMLALResult(SMLAL.getNode(), 1);
12639 SDValue LoMLALResult(SMLAL.getNode(), 0);
12640
12641 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12642 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12643
12644 // Return original node to notify the driver to stop replacing.
12645 SDValue resNode(AddcNode, 0);
12646 return resNode;
12647}
12648
12649 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
12650 TargetLowering::DAGCombinerInfo &DCI,
12651 const ARMSubtarget *Subtarget) {
12652 // Look for multiply add opportunities.
12653 // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
12654 // each add node consumes a value from the ISD::UMUL_LOHI and there is
12655 // a glue link from the first add to the second add.
12656 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12657 // a S/UMLAL instruction.
12658 // UMUL_LOHI
12659 // / :lo \ :hi
12660 // V \ [no multiline comment]
12661 // loAdd -> ADDC |
12662 // \ :carry /
12663 // V V
12664 // ADDE <- hiAdd
12665 //
12666 // In the special case where only the higher part of a signed result is used
12667 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12668 // a constant with the exact value of 0x80000000, we recognize we are dealing
12669 // with a "rounded multiply and add" (or subtract) and transform it into
12670 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
12671
12672 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12673 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12674 "Expect an ADDE or SUBE");
12675
12676 assert(AddeSubeNode->getNumOperands() == 3 &&
12677 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12678 "ADDE node has the wrong inputs");
12679
12680 // Check that we are chained to the right ADDC or SUBC node.
12681 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12682 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12683 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12684 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12685 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12686 return SDValue();
12687
12688 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12689 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12690
12691 // Check if the two operands are from the same mul_lohi node.
12692 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12693 return SDValue();
12694
12695 assert(AddcSubcNode->getNumValues() == 2 &&
12696 AddcSubcNode->getValueType(0) == MVT::i32 &&
12697 "Expect ADDC with two result values. First: i32");
12698
12699 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12700 // may be an SMLAL which multiplies two 16-bit values.
12701 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12702 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12703 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12704 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12705 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12706 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12707
12708 // Check for the triangle shape.
12709 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12710 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12711
12712 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12713 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12714 return SDValue();
12715
12716 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12717 bool IsLeftOperandMUL = false;
12718 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12719 if (MULOp == SDValue())
12720 MULOp = findMUL_LOHI(AddeSubeOp1);
12721 else
12722 IsLeftOperandMUL = true;
12723 if (MULOp == SDValue())
12724 return SDValue();
12725
12726 // Figure out the right opcode.
12727 unsigned Opc = MULOp->getOpcode();
12728 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12729
12730 // Figure out the high and low input values to the MLAL node.
12731 SDValue *HiAddSub = nullptr;
12732 SDValue *LoMul = nullptr;
12733 SDValue *LowAddSub = nullptr;
12734
12735 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12736 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
12737 return SDValue();
12738
12739 if (IsLeftOperandMUL)
12740 HiAddSub = &AddeSubeOp1;
12741 else
12742 HiAddSub = &AddeSubeOp0;
12743
12744 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
12745 // whose low result is fed to the ADDC/SUBC we are checking.
12746
12747 if (AddcSubcOp0 == MULOp.getValue(0)) {
12748 LoMul = &AddcSubcOp0;
12749 LowAddSub = &AddcSubcOp1;
12750 }
12751 if (AddcSubcOp1 == MULOp.getValue(0)) {
12752 LoMul = &AddcSubcOp1;
12753 LowAddSub = &AddcSubcOp0;
12754 }
12755
12756 if (!LoMul)
12757 return SDValue();
12758
12759 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
12760 // the replacement below will create a cycle.
12761 if (AddcSubcNode == HiAddSub->getNode() ||
12762 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12763 return SDValue();
12764
12765 // Create the merged node.
12766 SelectionDAG &DAG = DCI.DAG;
12767
12768 // Start building operand list.
12769 SmallVector<SDValue, 8> Ops;
12770 Ops.push_back(LoMul->getOperand(0));
12771 Ops.push_back(LoMul->getOperand(1));
12772
12773 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12774 // the case, we must be doing signed multiplication and only use the higher
12775 // part of the result of the MLAL; furthermore, the LowAddSub must be a
12776 // constant addition or subtraction with the value of 0x80000000.
12777 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12778 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12779 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12780 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12781 0x80000000) {
12782 Ops.push_back(*HiAddSub);
12783 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12784 FinalOpc = ARMISD::SMMLSR;
12785 } else {
12786 FinalOpc = ARMISD::SMMLAR;
12787 }
12788 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12789 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
12790
12791 return SDValue(AddeSubeNode, 0);
12792 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12793 // SMMLS is generated during instruction selection and the rest of this
12794 // function can not handle the case where AddcSubcNode is a SUBC.
12795 return SDValue();
12796
12797 // Finish building the operand list for {U/S}MLAL
12798 Ops.push_back(*LowAddSub);
12799 Ops.push_back(*HiAddSub);
12800
12801 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
12802 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12803
12804 // Replace the ADD nodes' uses with the MLAL node's values.
12805 SDValue HiMLALResult(MLALNode.getNode(), 1);
12806 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
12807
12808 SDValue LoMLALResult(MLALNode.getNode(), 0);
12809 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
12810
12811 // Return original node to notify the driver to stop replacing.
12812 return SDValue(AddeSubeNode, 0);
12813}
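For reference, a minimal standalone sketch of the arithmetic relationship the MLAL combine above relies on, with illustrative values and names only: an ADDC/ADDE pair that accumulates the two halves of a 32x32->64 product is the same 64-bit accumulate that UMLAL/SMLAL perform.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t A = 0x89ABCDEF, B = 0x12345678, Lo = 0xFFFF0000, Hi = 0x00000007;
  uint64_t Prod = (uint64_t)A * B;                       // UMUL_LOHI
  uint32_t NewLo = Lo + (uint32_t)Prod;                  // ADDC on the low halves
  uint32_t Carry = NewLo < Lo;                           // carry out of the low add
  uint32_t NewHi = Hi + (uint32_t)(Prod >> 32) + Carry;  // ADDE on the high halves
  uint64_t Umlal = (((uint64_t)Hi << 32) | Lo) + Prod;   // what UMLAL computes
  assert(Umlal == (((uint64_t)NewHi << 32) | NewLo));
  return 0;
}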
12814
12815 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
12816 TargetLowering::DAGCombinerInfo &DCI,
12817 const ARMSubtarget *Subtarget) {
12818 // UMAAL is similar to UMLAL except that it adds two unsigned values.
12819 // While trying to combine for the other MLAL nodes, first search for the
12820 // chance to use UMAAL. Check if Addc uses a node which has already
12821 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
12822 // as the addend, and it's handled in PerformUMLALCombine.
12823
12824 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12825 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12826
12827 // Check that we have a glued ADDC node.
12828 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
12829 if (AddcNode->getOpcode() != ARMISD::ADDC)
12830 return SDValue();
12831
12832 // Find the converted UMAAL or quit if it doesn't exist.
12833 SDNode *UmlalNode = nullptr;
12834 SDValue AddHi;
12835 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
12836 UmlalNode = AddcNode->getOperand(0).getNode();
12837 AddHi = AddcNode->getOperand(1);
12838 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
12839 UmlalNode = AddcNode->getOperand(1).getNode();
12840 AddHi = AddcNode->getOperand(0);
12841 } else {
12842 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12843 }
12844
12845 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
12846 // the ADDC as well as Zero.
12847 if (!isNullConstant(UmlalNode->getOperand(3)))
12848 return SDValue();
12849
12850 if ((isNullConstant(AddeNode->getOperand(0)) &&
12851 AddeNode->getOperand(1).getNode() == UmlalNode) ||
12852 (AddeNode->getOperand(0).getNode() == UmlalNode &&
12853 isNullConstant(AddeNode->getOperand(1)))) {
12854 SelectionDAG &DAG = DCI.DAG;
12855 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
12856 UmlalNode->getOperand(2), AddHi };
12857 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
12858 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12859
12860 // Replace the ADD nodes' uses with the UMAAL node's values.
12861 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
12862 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
12863
12864 // Return original node to notify the driver to stop replacing.
12865 return SDValue(AddeNode, 0);
12866 }
12867 return SDValue();
12868}
12869
12870 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
12871 const ARMSubtarget *Subtarget) {
12872 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12873 return SDValue();
12874
12875 // Check that we have a pair of ADDC and ADDE as operands.
12876 // Both addends of the ADDE must be zero.
12877 SDNode* AddcNode = N->getOperand(2).getNode();
12878 SDNode* AddeNode = N->getOperand(3).getNode();
12879 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
12880 (AddeNode->getOpcode() == ARMISD::ADDE) &&
12881 isNullConstant(AddeNode->getOperand(0)) &&
12882 isNullConstant(AddeNode->getOperand(1)) &&
12883 (AddeNode->getOperand(2).getNode() == AddcNode))
12884 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
12885 DAG.getVTList(MVT::i32, MVT::i32),
12886 {N->getOperand(0), N->getOperand(1),
12887 AddcNode->getOperand(0), AddcNode->getOperand(1)});
12888 else
12889 return SDValue();
12890}
12891
12892 static SDValue PerformAddcSubcCombine(SDNode *N,
12893 TargetLowering::DAGCombinerInfo &DCI,
12894 const ARMSubtarget *Subtarget) {
12895 SelectionDAG &DAG(DCI.DAG);
12896
12897 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
12898 // (SUBC (ADDE 0, 0, C), 1) -> C
12899 SDValue LHS = N->getOperand(0);
12900 SDValue RHS = N->getOperand(1);
12901 if (LHS->getOpcode() == ARMISD::ADDE &&
12902 isNullConstant(LHS->getOperand(0)) &&
12903 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
12904 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
12905 }
12906 }
12907
12908 if (Subtarget->isThumb1Only()) {
12909 SDValue RHS = N->getOperand(1);
12910 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
12911 int32_t imm = C->getSExtValue();
12912 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
12913 SDLoc DL(N);
12914 RHS = DAG.getConstant(-imm, DL, MVT::i32);
12915 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
12916 : ARMISD::ADDC;
12917 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
12918 }
12919 }
12920 }
12921
12922 return SDValue();
12923}
12924
12925 static SDValue PerformAddeSubeCombine(SDNode *N,
12926 TargetLowering::DAGCombinerInfo &DCI,
12927 const ARMSubtarget *Subtarget) {
12928 if (Subtarget->isThumb1Only()) {
12929 SelectionDAG &DAG = DCI.DAG;
12930 SDValue RHS = N->getOperand(1);
12931 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
12932 int64_t imm = C->getSExtValue();
12933 if (imm < 0) {
12934 SDLoc DL(N);
12935
12936 // The with-carry-in form matches bitwise not instead of the negation.
12937 // Effectively, the inverse interpretation of the carry flag already
12938 // accounts for part of the negation.
12939 RHS = DAG.getConstant(~imm, DL, MVT::i32);
12940
12941 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
12942 : ARMISD::ADDE;
12943 return DAG.getNode(Opcode, DL, N->getVTList(),
12944 N->getOperand(0), RHS, N->getOperand(2));
12945 }
12946 }
12947 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
12948 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
12949 }
12950 return SDValue();
12951}
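A minimal sketch of why the Thumb1 rewrite above uses bitwise NOT for the with-carry form, with illustrative values only: under ARM borrow semantics, x - ~imm - (1 - carry) equals x + imm + carry, so ADDE(x, imm, carry) and SUBE(x, ~imm, carry) agree for either carry value.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 1000;
  int32_t Imm = -42;                                   // a negative immediate
  for (uint32_t Carry = 0; Carry <= 1; ++Carry) {
    uint32_t Adde = X + (uint32_t)Imm + Carry;         // ADDE(X, Imm, Carry)
    uint32_t Sube = X - (uint32_t)~Imm - (1 - Carry);  // SUBE(X, ~Imm, Carry)
    assert(Adde == Sube);
  }
  return 0;
}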
12952
12953 static SDValue PerformSELECTCombine(SDNode *N,
12954 TargetLowering::DAGCombinerInfo &DCI,
12955 const ARMSubtarget *Subtarget) {
12956 if (!Subtarget->hasMVEIntegerOps())
12957 return SDValue();
12958
12959 SDLoc dl(N);
12960 SDValue SetCC;
12961 SDValue LHS;
12962 SDValue RHS;
12963 ISD::CondCode CC;
12964 SDValue TrueVal;
12965 SDValue FalseVal;
12966
12967 if (N->getOpcode() == ISD::SELECT &&
12968 N->getOperand(0)->getOpcode() == ISD::SETCC) {
12969 SetCC = N->getOperand(0);
12970 LHS = SetCC->getOperand(0);
12971 RHS = SetCC->getOperand(1);
12972 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
12973 TrueVal = N->getOperand(1);
12974 FalseVal = N->getOperand(2);
12975 } else if (N->getOpcode() == ISD::SELECT_CC) {
12976 LHS = N->getOperand(0);
12977 RHS = N->getOperand(1);
12978 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
12979 TrueVal = N->getOperand(2);
12980 FalseVal = N->getOperand(3);
12981 } else {
12982 return SDValue();
12983 }
12984
12985 unsigned int Opcode = 0;
12986 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
12987 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
12988 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
12989 Opcode = ARMISD::VMINVu;
12990 if (CC == ISD::SETUGT)
12991 std::swap(TrueVal, FalseVal);
12992 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
12993 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
12994 (CC == ISD::SETLT || CC == ISD::SETGT)) {
12995 Opcode = ARMISD::VMINVs;
12996 if (CC == ISD::SETGT)
12997 std::swap(TrueVal, FalseVal);
12998 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
12999 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13000 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13001 Opcode = ARMISD::VMAXVu;
13002 if (CC == ISD::SETULT)
13003 std::swap(TrueVal, FalseVal);
13004 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13005 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13006 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13007 Opcode = ARMISD::VMAXVs;
13008 if (CC == ISD::SETLT)
13009 std::swap(TrueVal, FalseVal);
13010 } else
13011 return SDValue();
13012
13013 // Normalise to the right hand side being the vector reduction
13014 switch (TrueVal->getOpcode()) {
13015 case ISD::VECREDUCE_UMIN:
13016 case ISD::VECREDUCE_SMIN:
13017 case ISD::VECREDUCE_UMAX:
13018 case ISD::VECREDUCE_SMAX:
13019 std::swap(LHS, RHS);
13020 std::swap(TrueVal, FalseVal);
13021 break;
13022 }
13023
13024 EVT VectorType = FalseVal->getOperand(0).getValueType();
13025
13026 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13027 VectorType != MVT::v4i32)
13028 return SDValue();
13029
13030 EVT VectorScalarType = VectorType.getVectorElementType();
13031
13032 // The values being selected must also be the ones being compared
13033 if (TrueVal != LHS || FalseVal != RHS)
13034 return SDValue();
13035
13036 EVT LeftType = LHS->getValueType(0);
13037 EVT RightType = RHS->getValueType(0);
13038
13039 // The types must match the reduced type too
13040 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13041 return SDValue();
13042
13043 // Legalise the scalar to an i32
13044 if (VectorScalarType != MVT::i32)
13045 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13046
13047 // Generate the reduction as an i32 for legalisation purposes
13048 auto Reduction =
13049 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13050
13051 // The result isn't actually an i32 so truncate it back to its original type
13052 if (VectorScalarType != MVT::i32)
13053 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13054
13055 return Reduction;
13056}
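A minimal sketch of the scalar shape folded above, with illustrative values only: selecting between a scalar and the unsigned minimum of a vector, keyed on an unsigned compare of the two, is a running minimum seeded with the scalar, which is what VMINV computes.

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  uint8_t X = 17;
  uint8_t V[4] = {200, 13, 99, 45};
  uint8_t Reduce = *std::min_element(V, V + 4);  // VECREDUCE_UMIN of V
  uint8_t Select = X < Reduce ? X : Reduce;      // select(setult(X, Reduce), X, Reduce)
  uint8_t Vminv = X;                             // VMINV seeded with X
  for (uint8_t E : V)
    Vminv = std::min(Vminv, E);
  assert(Select == Vminv);
  return 0;
}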
13057
13058// A special combine for the vqdmulh family of instructions. This is one of the
13059 // potential set of patterns that could match this instruction. The base pattern
13060 // you would expect is min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13061// This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13062// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as
13063// the max is unnecessary.
13064 static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
13065 EVT VT = N->getValueType(0);
13066 SDValue Shft;
13067 ConstantSDNode *Clamp;
13068
13069 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13070 return SDValue();
13071
13072 if (N->getOpcode() == ISD::SMIN) {
13073 Shft = N->getOperand(0);
13074 Clamp = isConstOrConstSplat(N->getOperand(1));
13075 } else if (N->getOpcode() == ISD::VSELECT) {
13076 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13077 SDValue Cmp = N->getOperand(0);
13078 if (Cmp.getOpcode() != ISD::SETCC ||
13079 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13080 Cmp.getOperand(0) != N->getOperand(1) ||
13081 Cmp.getOperand(1) != N->getOperand(2))
13082 return SDValue();
13083 Shft = N->getOperand(1);
13084 Clamp = isConstOrConstSplat(N->getOperand(2));
13085 } else
13086 return SDValue();
13087
13088 if (!Clamp)
13089 return SDValue();
13090
13091 MVT ScalarType;
13092 int ShftAmt = 0;
13093 switch (Clamp->getSExtValue()) {
13094 case (1 << 7) - 1:
13095 ScalarType = MVT::i8;
13096 ShftAmt = 7;
13097 break;
13098 case (1 << 15) - 1:
13099 ScalarType = MVT::i16;
13100 ShftAmt = 15;
13101 break;
13102 case (1ULL << 31) - 1:
13103 ScalarType = MVT::i32;
13104 ShftAmt = 31;
13105 break;
13106 default:
13107 return SDValue();
13108 }
13109
13110 if (Shft.getOpcode() != ISD::SRA)
13111 return SDValue();
13112 ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
13113 if (!N1 || N1->getSExtValue() != ShftAmt)
13114 return SDValue();
13115
13116 SDValue Mul = Shft.getOperand(0);
13117 if (Mul.getOpcode() != ISD::MUL)
13118 return SDValue();
13119
13120 SDValue Ext0 = Mul.getOperand(0);
13121 SDValue Ext1 = Mul.getOperand(1);
13122 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13123 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13124 return SDValue();
13125 EVT VecVT = Ext0.getOperand(0).getValueType();
13126 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13127 return SDValue();
13128 if (Ext1.getOperand(0).getValueType() != VecVT ||
13129 VecVT.getScalarType() != ScalarType ||
13130 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13131 return SDValue();
13132
13133 SDLoc DL(Mul);
13134 unsigned LegalLanes = 128 / (ShftAmt + 1);
13135 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13136 // For types smaller than legal vectors, extend to a legal size and only use
13137 // the needed lanes.
13138 if (VecVT.getSizeInBits() < 128) {
13139 EVT ExtVecVT =
13140 MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()),
13141 VecVT.getVectorNumElements());
13142 SDValue Inp0 =
13143 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13144 SDValue Inp1 =
13145 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13146 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13147 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13148 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13149 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13150 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13151 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13152 }
13153
13154 // For larger types, split into legal sized chunks.
13155 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13156 unsigned NumParts = VecVT.getSizeInBits() / 128;
13157 SmallVector<SDValue> Parts;
13158 for (unsigned I = 0; I < NumParts; ++I) {
13159 SDValue Inp0 =
13160 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13161 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13162 SDValue Inp1 =
13163 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13164 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13165 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13166 Parts.push_back(VQDMULH);
13167 }
13168 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13169 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13170}
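A minimal sketch of the 16-bit scalar identity behind the match above, with illustrative values only: away from the saturating x == y == INT16_MIN case, min((x*y) >> 15, 32767) is the same as the doubling multiply-high (2*x*y) >> 16 that VQDMULH computes.

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  int16_t X = 12345, Y = -23456;
  int64_t Prod = (int64_t)X * Y;
  int64_t ViaShift15 = std::min<int64_t>(Prod >> 15, 32767);         // the pattern matched above
  int64_t ViaDoubling = std::min<int64_t>((2 * Prod) >> 16, 32767);  // vqdmulh.s16, no saturation hit
  assert(ViaShift15 == ViaDoubling);
  return 0;
}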
13171
13172 static SDValue PerformVSELECTCombine(SDNode *N,
13173 TargetLowering::DAGCombinerInfo &DCI,
13174 const ARMSubtarget *Subtarget) {
13175 if (!Subtarget->hasMVEIntegerOps())
13176 return SDValue();
13177
13178 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13179 return V;
13180
13181 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13182 //
13183 // We need to re-implement this optimization here as the implementation in the
13184 // Target-Independent DAGCombiner does not handle the kind of constant we make
13185 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13186 // good reason, allowing truncation there would break other targets).
13187 //
13188 // Currently, this is only done for MVE, as it's the only target that benefits
13189 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
13190 if (N->getOperand(0).getOpcode() != ISD::XOR)
13191 return SDValue();
13192 SDValue XOR = N->getOperand(0);
13193
13194 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13195 // It is important to check with truncation allowed as the BUILD_VECTORs we
13196 // generate in those situations will truncate their operands.
13197 ConstantSDNode *Const =
13198 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13199 /*AllowTruncation*/ true);
13200 if (!Const || !Const->isOne())
13201 return SDValue();
13202
13203 // Rewrite into vselect(cond, rhs, lhs).
13204 SDValue Cond = XOR->getOperand(0);
13205 SDValue LHS = N->getOperand(1);
13206 SDValue RHS = N->getOperand(2);
13207 EVT Type = N->getValueType(0);
13208 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13209}
13210
13211// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
13212 static SDValue PerformVSetCCToVCTPCombine(SDNode *N,
13213 TargetLowering::DAGCombinerInfo &DCI,
13214 const ARMSubtarget *Subtarget) {
13215 SDValue Op0 = N->getOperand(0);
13216 SDValue Op1 = N->getOperand(1);
13217 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13218 EVT VT = N->getValueType(0);
13219
13220 if (!Subtarget->hasMVEIntegerOps() ||
13221 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
13222 return SDValue();
13223
13224 if (CC == ISD::SETUGE) {
13225 std::swap(Op0, Op1);
13226 CC = ISD::SETULT;
13227 }
13228
13229 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13231 return SDValue();
13232
13233 // Check first operand is BuildVector of 0,1,2,...
13234 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13235 if (!Op0.getOperand(I).isUndef() &&
13236 !(isa<ConstantSDNode>(Op0.getOperand(I)) &&
13237 Op0.getConstantOperandVal(I) == I))
13238 return SDValue();
13239 }
13240
13241 // The second operand must be a splat; Op1S is the splatted scalar value.
13242 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13243 if (!Op1S)
13244 return SDValue();
13245
13246 unsigned Opc;
13247 switch (VT.getVectorNumElements()) {
13248 case 2:
13249 Opc = Intrinsic::arm_mve_vctp64;
13250 break;
13251 case 4:
13252 Opc = Intrinsic::arm_mve_vctp32;
13253 break;
13254 case 8:
13255 Opc = Intrinsic::arm_mve_vctp16;
13256 break;
13257 case 16:
13258 Opc = Intrinsic::arm_mve_vctp8;
13259 break;
13260 default:
13261 return SDValue();
13262 }
13263
13264 SDLoc DL(N);
13265 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13266 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13267 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13268}
13269
13270/// PerformADDECombine - Target-specific dag combine transform from
13271/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13272/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
13273 static SDValue PerformADDECombine(SDNode *N,
13274 TargetLowering::DAGCombinerInfo &DCI,
13275 const ARMSubtarget *Subtarget) {
13276 // Only ARM and Thumb2 support UMLAL/SMLAL.
13277 if (Subtarget->isThumb1Only())
13278 return PerformAddeSubeCombine(N, DCI, Subtarget);
13279
13280 // Only perform the checks after legalize when the pattern is available.
13281 if (DCI.isBeforeLegalize()) return SDValue();
13282
13283 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13284}
13285
13286/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13287/// operands N0 and N1. This is a helper for PerformADDCombine that is
13288/// called with the default operands, and if that fails, with commuted
13289/// operands.
13290 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
13291 TargetLowering::DAGCombinerInfo &DCI,
13292 const ARMSubtarget *Subtarget){
13293 // Attempt to create vpadd for this add.
13294 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13295 return Result;
13296
13297 // Attempt to create vpaddl for this add.
13298 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13299 return Result;
13300 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13301 Subtarget))
13302 return Result;
13303
13304 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
13305 if (N0.getNode()->hasOneUse())
13306 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13307 return Result;
13308 return SDValue();
13309}
13310
13311 static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
13312 EVT VT = N->getValueType(0);
13313 SDValue N0 = N->getOperand(0);
13314 SDValue N1 = N->getOperand(1);
13315 SDLoc dl(N);
13316
13317 auto IsVecReduce = [](SDValue Op) {
13318 switch (Op.getOpcode()) {
13319 case ISD::VECREDUCE_ADD:
13320 case ARMISD::VADDVs:
13321 case ARMISD::VADDVu:
13322 case ARMISD::VMLAVs:
13323 case ARMISD::VMLAVu:
13324 return true;
13325 }
13326 return false;
13327 };
13328
13329 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13330 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13331 // add(add(X, vecreduce(Y)), vecreduce(Z))
13332 // to make better use of vaddva style instructions.
13333 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13334 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13335 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13336 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13337 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13338 }
13339 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13340 // add(add(add(A, C), reduce(B)), reduce(D))
13341 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13342 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
13343 unsigned N0RedOp = 0;
13344 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13345 N0RedOp = 1;
13346 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13347 return SDValue();
13348 }
13349
13350 unsigned N1RedOp = 0;
13351 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13352 N1RedOp = 1;
13353 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13354 return SDValue();
13355
13356 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13357 N1.getOperand(1 - N1RedOp));
13358 SDValue Add1 =
13359 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13360 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13361 }
13362 return SDValue();
13363 };
13364 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13365 return R;
13366 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13367 return R;
13368
13369 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13370 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13371 // by ascending load offsets. This can help cores prefetch if the order of
13372 // loads is more predictable.
13373 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13374 // Check if two reductions are known to load data where one is before/after
13375 // another. Return negative if N0 loads data before N1, positive if N1 is
13376 // before N0 and 0 otherwise if nothing is known.
13377 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13378 // Look through to the first operand of a MUL, for the VMLA case.
13379 // Currently only looks at the first operand, in the hope they are equal.
13380 if (N0.getOpcode() == ISD::MUL)
13381 N0 = N0.getOperand(0);
13382 if (N1.getOpcode() == ISD::MUL)
13383 N1 = N1.getOperand(0);
13384
13385 // Return true if the two operands are loads to the same object and the
13386 // offset of the first is known to be less than the offset of the second.
13387 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13388 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13389 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13390 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13391 Load1->isIndexed())
13392 return 0;
13393
13394 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13395 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13396
13397 if (!BaseLocDecomp0.getBase() ||
13398 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13399 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13400 return 0;
13401 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13402 return -1;
13403 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13404 return 1;
13405 return 0;
13406 };
13407
13408 SDValue X;
13409 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13410 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
13411 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13412 N0.getOperand(1).getOperand(0));
13413 if (IsBefore < 0) {
13414 X = N0.getOperand(0);
13415 N0 = N0.getOperand(1);
13416 } else if (IsBefore > 0) {
13417 X = N0.getOperand(1);
13418 N0 = N0.getOperand(0);
13419 } else
13420 return SDValue();
13421 } else if (IsVecReduce(N0.getOperand(0))) {
13422 X = N0.getOperand(1);
13423 N0 = N0.getOperand(0);
13424 } else if (IsVecReduce(N0.getOperand(1))) {
13425 X = N0.getOperand(0);
13426 N0 = N0.getOperand(1);
13427 } else
13428 return SDValue();
13429 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13430 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13431 // Note this is backward to how you would expect. We create
13432 // add(reduce(load + 16), reduce(load + 0)) so that the
13433 // add(reduce(load+16), X) is combined into VADDVA(X, load+16), leaving
13434 // the X as VADDV(load + 0)
13435 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13436 } else
13437 return SDValue();
13438
13439 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13440 return SDValue();
13441
13442 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13443 return SDValue();
13444
13445 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13446 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13447 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13448 };
13449 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13450 return R;
13451 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13452 return R;
13453 return SDValue();
13454}
13455
13456 static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
13457 const ARMSubtarget *Subtarget) {
13458 if (!Subtarget->hasMVEIntegerOps())
13459 return SDValue();
13460
13461 if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
13462 return R;
13463
13464 EVT VT = N->getValueType(0);
13465 SDValue N0 = N->getOperand(0);
13466 SDValue N1 = N->getOperand(1);
13467 SDLoc dl(N);
13468
13469 if (VT != MVT::i64)
13470 return SDValue();
13471
13472 // We are looking for an i64 add of a VADDLVx. Due to these being i64's, this
13473 // will look like:
13474 // t1: i32,i32 = ARMISD::VADDLVs x
13475 // t2: i64 = build_pair t1, t1:1
13476 // t3: i64 = add t2, y
13477 // Otherwise we try to push the add up above VADDLVAx, to potentially allow
13478 // the add to be simplified separately.
13479 // We also need to check for sext / zext and commutative adds.
13480 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13481 SDValue NB) {
13482 if (NB->getOpcode() != ISD::BUILD_PAIR)
13483 return SDValue();
13484 SDValue VecRed = NB->getOperand(0);
13485 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13486 VecRed.getResNo() != 0 ||
13487 NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
13488 return SDValue();
13489
13490 if (VecRed->getOpcode() == OpcodeA) {
13491 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
13492 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13493 VecRed.getOperand(0), VecRed.getOperand(1));
13494 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13495 }
13496
13497 SmallVector<SDValue, 4> Ops(2);
13498 std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);
13499
13500 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13501 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13502 Ops.push_back(VecRed->getOperand(I));
13503 SDValue Red =
13504 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13505 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13506 SDValue(Red.getNode(), 1));
13507 };
13508
13509 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13510 return M;
13511 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13512 return M;
13513 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13514 return M;
13515 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13516 return M;
13517 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13518 return M;
13519 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13520 return M;
13521 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13522 return M;
13523 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13524 return M;
13525 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13526 return M;
13527 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13528 return M;
13529 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13530 return M;
13531 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13532 return M;
13533 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13534 return M;
13535 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13536 return M;
13537 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13538 return M;
13539 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13540 return M;
13541 return SDValue();
13542}
13543
13544bool
13545 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
13546 CombineLevel Level) const {
13547 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13548 N->getOpcode() == ISD::SRL) &&
13549 "Expected shift op");
13550
13551 SDValue ShiftLHS = N->getOperand(0);
13552 if (!ShiftLHS->hasOneUse())
13553 return false;
13554
13555 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
13556 !ShiftLHS.getOperand(0)->hasOneUse())
13557 return false;
13558
13559 if (Level == BeforeLegalizeTypes)
13560 return true;
13561
13562 if (N->getOpcode() != ISD::SHL)
13563 return true;
13564
13565 if (Subtarget->isThumb1Only()) {
13566 // Avoid making expensive immediates by commuting shifts. (This logic
13567 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13568 // for free.)
13569 if (N->getOpcode() != ISD::SHL)
13570 return true;
13571 SDValue N1 = N->getOperand(0);
13572 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13573 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13574 return true;
13575 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
13576 if (Const->getAPIntValue().ult(256))
13577 return false;
13578 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13579 Const->getAPIntValue().sgt(-256))
13580 return false;
13581 }
13582 return true;
13583 }
13584
13585 // Turn off commute-with-shift transform after legalization, so it doesn't
13586 // conflict with PerformSHLSimplify. (We could try to detect when
13587 // PerformSHLSimplify would trigger more precisely, but it isn't
13588 // really necessary.)
13589 return false;
13590}
13591
13592 bool ARMTargetLowering::isDesirableToCommuteXorWithShift(
13593 const SDNode *N) const {
13594 assert(N->getOpcode() == ISD::XOR &&
13595 (N->getOperand(0).getOpcode() == ISD::SHL ||
13596 N->getOperand(0).getOpcode() == ISD::SRL) &&
13597 "Expected XOR(SHIFT) pattern");
13598
13599 // Only commute if the entire NOT mask is a hidden shifted mask.
13600 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13601 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13602 if (XorC && ShiftC) {
13603 unsigned MaskIdx, MaskLen;
13604 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13605 unsigned ShiftAmt = ShiftC->getZExtValue();
13606 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
13607 if (N->getOperand(0).getOpcode() == ISD::SHL)
13608 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
13609 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13610 }
13611 }
13612
13613 return false;
13614}
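A minimal sketch of the commute the hook above permits, with illustrative values only, for the SHL case: when the NOT mask is a shifted mask that covers exactly the bits produced by the shift, the XOR can be applied before the shift instead.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0x12345678;
  const uint32_t S = 8;
  const uint32_t Mask = ~0u << S;  // shifted mask: MaskIdx == S, MaskLen == 32 - S
  assert(((X << S) ^ Mask) == ((X ^ (Mask >> S)) << S));
  return 0;
}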
13615
13616 bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
13617 const SDNode *N) const {
13618 assert(((N->getOpcode() == ISD::SHL &&
13619 N->getOperand(0).getOpcode() == ISD::SRL) ||
13620 (N->getOpcode() == ISD::SRL &&
13621 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13622 "Expected shift-shift mask");
13623
13624 if (!Subtarget->isThumb1Only())
13625 return true;
13626
13627 EVT VT = N->getValueType(0);
13628 if (VT.getScalarSizeInBits() > 32)
13629 return true;
13630
13631 return false;
13632}
13633
13634 bool ARMTargetLowering::shouldFoldSelectWithIdentityConstant(
13635 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
13636 SDValue Y) const {
13637 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT) &&
13638 SelectOpcode == ISD::VSELECT;
13639}
13640
13641 bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
13642 if (!Subtarget->hasNEON()) {
13643 if (Subtarget->isThumb1Only())
13644 return VT.getScalarSizeInBits() <= 32;
13645 return true;
13646 }
13647 return VT.isScalarInteger();
13648}
13649
13650 bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
13651 EVT VT) const {
13652 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13653 return false;
13654
13655 switch (FPVT.getSimpleVT().SimpleTy) {
13656 case MVT::f16:
13657 return Subtarget->hasVFP2Base();
13658 case MVT::f32:
13659 return Subtarget->hasVFP2Base();
13660 case MVT::f64:
13661 return Subtarget->hasFP64();
13662 case MVT::v4f32:
13663 case MVT::v8f16:
13664 return Subtarget->hasMVEFloatOps();
13665 default:
13666 return false;
13667 }
13668}
13669
13670 static SDValue PerformSHLSimplify(SDNode *N,
13671 TargetLowering::DAGCombinerInfo &DCI,
13672 const ARMSubtarget *ST) {
13673 // Allow the generic combiner to identify potential bswaps.
13674 if (DCI.isBeforeLegalize())
13675 return SDValue();
13676
13677 // DAG combiner will fold:
13678 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13679 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2
13680 // Other code patterns that can also be modified have the following form:
13681 // b + ((a << 1) | 510)
13682 // b + ((a << 1) & 510)
13683 // b + ((a << 1) ^ 510)
13684 // b + ((a << 1) + 510)
13685
13686 // Many instructions can perform the shift for free, but it requires both
13687 // the operands to be registers. If c1 << c2 is too large, a mov immediate
13688 // instruction will be needed. So, unfold back to the original pattern if:
13689 // - c1 and c2 are small enough that they don't require mov imms, and
13690 // - the user(s) of the node can perform a shl.
13691
13692 // No shifted operands for 16-bit instructions.
13693 if (ST->isThumb() && ST->isThumb1Only())
13694 return SDValue();
13695
13696 // Check that all the users could perform the shl themselves.
13697 for (auto *U : N->users()) {
13698 switch(U->getOpcode()) {
13699 default:
13700 return SDValue();
13701 case ISD::SUB:
13702 case ISD::ADD:
13703 case ISD::AND:
13704 case ISD::OR:
13705 case ISD::XOR:
13706 case ISD::SETCC:
13707 case ARMISD::CMP:
13708 // Check that the user isn't already using a constant because there
13709 // aren't any instructions that support an immediate operand and a
13710 // shifted operand.
13711 if (isa<ConstantSDNode>(U->getOperand(0)) ||
13712 isa<ConstantSDNode>(U->getOperand(1)))
13713 return SDValue();
13714
13715 // Check that it's not already using a shift.
13716 if (U->getOperand(0).getOpcode() == ISD::SHL ||
13717 U->getOperand(1).getOpcode() == ISD::SHL)
13718 return SDValue();
13719 break;
13720 }
13721 }
13722
13723 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13724 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13725 return SDValue();
13726
13727 if (N->getOperand(0).getOpcode() != ISD::SHL)
13728 return SDValue();
13729
13730 SDValue SHL = N->getOperand(0);
13731
13732 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
13733 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
13734 if (!C1ShlC2 || !C2)
13735 return SDValue();
13736
13737 APInt C2Int = C2->getAPIntValue();
13738 APInt C1Int = C1ShlC2->getAPIntValue();
13739 unsigned C2Width = C2Int.getBitWidth();
13740 if (C2Int.uge(C2Width))
13741 return SDValue();
13742 uint64_t C2Value = C2Int.getZExtValue();
13743
13744 // Check that performing a lshr will not lose any information.
13745 APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
13746 if ((C1Int & Mask) != C1Int)
13747 return SDValue();
13748
13749 // Shift the first constant.
13750 C1Int.lshrInPlace(C2Int);
13751
13752 // The immediates are encoded as an 8-bit value that can be rotated.
13753 auto LargeImm = [](const APInt &Imm) {
13754 unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
13755 return Imm.getBitWidth() - Zeros > 8;
13756 };
13757
13758 if (LargeImm(C1Int) || LargeImm(C2Int))
13759 return SDValue();
13760
13761 SelectionDAG &DAG = DCI.DAG;
13762 SDLoc dl(N);
13763 SDValue X = SHL.getOperand(0);
13764 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
13765 DAG.getConstant(C1Int, dl, MVT::i32));
13766 // Shift left to compensate for the lshr of C1Int.
13767 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
13768
13769 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
13770 SHL.dump(); N->dump());
13771 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
13772 return Res;
13773}
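A minimal sketch of the identity the unfolding above relies on, with illustrative values only: when c1 has no bits below c2 (so the earlier lshr check loses nothing), (x << c2) op c1 equals (x op (c1 >> c2)) << c2 for the bitwise and add forms handled here.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0xABCD;
  const uint32_t C2 = 1, C1 = 510;  // 510 == 255 << 1, so the low C2 bits of C1 are zero
  assert(((X << C2) + C1) == ((X + (C1 >> C2)) << C2));
  assert(((X << C2) | C1) == ((X | (C1 >> C2)) << C2));
  assert(((X << C2) ^ C1) == ((X ^ (C1 >> C2)) << C2));
  return 0;
}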
13774
13775
13776/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
13777///
13778 static SDValue PerformADDCombine(SDNode *N,
13779 TargetLowering::DAGCombinerInfo &DCI,
13780 const ARMSubtarget *Subtarget) {
13781 SDValue N0 = N->getOperand(0);
13782 SDValue N1 = N->getOperand(1);
13783
13784 // Only works one way, because it needs an immediate operand.
13785 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
13786 return Result;
13787
13788 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
13789 return Result;
13790
13791 // First try with the default operand order.
13792 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
13793 return Result;
13794
13795 // If that didn't work, try again with the operands commuted.
13796 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
13797}
13798
13799// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
13800// providing -X is as cheap as X (currently, just a constant).
13801 static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
13802 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
13803 return SDValue();
13804 SDValue CSINC = N->getOperand(1);
13805 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
13806 return SDValue();
13807
13808 ConstantSDNode *X = isConstOrConstSplat(CSINC.getOperand(0));
13809 if (!X)
13810 return SDValue();
13811
13812 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
13813 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
13814 CSINC.getOperand(0)),
13815 CSINC.getOperand(1), CSINC.getOperand(2),
13816 CSINC.getOperand(3));
13817}
13818
13819 static bool isNegatedInteger(SDValue Op) {
13820 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
13821}
13822
13823// Try to fold
13824//
13825// (neg (cmov X, Y)) -> (cmov (neg X), (neg Y))
13826//
13827// The folding helps cmov to be matched with csneg without generating
13828 // a redundant neg instruction.
13829 static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG) {
13830 if (!isNegatedInteger(SDValue(N, 0)))
13831 return SDValue();
13832
13833 SDValue CMov = N->getOperand(1);
13834 if (CMov.getOpcode() != ARMISD::CMOV || !CMov->hasOneUse())
13835 return SDValue();
13836
13837 SDValue N0 = CMov.getOperand(0);
13838 SDValue N1 = CMov.getOperand(1);
13839
13840 // If neither of them is a negation, the fold is not worthwhile, as it
13841 // introduces two additional negations while removing only one.
13842 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
13843 return SDValue();
13844
13845 SDLoc DL(N);
13846 EVT VT = CMov.getValueType();
13847
13848 SDValue N0N = DAG.getNegative(N0, DL, VT);
13849 SDValue N1N = DAG.getNegative(N1, DL, VT);
13850 return DAG.getNode(ARMISD::CMOV, DL, VT, N0N, N1N, CMov.getOperand(2),
13851 CMov.getOperand(3));
13852}
13853
13854/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
13855///
13856 static SDValue PerformSUBCombine(SDNode *N,
13857 TargetLowering::DAGCombinerInfo &DCI,
13858 const ARMSubtarget *Subtarget) {
13859 SDValue N0 = N->getOperand(0);
13860 SDValue N1 = N->getOperand(1);
13861
13862 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
13863 if (N1.getNode()->hasOneUse())
13864 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
13865 return Result;
13866
13867 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
13868 return R;
13869
13870 if (SDValue Val = performNegCMovCombine(N, DCI.DAG))
13871 return Val;
13872
13873 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
13874 return SDValue();
13875
13876 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
13877 // so that we can readily pattern match more mve instructions which can use
13878 // a scalar operand.
13879 SDValue VDup = N->getOperand(1);
13880 if (VDup->getOpcode() != ARMISD::VDUP)
13881 return SDValue();
13882
13883 SDValue VMov = N->getOperand(0);
13884 if (VMov->getOpcode() == ISD::BITCAST)
13885 VMov = VMov->getOperand(0);
13886
13887 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
13888 return SDValue();
13889
13890 SDLoc dl(N);
13891 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
13892 DCI.DAG.getConstant(0, dl, MVT::i32),
13893 VDup->getOperand(0));
13894 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
13895}
13896
13897/// PerformVMULCombine
13898/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
13899/// special multiplier accumulator forwarding.
13900/// vmul d3, d0, d2
13901/// vmla d3, d1, d2
13902/// is faster than
13903/// vadd d3, d0, d1
13904/// vmul d3, d3, d2
13905// However, for (A + B) * (A + B),
13906// vadd d2, d0, d1
13907// vmul d3, d0, d2
13908// vmla d3, d1, d2
13909// is slower than
13910// vadd d2, d0, d1
13911// vmul d3, d2, d2
13912 static SDValue PerformVMULCombine(SDNode *N,
13913 TargetLowering::DAGCombinerInfo &DCI,
13914 const ARMSubtarget *Subtarget) {
13915 if (!Subtarget->hasVMLxForwarding())
13916 return SDValue();
13917
13918 SelectionDAG &DAG = DCI.DAG;
13919 SDValue N0 = N->getOperand(0);
13920 SDValue N1 = N->getOperand(1);
13921 unsigned Opcode = N0.getOpcode();
13922 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
13923 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
13924 Opcode = N1.getOpcode();
13925 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
13926 Opcode != ISD::FADD && Opcode != ISD::FSUB)
13927 return SDValue();
13928 std::swap(N0, N1);
13929 }
13930
13931 if (N0 == N1)
13932 return SDValue();
13933
13934 EVT VT = N->getValueType(0);
13935 SDLoc DL(N);
13936 SDValue N00 = N0->getOperand(0);
13937 SDValue N01 = N0->getOperand(1);
13938 return DAG.getNode(Opcode, DL, VT,
13939 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
13940 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
13941}
13942
13943 static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
13944 const ARMSubtarget *Subtarget) {
13945 EVT VT = N->getValueType(0);
13946 if (VT != MVT::v2i64)
13947 return SDValue();
13948
13949 SDValue N0 = N->getOperand(0);
13950 SDValue N1 = N->getOperand(1);
13951
13952 auto IsSignExt = [&](SDValue Op) {
13953 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
13954 return SDValue();
13955 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
13956 if (VT.getScalarSizeInBits() == 32)
13957 return Op->getOperand(0);
13958 return SDValue();
13959 };
13960 auto IsZeroExt = [&](SDValue Op) {
13961 // Zero extends are a little more awkward. At the point we are matching
13962 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
13963 // That might be before or after a bitcast depending on how the and is
13964 // placed. Because this has to look through bitcasts, it is currently only
13965 // supported on LE.
13966 if (!Subtarget->isLittle())
13967 return SDValue();
13968
13969 SDValue And = Op;
13970 if (And->getOpcode() == ISD::BITCAST)
13971 And = And->getOperand(0);
13972 if (And->getOpcode() != ISD::AND)
13973 return SDValue();
13974 SDValue Mask = And->getOperand(1);
13975 if (Mask->getOpcode() == ISD::BITCAST)
13976 Mask = Mask->getOperand(0);
13977
13978 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
13979 Mask.getValueType() != MVT::v4i32)
13980 return SDValue();
13981 if (isAllOnesConstant(Mask->getOperand(0)) &&
13982 isNullConstant(Mask->getOperand(1)) &&
13983 isAllOnesConstant(Mask->getOperand(2)) &&
13984 isNullConstant(Mask->getOperand(3)))
13985 return And->getOperand(0);
13986 return SDValue();
13987 };
13988
13989 SDLoc dl(N);
13990 if (SDValue Op0 = IsSignExt(N0)) {
13991 if (SDValue Op1 = IsSignExt(N1)) {
13992 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
13993 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
13994 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
13995 }
13996 }
13997 if (SDValue Op0 = IsZeroExt(N0)) {
13998 if (SDValue Op1 = IsZeroExt(N1)) {
13999 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14000 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14001 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
14002 }
14003 }
14004
14005 return SDValue();
14006}
14007
14008 static SDValue PerformMULCombine(SDNode *N,
14009 TargetLowering::DAGCombinerInfo &DCI,
14010 const ARMSubtarget *Subtarget) {
14011 SelectionDAG &DAG = DCI.DAG;
14012
14013 EVT VT = N->getValueType(0);
14014 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
14015 return PerformMVEVMULLCombine(N, DAG, Subtarget);
14016
14017 if (Subtarget->isThumb1Only())
14018 return SDValue();
14019
14020 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14021 return SDValue();
14022
14023 if (VT.is64BitVector() || VT.is128BitVector())
14024 return PerformVMULCombine(N, DCI, Subtarget);
14025 if (VT != MVT::i32)
14026 return SDValue();
14027
14028 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14029 if (!C)
14030 return SDValue();
14031
14032 int64_t MulAmt = C->getSExtValue();
14033 unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);
14034
14035 ShiftAmt = ShiftAmt & (32 - 1);
14036 SDValue V = N->getOperand(0);
14037 SDLoc DL(N);
14038
14039 SDValue Res;
14040 MulAmt >>= ShiftAmt;
14041
14042 if (MulAmt >= 0) {
14043 if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
14044 // (mul x, 2^N + 1) => (add (shl x, N), x)
14045 Res = DAG.getNode(ISD::ADD, DL, VT,
14046 V,
14047 DAG.getNode(ISD::SHL, DL, VT,
14048 V,
14049 DAG.getConstant(Log2_32(MulAmt - 1), DL,
14050 MVT::i32)));
14051 } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
14052 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14053 Res = DAG.getNode(ISD::SUB, DL, VT,
14054 DAG.getNode(ISD::SHL, DL, VT,
14055 V,
14056 DAG.getConstant(Log2_32(MulAmt + 1), DL,
14057 MVT::i32)),
14058 V);
14059 } else
14060 return SDValue();
14061 } else {
14062 uint64_t MulAmtAbs = -MulAmt;
14063 if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
14064 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
14065 Res = DAG.getNode(ISD::SUB, DL, VT,
14066 V,
14067 DAG.getNode(ISD::SHL, DL, VT,
14068 V,
14069 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
14070 MVT::i32)));
14071 } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
14072 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14073 Res = DAG.getNode(ISD::ADD, DL, VT,
14074 V,
14075 DAG.getNode(ISD::SHL, DL, VT,
14076 V,
14077 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
14078 MVT::i32)));
14079 Res = DAG.getNode(ISD::SUB, DL, VT,
14080 DAG.getConstant(0, DL, MVT::i32), Res);
14081 } else
14082 return SDValue();
14083 }
14084
14085 if (ShiftAmt != 0)
14086 Res = DAG.getNode(ISD::SHL, DL, VT,
14087 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
14088
14089 // Do not add new nodes to DAG combiner worklist.
14090 DCI.CombineTo(N, Res, false);
14091 return SDValue();
14092}
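A minimal sketch of the scalar identities used by the mul-by-constant rewrite above, with illustrative values only: the constant is split into trailing zeros plus an odd part of the form 2^N +/- 1, e.g. 36 = (2^3 + 1) << 2.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0x12345678;
  assert(X * 9u == (X << 3) + X);            // 2^N + 1: add of a shift
  assert(X * 7u == (X << 3) - X);            // 2^N - 1: sub of a shift
  assert(X * 36u == (((X << 3) + X) << 2));  // odd part first, then the trailing zeros
  assert(X * (uint32_t)-7 == X - (X << 3));  // -(2^N - 1): operands swapped
  return 0;
}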
14093
14094 static SDValue CombineANDShift(SDNode *N,
14095 TargetLowering::DAGCombinerInfo &DCI,
14096 const ARMSubtarget *Subtarget) {
14097 // Allow DAGCombine to pattern-match before we touch the canonical form.
14098 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14099 return SDValue();
14100
14101 if (N->getValueType(0) != MVT::i32)
14102 return SDValue();
14103
14104 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14105 if (!N1C)
14106 return SDValue();
14107
14108 uint32_t C1 = (uint32_t)N1C->getZExtValue();
14109 // Don't transform uxtb/uxth.
14110 if (C1 == 255 || C1 == 65535)
14111 return SDValue();
14112
14113 SDNode *N0 = N->getOperand(0).getNode();
14114 if (!N0->hasOneUse())
14115 return SDValue();
14116
14117 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
14118 return SDValue();
14119
14120 bool LeftShift = N0->getOpcode() == ISD::SHL;
14121
14122 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
14123 if (!N01C)
14124 return SDValue();
14125
14126 uint32_t C2 = (uint32_t)N01C->getZExtValue();
14127 if (!C2 || C2 >= 32)
14128 return SDValue();
14129
14130 // Clear irrelevant bits in the mask.
14131 if (LeftShift)
14132 C1 &= (-1U << C2);
14133 else
14134 C1 &= (-1U >> C2);
14135
14136 SelectionDAG &DAG = DCI.DAG;
14137 SDLoc DL(N);
14138
14139 // We have a pattern of the form "(and (shl x, c2) c1)" or
14140 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
14141 // transform to a pair of shifts, to save materializing c1.
14142
14143 // First pattern: right shift, then mask off leading bits.
14144 // FIXME: Use demanded bits?
14145 if (!LeftShift && isMask_32(C1)) {
14146 uint32_t C3 = llvm::countl_zero(C1);
14147 if (C2 < C3) {
14148 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14149 DAG.getConstant(C3 - C2, DL, MVT::i32));
14150 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14151 DAG.getConstant(C3, DL, MVT::i32));
14152 }
14153 }
14154
14155 // First pattern, reversed: left shift, then mask off trailing bits.
14156 if (LeftShift && isMask_32(~C1)) {
14157 uint32_t C3 = llvm::countr_zero(C1);
14158 if (C2 < C3) {
14159 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14160 DAG.getConstant(C3 - C2, DL, MVT::i32));
14161 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14162 DAG.getConstant(C3, DL, MVT::i32));
14163 }
14164 }
14165
14166 // Second pattern: left shift, then mask off leading bits.
14167 // FIXME: Use demanded bits?
14168 if (LeftShift && isShiftedMask_32(C1)) {
14169 uint32_t Trailing = llvm::countr_zero(C1);
14170 uint32_t C3 = llvm::countl_zero(C1);
14171 if (Trailing == C2 && C2 + C3 < 32) {
14172 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14173 DAG.getConstant(C2 + C3, DL, MVT::i32));
14174 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14175 DAG.getConstant(C3, DL, MVT::i32));
14176 }
14177 }
14178
14179 // Second pattern, reversed: right shift, then mask off trailing bits.
14180 // FIXME: Handle other patterns of known/demanded bits.
14181 if (!LeftShift && isShiftedMask_32(C1)) {
14182 uint32_t Leading = llvm::countl_zero(C1);
14183 uint32_t C3 = llvm::countr_zero(C1);
14184 if (Leading == C2 && C2 + C3 < 32) {
14185 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14186 DAG.getConstant(C2 + C3, DL, MVT::i32));
14187 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14188 DAG.getConstant(C3, DL, MVT::i32));
14189 }
14190 }
14191
14192 // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
14193 // if "c1 >> c2" is a cheaper immediate than "c1"
14194 if (LeftShift &&
14195 HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {
14196
14197 SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
14198 DAG.getConstant(C1 >> C2, DL, MVT::i32));
14199 return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
14200 DAG.getConstant(C2, DL, MVT::i32));
14201 }
14202
14203 return SDValue();
14204}
14205
14206 static SDValue PerformANDCombine(SDNode *N,
14207                                  TargetLowering::DAGCombinerInfo &DCI,
14208                                  const ARMSubtarget *Subtarget) {
14209 // Attempt to use immediate-form VBIC
14210 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14211 SDLoc dl(N);
14212 EVT VT = N->getValueType(0);
14213 SelectionDAG &DAG = DCI.DAG;
14214
14215 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14216 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14217 return SDValue();
14218
14219 APInt SplatBits, SplatUndef;
14220 unsigned SplatBitSize;
14221 bool HasAnyUndefs;
14222 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14223 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14224 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14225 SplatBitSize == 64) {
14226 EVT VbicVT;
14227 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
14228 SplatUndef.getZExtValue(), SplatBitSize,
14229 DAG, dl, VbicVT, VT, OtherModImm);
14230 if (Val.getNode()) {
14231 SDValue Input =
14232 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
14233 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
14234 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
14235 }
14236 }
14237 }
14238
14239 if (!Subtarget->isThumb1Only()) {
14240 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
14241 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
14242 return Result;
14243
14244 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14245 return Result;
14246 }
14247
14248 if (Subtarget->isThumb1Only())
14249 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14250 return Result;
14251
14252 return SDValue();
14253}
14254
14255// Try combining OR nodes to SMULWB, SMULWT.
14256 static SDValue PerformORCombineToSMULWBT(SDNode *OR,
14257                                          TargetLowering::DAGCombinerInfo &DCI,
14258                                          const ARMSubtarget *Subtarget) {
14259 if (!Subtarget->hasV6Ops() ||
14260 (Subtarget->isThumb() &&
14261 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
14262 return SDValue();
14263
14264 SDValue SRL = OR->getOperand(0);
14265 SDValue SHL = OR->getOperand(1);
14266
14267 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
14268 SRL = OR->getOperand(1);
14269 SHL = OR->getOperand(0);
14270 }
14271 if (!isSRL16(SRL) || !isSHL16(SHL))
14272 return SDValue();
14273
14274 // The first operands to the shifts need to be the two results from the
14275 // same smul_lohi node.
14276 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
14277 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
14278 return SDValue();
14279
14280 SDNode *SMULLOHI = SRL.getOperand(0).getNode();
14281 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
14282 SHL.getOperand(0) != SDValue(SMULLOHI, 1))
14283 return SDValue();
14284
14285 // Now we have:
14286   //   (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
14287   // For SMULW[B|T], smul_lohi takes a 32-bit and a 16-bit argument.
14288   // For SMULWB the 16-bit value will be sign extended somehow.
14289 // For SMULWT only the SRA is required.
14290 // Check both sides of SMUL_LOHI
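  // SMULWB multiplies a 32-bit operand by the sign-extended bottom half of a
  // second operand and keeps the top 32 bits of the 48-bit product; SMULWT
  // does the same with the top half, which is why only the SRA is needed there.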
14291 SDValue OpS16 = SMULLOHI->getOperand(0);
14292 SDValue OpS32 = SMULLOHI->getOperand(1);
14293
14294 SelectionDAG &DAG = DCI.DAG;
14295 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
14296 OpS16 = OpS32;
14297 OpS32 = SMULLOHI->getOperand(0);
14298 }
14299
14300 SDLoc dl(OR);
14301 unsigned Opcode = 0;
14302 if (isS16(OpS16, DAG))
14303 Opcode = ARMISD::SMULWB;
14304 else if (isSRA16(OpS16)) {
14305 Opcode = ARMISD::SMULWT;
14306 OpS16 = OpS16->getOperand(0);
14307 }
14308 else
14309 return SDValue();
14310
14311 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
14312 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
14313 return SDValue(OR, 0);
14314}
14315
14316 static SDValue PerformORCombineToBFI(SDNode *N,
14317                                      TargetLowering::DAGCombinerInfo &DCI,
14318                                      const ARMSubtarget *Subtarget) {
14319 // BFI is only available on V6T2+
14320 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14321 return SDValue();
14322
14323 EVT VT = N->getValueType(0);
14324 SDValue N0 = N->getOperand(0);
14325 SDValue N1 = N->getOperand(1);
14326 SelectionDAG &DAG = DCI.DAG;
14327 SDLoc DL(N);
14328 // 1) or (and A, mask), val => ARMbfi A, val, mask
14329 // iff (val & mask) == val
14330 //
14331 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14332 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14333 // && mask == ~mask2
14334 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14335 // && ~mask == mask2
14336 // (i.e., copy a bitfield value into another bitfield of the same width)
14337
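  // For example, in case (1) with mask = 0xFFFFFF0F and val = 0x50,
  // (val & ~mask) == val, so the node becomes (ARMbfi A, 0x5, 0xFFFFFF0F),
  // inserting the 4-bit value 5 into bits [7:4] of A.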
14338 if (VT != MVT::i32)
14339 return SDValue();
14340
14341 SDValue N00 = N0.getOperand(0);
14342
14343 // The value and the mask need to be constants so we can verify this is
14344 // actually a bitfield set. If the mask is 0xffff, we can do better
14345 // via a movt instruction, so don't use BFI in that case.
14346 SDValue MaskOp = N0.getOperand(1);
14347   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
14348   if (!MaskC)
14349 return SDValue();
14350 unsigned Mask = MaskC->getZExtValue();
14351 if (Mask == 0xffff)
14352 return SDValue();
14353 SDValue Res;
14354 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14355   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
14356   if (N1C) {
14357 unsigned Val = N1C->getZExtValue();
14358 if ((Val & ~Mask) != Val)
14359 return SDValue();
14360
14361 if (ARM::isBitFieldInvertedMask(Mask)) {
14362 Val >>= llvm::countr_zero(~Mask);
14363
14364 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14365 DAG.getConstant(Val, DL, MVT::i32),
14366 DAG.getConstant(Mask, DL, MVT::i32));
14367
14368 DCI.CombineTo(N, Res, false);
14369       // Return value from the original node to inform the combiner that N is
14370 // now dead.
14371 return SDValue(N, 0);
14372 }
14373 } else if (N1.getOpcode() == ISD::AND) {
14374 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14375     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14376     if (!N11C)
14377 return SDValue();
14378 unsigned Mask2 = N11C->getZExtValue();
14379
14380 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
14381 // as is to match.
14382 if (ARM::isBitFieldInvertedMask(Mask) &&
14383 (Mask == ~Mask2)) {
14384 // The pack halfword instruction works better for masks that fit it,
14385 // so use that when it's available.
14386 if (Subtarget->hasDSP() &&
14387 (Mask == 0xffff || Mask == 0xffff0000))
14388 return SDValue();
14389 // 2a
14390 unsigned amt = llvm::countr_zero(Mask2);
14391 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14392 DAG.getConstant(amt, DL, MVT::i32));
14393 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14394 DAG.getConstant(Mask, DL, MVT::i32));
14395 DCI.CombineTo(N, Res, false);
14396       // Return value from the original node to inform the combiner that N is
14397 // now dead.
14398 return SDValue(N, 0);
14399 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14400 (~Mask == Mask2)) {
14401 // The pack halfword instruction works better for masks that fit it,
14402 // so use that when it's available.
14403 if (Subtarget->hasDSP() &&
14404 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14405 return SDValue();
14406 // 2b
14407 unsigned lsb = llvm::countr_zero(Mask);
14408 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14409 DAG.getConstant(lsb, DL, MVT::i32));
14410 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14411 DAG.getConstant(Mask2, DL, MVT::i32));
14412 DCI.CombineTo(N, Res, false);
14413       // Return value from the original node to inform the combiner that N is
14414 // now dead.
14415 return SDValue(N, 0);
14416 }
14417 }
14418
14419 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14420 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14421       ARM::isBitFieldInvertedMask(~Mask)) {
14422     // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14423 // where lsb(mask) == #shamt and masked bits of B are known zero.
14424 SDValue ShAmt = N00.getOperand(1);
14425 unsigned ShAmtC = ShAmt->getAsZExtVal();
14426 unsigned LSB = llvm::countr_zero(Mask);
14427 if (ShAmtC != LSB)
14428 return SDValue();
14429
14430 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14431 DAG.getConstant(~Mask, DL, MVT::i32));
14432
14433 DCI.CombineTo(N, Res, false);
14434     // Return value from the original node to inform the combiner that N is
14435 // now dead.
14436 return SDValue(N, 0);
14437 }
14438
14439 return SDValue();
14440}
14441
14442static bool isValidMVECond(unsigned CC, bool IsFloat) {
14443 switch (CC) {
14444 case ARMCC::EQ:
14445 case ARMCC::NE:
14446 case ARMCC::LE:
14447 case ARMCC::GT:
14448 case ARMCC::GE:
14449 case ARMCC::LT:
14450 return true;
14451 case ARMCC::HS:
14452 case ARMCC::HI:
14453 return !IsFloat;
14454 default:
14455 return false;
14456 };
14457}
14458
14459 static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
14460   if (N->getOpcode() == ARMISD::VCMP)
14461 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14462 else if (N->getOpcode() == ARMISD::VCMPZ)
14463 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14464 else
14465 llvm_unreachable("Not a VCMP/VCMPZ!");
14466}
14467
14468 static bool CanInvertMVEVCMP(SDValue N) {
14469   ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
14470   return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14471}
14472
14473 static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG,
14474                                    const ARMSubtarget *Subtarget) {
14475 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
14476 // together with predicates
14477 EVT VT = N->getValueType(0);
14478 SDLoc DL(N);
14479 SDValue N0 = N->getOperand(0);
14480 SDValue N1 = N->getOperand(1);
14481
14482 auto IsFreelyInvertable = [&](SDValue V) {
14483 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14484 return CanInvertMVEVCMP(V);
14485 return false;
14486 };
14487
14488 // At least one operand must be freely invertable.
14489 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14490 return SDValue();
14491
14492 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14493 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14494 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14495 return DAG.getLogicalNOT(DL, And, VT);
14496}
14497
14498/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
14499 static SDValue PerformORCombine(SDNode *N,
14500                                 TargetLowering::DAGCombinerInfo &DCI,
14501                                 const ARMSubtarget *Subtarget) {
14502 // Attempt to use immediate-form VORR
14503 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14504 SDLoc dl(N);
14505 EVT VT = N->getValueType(0);
14506 SelectionDAG &DAG = DCI.DAG;
14507
14508   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14509 return SDValue();
14510
14511 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14512 VT == MVT::v8i1 || VT == MVT::v16i1))
14513 return PerformORCombine_i1(N, DAG, Subtarget);
14514
14515 APInt SplatBits, SplatUndef;
14516 unsigned SplatBitSize;
14517 bool HasAnyUndefs;
14518 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14519 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14520 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14521 SplatBitSize == 64) {
14522 EVT VorrVT;
14523 SDValue Val =
14524 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14525 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14526 if (Val.getNode()) {
14527 SDValue Input =
14528 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
14529 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14530 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
14531 }
14532 }
14533 }
14534
14535 if (!Subtarget->isThumb1Only()) {
14536 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14537 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14538 return Result;
14539 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14540 return Result;
14541 }
14542
14543 SDValue N0 = N->getOperand(0);
14544 SDValue N1 = N->getOperand(1);
14545
14546 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
14547 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14548       DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
14549 
14550 // The code below optimizes (or (and X, Y), Z).
14551 // The AND operand needs to have a single user to make these optimizations
14552 // profitable.
14553 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14554 return SDValue();
14555
14556 APInt SplatUndef;
14557 unsigned SplatBitSize;
14558 bool HasAnyUndefs;
14559
14560 APInt SplatBits0, SplatBits1;
14561     BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
14562     BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
14563     // Ensure that the second operands of both ANDs are constants.
14564 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14565 HasAnyUndefs) && !HasAnyUndefs) {
14566 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14567 HasAnyUndefs) && !HasAnyUndefs) {
14568 // Ensure that the bit width of the constants are the same and that
14569 // the splat arguments are logical inverses as per the pattern we
14570 // are trying to simplify.
14571 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14572 SplatBits0 == ~SplatBits1) {
14573 // Canonicalize the vector type to make instruction selection
14574 // simpler.
14575 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14576 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14577 N0->getOperand(1),
14578 N0->getOperand(0),
14579 N1->getOperand(0));
14580 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);
14581 }
14582 }
14583 }
14584 }
14585
14586 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14587 // reasonable.
14588 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14589 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14590 return Res;
14591 }
14592
14593 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14594 return Result;
14595
14596 return SDValue();
14597}
14598
14599 static SDValue PerformXORCombine(SDNode *N,
14600                                  TargetLowering::DAGCombinerInfo &DCI,
14601                                  const ARMSubtarget *Subtarget) {
14602 EVT VT = N->getValueType(0);
14603 SelectionDAG &DAG = DCI.DAG;
14604
14605   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14606 return SDValue();
14607
14608 if (!Subtarget->isThumb1Only()) {
14609 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14610 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14611 return Result;
14612
14613 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14614 return Result;
14615 }
14616
14617 if (Subtarget->hasMVEIntegerOps()) {
14618 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14619 SDValue N0 = N->getOperand(0);
14620 SDValue N1 = N->getOperand(1);
14621 const TargetLowering *TLI = Subtarget->getTargetLowering();
14622 if (TLI->isConstTrueVal(N1) &&
14623 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14624 if (CanInvertMVEVCMP(N0)) {
14625 SDLoc DL(N0);
14626         ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));
14627 
14628         SmallVector<SDValue, 4> Ops;
14629         Ops.push_back(N0->getOperand(0));
14630 if (N0->getOpcode() == ARMISD::VCMP)
14631 Ops.push_back(N0->getOperand(1));
14632 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14633 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14634 }
14635 }
14636 }
14637
14638 return SDValue();
14639}
14640
14641// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14642// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14643// their position in "to" (Rd).
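// For example, (BFI Rd, Rn, 0xFFFF00FF) yields ToMask = 0x0000FF00 and
// FromMask = 0x000000FF; if Rn is (srl X, 8), the returned value is X and
// FromMask is shifted up to 0x0000FF00.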
14644static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14645 assert(N->getOpcode() == ARMISD::BFI);
14646
14647 SDValue From = N->getOperand(1);
14648 ToMask = ~N->getConstantOperandAPInt(2);
14649 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14650
14651 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14652 // #C in the base of the SHR.
14653 if (From->getOpcode() == ISD::SRL &&
14654 isa<ConstantSDNode>(From->getOperand(1))) {
14655 APInt Shift = From->getConstantOperandAPInt(1);
14656 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14657 FromMask <<= Shift.getLimitedValue(31);
14658 From = From->getOperand(0);
14659 }
14660
14661 return From;
14662}
14663
14664// If A and B contain one contiguous set of bits, does A | B == A . B?
14665//
14666 // Neither A nor B may be zero.
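// For example, A = 0x0F00 and B = 0x00F0 concatenate properly (A's lowest set
// bit sits directly above B's highest set bit), while A = 0x0F00 and
// B = 0x000F do not.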
14667static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14668 unsigned LastActiveBitInA = A.countr_zero();
14669 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14670 return LastActiveBitInA - 1 == FirstActiveBitInB;
14671}
14672
14673 static SDValue FindBFIToCombineWith(SDNode *N) {
14674   // We have a BFI in N. Find a BFI it can combine with, if one exists.
14675 APInt ToMask, FromMask;
14676 SDValue From = ParseBFI(N, ToMask, FromMask);
14677 SDValue To = N->getOperand(0);
14678
14679 SDValue V = To;
14680 if (V.getOpcode() != ARMISD::BFI)
14681 return SDValue();
14682
14683 APInt NewToMask, NewFromMask;
14684 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
14685 if (NewFrom != From)
14686 return SDValue();
14687
14688 // Do the written bits conflict with any we've seen so far?
14689 if ((NewToMask & ToMask).getBoolValue())
14690 // Conflicting bits.
14691 return SDValue();
14692
14693 // Are the new bits contiguous when combined with the old bits?
14694 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14695 BitsProperlyConcatenate(FromMask, NewFromMask))
14696 return V;
14697 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14698 BitsProperlyConcatenate(NewFromMask, FromMask))
14699 return V;
14700
14701 return SDValue();
14702}
14703
14704 static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
14705   SDValue N0 = N->getOperand(0);
14706 SDValue N1 = N->getOperand(1);
14707
14708 if (N1.getOpcode() == ISD::AND) {
14709 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14710 // the bits being cleared by the AND are not demanded by the BFI.
14711     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14712     if (!N11C)
14713 return SDValue();
14714 unsigned InvMask = N->getConstantOperandVal(2);
14715 unsigned LSB = llvm::countr_zero(~InvMask);
14716 unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
14717 assert(Width <
14718 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14719 "undefined behavior");
14720 unsigned Mask = (1u << Width) - 1;
14721 unsigned Mask2 = N11C->getZExtValue();
14722 if ((Mask & (~Mask2)) == 0)
14723 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
14724 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
14725 return SDValue();
14726 }
14727
14728 // Look for another BFI to combine with.
14729 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
14730 // We've found a BFI.
14731 APInt ToMask1, FromMask1;
14732 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
14733
14734 APInt ToMask2, FromMask2;
14735 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
14736 assert(From1 == From2);
14737 (void)From2;
14738
14739 // Create a new BFI, combining the two together.
14740 APInt NewFromMask = FromMask1 | FromMask2;
14741 APInt NewToMask = ToMask1 | ToMask2;
14742
14743 EVT VT = N->getValueType(0);
14744 SDLoc dl(N);
14745
14746 if (NewFromMask[0] == 0)
14747 From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
14748 DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
14749 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
14750 DAG.getConstant(~NewToMask, dl, VT));
14751 }
14752
14753 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
14754   // that lower bit insertions are performed first, provided that M1 and M2
14755   // do not overlap. This can allow multiple BFI instructions to be combined
14756 // together by the other folds above.
14757 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
14758 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
14759 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
14760
14761 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
14762 ToMask1.countl_zero() < ToMask2.countl_zero())
14763 return SDValue();
14764
14765 EVT VT = N->getValueType(0);
14766 SDLoc dl(N);
14767 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
14768 N->getOperand(1), N->getOperand(2));
14769 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
14770 N0.getOperand(2));
14771 }
14772
14773 return SDValue();
14774}
14775
14776// Check that N is CMPZ(CSINC(0, 0, CC, X)),
14777// or CMPZ(CMOV(1, 0, CC, X))
14778// return X if valid.
14779 static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
14780   if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
14781 return SDValue();
14782 SDValue CSInc = Cmp->getOperand(0);
14783
14784 // Ignore any `And 1` nodes that may not yet have been removed. We are
14785 // looking for a value that produces 1/0, so these have no effect on the
14786 // code.
14787 while (CSInc.getOpcode() == ISD::AND &&
14788 isa<ConstantSDNode>(CSInc.getOperand(1)) &&
14789 CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
14790 CSInc = CSInc.getOperand(0);
14791
14792 if (CSInc.getOpcode() == ARMISD::CSINC &&
14793 isNullConstant(CSInc.getOperand(0)) &&
14794 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
14795     CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
14796     return CSInc.getOperand(3);
14797 }
14798 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
14799 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
14800     CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
14801     return CSInc.getOperand(3);
14802 }
14803 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
14804 isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
14805     CC = ARMCC::getOppositeCondition(
14806         (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
14807     return CSInc.getOperand(3);
14808 }
14809 return SDValue();
14810}
14811
14812 static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
14813   // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
14814 // t92: flags = ARMISD::CMPZ t74, 0
14815 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
14816 // t96: flags = ARMISD::CMPZ t93, 0
14817 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
14818   ARMCC::CondCodes Cond;
14819   if (SDValue C = IsCMPZCSINC(N, Cond))
14820 if (Cond == ARMCC::EQ)
14821 return C;
14822 return SDValue();
14823}
14824
14825 static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) {
14826   // Fold away an unnecessary CMPZ/CSINC
14827 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
14828 // if C1==EQ -> CSXYZ A, B, C2, D
14829 // if C1==NE -> CSXYZ A, B, NOT(C2), D
14830   ARMCC::CondCodes Cond;
14831   if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
14832 if (N->getConstantOperandVal(2) == ARMCC::EQ)
14833 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
14834 N->getOperand(1),
14835 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
14836 if (N->getConstantOperandVal(2) == ARMCC::NE)
14837 return DAG.getNode(
14838 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
14839 N->getOperand(1),
14840           DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C);
14841   }
14842 return SDValue();
14843}
14844
14845/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
14846/// ARMISD::VMOVRRD.
14847 static SDValue PerformVMOVRRDCombine(SDNode *N,
14848                                      TargetLowering::DAGCombinerInfo &DCI,
14849                                      const ARMSubtarget *Subtarget) {
14850 // vmovrrd(vmovdrr x, y) -> x,y
14851 SDValue InDouble = N->getOperand(0);
14852 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
14853 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
14854
14855 // vmovrrd(load f64) -> (load i32), (load i32)
14856 SDNode *InNode = InDouble.getNode();
14857 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
14858 InNode->getValueType(0) == MVT::f64 &&
14859 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
14860 !cast<LoadSDNode>(InNode)->isVolatile()) {
14861 // TODO: Should this be done for non-FrameIndex operands?
14862 LoadSDNode *LD = cast<LoadSDNode>(InNode);
14863
14864 SelectionDAG &DAG = DCI.DAG;
14865 SDLoc DL(LD);
14866 SDValue BasePtr = LD->getBasePtr();
14867 SDValue NewLD1 =
14868 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
14869 LD->getAlign(), LD->getMemOperand()->getFlags());
14870
14871 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
14872 DAG.getConstant(4, DL, MVT::i32));
14873
14874 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
14875 LD->getPointerInfo().getWithOffset(4),
14876 commonAlignment(LD->getAlign(), 4),
14877 LD->getMemOperand()->getFlags());
14878
14879 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
14880 if (DCI.DAG.getDataLayout().isBigEndian())
14881 std::swap (NewLD1, NewLD2);
14882 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
14883 return Result;
14884 }
14885
14886 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
14887 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
14888 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14889 isa<ConstantSDNode>(InDouble.getOperand(1))) {
14890 SDValue BV = InDouble.getOperand(0);
14891 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
14892 // change lane order under big endian.
14893 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
14894 while (
14895 (BV.getOpcode() == ISD::BITCAST ||
14896 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
14897 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
14898 BVSwap = BV.getOpcode() == ISD::BITCAST;
14899 BV = BV.getOperand(0);
14900 }
14901 if (BV.getValueType() != MVT::v4i32)
14902 return SDValue();
14903
14904 // Handle buildvectors, pulling out the correct lane depending on
14905 // endianness.
14906 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
14907 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
14908 SDValue Op0 = BV.getOperand(Offset);
14909 SDValue Op1 = BV.getOperand(Offset + 1);
14910 if (!Subtarget->isLittle() && BVSwap)
14911 std::swap(Op0, Op1);
14912
14913 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
14914 }
14915
14916 // A chain of insert_vectors, grabbing the correct value of the chain of
14917 // inserts.
14918 SDValue Op0, Op1;
14919 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
14920 if (isa<ConstantSDNode>(BV.getOperand(2))) {
14921 if (BV.getConstantOperandVal(2) == Offset && !Op0)
14922 Op0 = BV.getOperand(1);
14923 if (BV.getConstantOperandVal(2) == Offset + 1 && !Op1)
14924 Op1 = BV.getOperand(1);
14925 }
14926 BV = BV.getOperand(0);
14927 }
14928 if (!Subtarget->isLittle() && BVSwap)
14929 std::swap(Op0, Op1);
14930 if (Op0 && Op1)
14931 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
14932 }
14933
14934 return SDValue();
14935}
14936
14937/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
14938/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
14939 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
14940   // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
14941 SDValue Op0 = N->getOperand(0);
14942 SDValue Op1 = N->getOperand(1);
14943 if (Op0.getOpcode() == ISD::BITCAST)
14944 Op0 = Op0.getOperand(0);
14945 if (Op1.getOpcode() == ISD::BITCAST)
14946 Op1 = Op1.getOperand(0);
14947 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
14948 Op0.getNode() == Op1.getNode() &&
14949 Op0.getResNo() == 0 && Op1.getResNo() == 1)
14950 return DAG.getNode(ISD::BITCAST, SDLoc(N),
14951 N->getValueType(0), Op0.getOperand(0));
14952 return SDValue();
14953}
14954
14955 static SDValue PerformVMOVhrCombine(SDNode *N,
14956                                     TargetLowering::DAGCombinerInfo &DCI) {
14957   SDValue Op0 = N->getOperand(0);
14958
14959 // VMOVhr (VMOVrh (X)) -> X
14960 if (Op0->getOpcode() == ARMISD::VMOVrh)
14961 return Op0->getOperand(0);
14962
14963   // FullFP16: half values are passed in S-registers, and we don't
14964   // need any of the bitcasts and moves:
14965 //
14966 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
14967 // t5: i32 = bitcast t2
14968 // t18: f16 = ARMISD::VMOVhr t5
14969 // =>
14970 // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
14971 if (Op0->getOpcode() == ISD::BITCAST) {
14972 SDValue Copy = Op0->getOperand(0);
14973 if (Copy.getValueType() == MVT::f32 &&
14974 Copy->getOpcode() == ISD::CopyFromReg) {
14975 bool HasGlue = Copy->getNumOperands() == 3;
14976 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
14977 HasGlue ? Copy->getOperand(2) : SDValue()};
14978 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
14979 SDValue NewCopy =
14980           DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(Copy),
14981                           DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
14982 ArrayRef(Ops, HasGlue ? 3 : 2));
14983
14984 // Update Users, Chains, and Potential Glue.
14985 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
14986 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
14987 if (HasGlue)
14988 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
14989 NewCopy.getValue(2));
14990
14991 return NewCopy;
14992 }
14993 }
14994
14995 // fold (VMOVhr (load x)) -> (load (f16*)x)
14996 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
14997 if (LN0->hasOneUse() && LN0->isUnindexed() &&
14998 LN0->getMemoryVT() == MVT::i16) {
14999 SDValue Load =
15000 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
15001 LN0->getBasePtr(), LN0->getMemOperand());
15002 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15003 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
15004 return Load;
15005 }
15006 }
15007
15008 // Only the bottom 16 bits of the source register are used.
15009 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15010 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15011 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
15012 return SDValue(N, 0);
15013
15014 return SDValue();
15015}
15016
15017 static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) {
15018   SDValue N0 = N->getOperand(0);
15019 EVT VT = N->getValueType(0);
15020
15021 // fold (VMOVrh (fpconst x)) -> const x
15022   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
15023     APFloat V = C->getValueAPF();
15024 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
15025 }
15026
15027 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15028 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
15029 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15030
15031 SDValue Load =
15032 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15033 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
15034 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15035 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
15036 return Load;
15037 }
15038
15039 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
15040 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15041       isa<ConstantSDNode>(N0->getOperand(1)))
15042     return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
15043 N0->getOperand(1));
15044
15045 return SDValue();
15046}
15047
15048/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15049/// are normal, non-volatile loads. If so, it is profitable to bitcast an
15050/// i64 vector to have f64 elements, since the value can then be loaded
15051/// directly into a VFP register.
15052 static bool hasNormalLoadOperand(SDNode *N) {
15053   unsigned NumElts = N->getValueType(0).getVectorNumElements();
15054 for (unsigned i = 0; i < NumElts; ++i) {
15055 SDNode *Elt = N->getOperand(i).getNode();
15056 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
15057 return true;
15058 }
15059 return false;
15060}
15061
15062/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15063/// ISD::BUILD_VECTOR.
15064 static SDValue PerformBUILD_VECTORCombine(SDNode *N,
15065                                           TargetLowering::DAGCombinerInfo &DCI,
15066                                           const ARMSubtarget *Subtarget) {
15067 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15068 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15069 // into a pair of GPRs, which is fine when the value is used as a scalar,
15070 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15071 SelectionDAG &DAG = DCI.DAG;
15072 if (N->getNumOperands() == 2)
15073 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15074 return RV;
15075
15076 // Load i64 elements as f64 values so that type legalization does not split
15077 // them up into i32 values.
15078 EVT VT = N->getValueType(0);
15079 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15080 return SDValue();
15081 SDLoc dl(N);
15082   SmallVector<SDValue, 8> Ops;
15083   unsigned NumElts = VT.getVectorNumElements();
15084 for (unsigned i = 0; i < NumElts; ++i) {
15085 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15086 Ops.push_back(V);
15087 // Make the DAGCombiner fold the bitcast.
15088 DCI.AddToWorklist(V.getNode());
15089 }
15090 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15091 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
15092 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
15093}
15094
15095/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
15096static SDValue
15097 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15098   // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15099 // At that time, we may have inserted bitcasts from integer to float.
15100 // If these bitcasts have survived DAGCombine, change the lowering of this
15101   // BUILD_VECTOR into something more vector friendly, i.e., that does not
15102 // force to use floating point types.
15103
15104 // Make sure we can change the type of the vector.
15105 // This is possible iff:
15106   // 1. The vector is only used in a bitcast to an integer type. I.e.,
15107 // 1.1. Vector is used only once.
15108 // 1.2. Use is a bit convert to an integer type.
15109 // 2. The size of its operands are 32-bits (64-bits are not legal).
15110 EVT VT = N->getValueType(0);
15111 EVT EltVT = VT.getVectorElementType();
15112
15113 // Check 1.1. and 2.
15114 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15115 return SDValue();
15116
15117 // By construction, the input type must be float.
15118 assert(EltVT == MVT::f32 && "Unexpected type!");
15119
15120 // Check 1.2.
15121 SDNode *Use = *N->user_begin();
15122 if (Use->getOpcode() != ISD::BITCAST ||
15123 Use->getValueType(0).isFloatingPoint())
15124 return SDValue();
15125
15126 // Check profitability.
15127 // Model is, if more than half of the relevant operands are bitcast from
15128 // i32, turn the build_vector into a sequence of insert_vector_elt.
15129 // Relevant operands are everything that is not statically
15130 // (i.e., at compile time) bitcasted.
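  // For example, a v4f32 ARMISD::BUILD_VECTOR with three operands bitcast
  // from i32 and one constant operand has NumOfRelevantElts = 3 and
  // NumOfBitCastedElts = 3, so it is rebuilt as i32 insert_vector_elts
  // followed by a single bitcast.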
15131 unsigned NumOfBitCastedElts = 0;
15132 unsigned NumElts = VT.getVectorNumElements();
15133 unsigned NumOfRelevantElts = NumElts;
15134 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15135 SDValue Elt = N->getOperand(Idx);
15136 if (Elt->getOpcode() == ISD::BITCAST) {
15137 // Assume only bit cast to i32 will go away.
15138 if (Elt->getOperand(0).getValueType() == MVT::i32)
15139 ++NumOfBitCastedElts;
15140 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
15141 // Constants are statically casted, thus do not count them as
15142 // relevant operands.
15143 --NumOfRelevantElts;
15144 }
15145
15146 // Check if more than half of the elements require a non-free bitcast.
15147 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15148 return SDValue();
15149
15150 SelectionDAG &DAG = DCI.DAG;
15151 // Create the new vector type.
15152 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15153 // Check if the type is legal.
15154 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15155 if (!TLI.isTypeLegal(VecVT))
15156 return SDValue();
15157
15158 // Combine:
15159 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15160 // => BITCAST INSERT_VECTOR_ELT
15161 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15162 // (BITCAST EN), N.
15163 SDValue Vec = DAG.getUNDEF(VecVT);
15164 SDLoc dl(N);
15165 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15166 SDValue V = N->getOperand(Idx);
15167 if (V.isUndef())
15168 continue;
15169 if (V.getOpcode() == ISD::BITCAST &&
15170 V->getOperand(0).getValueType() == MVT::i32)
15171 // Fold obvious case.
15172 V = V.getOperand(0);
15173 else {
15174 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15175 // Make the DAGCombiner fold the bitcasts.
15176 DCI.AddToWorklist(V.getNode());
15177 }
15178 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15179 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
15180 }
15181 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
15182 // Make the DAGCombiner fold the bitcasts.
15183 DCI.AddToWorklist(Vec.getNode());
15184 return Vec;
15185}
15186
15187static SDValue
15188 PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15189   EVT VT = N->getValueType(0);
15190 SDValue Op = N->getOperand(0);
15191 SDLoc dl(N);
15192
15193 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15194 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15195 // If the valuetypes are the same, we can remove the cast entirely.
15196 if (Op->getOperand(0).getValueType() == VT)
15197 return Op->getOperand(0);
15198 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15199 }
15200
15201 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15202 // more VPNOT which might get folded as else predicates.
15203 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15204 SDValue X =
15205 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15206 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
15207 DCI.DAG.getConstant(65535, dl, MVT::i32));
15208 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
15209 }
15210
15211 // Only the bottom 16 bits of the source register are used.
15212 if (Op.getValueType() == MVT::i32) {
15213 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15214 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15215 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15216 return SDValue(N, 0);
15217 }
15218 return SDValue();
15219}
15220
15221 static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG,
15222                                              const ARMSubtarget *ST) {
15223 EVT VT = N->getValueType(0);
15224 SDValue Op = N->getOperand(0);
15225 SDLoc dl(N);
15226
15227 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
15228 if (ST->isLittle())
15229 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
15230
15231 // VT VECTOR_REG_CAST (VT Op) -> Op
15232 if (Op.getValueType() == VT)
15233 return Op;
15234 // VECTOR_REG_CAST undef -> undef
15235 if (Op.isUndef())
15236 return DAG.getUNDEF(VT);
15237
15238 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
15239 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
15240 // If the valuetypes are the same, we can remove the cast entirely.
15241 if (Op->getOperand(0).getValueType() == VT)
15242 return Op->getOperand(0);
15243 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
15244 }
15245
15246 return SDValue();
15247}
15248
15249 static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG,
15250                                   const ARMSubtarget *Subtarget) {
15251 if (!Subtarget->hasMVEIntegerOps())
15252 return SDValue();
15253
15254 EVT VT = N->getValueType(0);
15255 SDValue Op0 = N->getOperand(0);
15256 SDValue Op1 = N->getOperand(1);
15257 ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
15258 SDLoc dl(N);
15259
15260 // vcmp X, 0, cc -> vcmpz X, cc
15261 if (isZeroVector(Op1))
15262 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
15263
15264 unsigned SwappedCond = getSwappedCondition(Cond);
15265 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
15266 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
15267 if (isZeroVector(Op0))
15268 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15269 DAG.getConstant(SwappedCond, dl, MVT::i32));
15270 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
15271 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15272 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15273 DAG.getConstant(SwappedCond, dl, MVT::i32));
15274 }
15275
15276 return SDValue();
15277}
15278
15279/// PerformInsertEltCombine - Target-specific dag combine xforms for
15280/// ISD::INSERT_VECTOR_ELT.
15281 static SDValue PerformInsertEltCombine(SDNode *N,
15282                                        TargetLowering::DAGCombinerInfo &DCI) {
15283   // Bitcast an i64 load inserted into a vector to f64.
15284 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15285 EVT VT = N->getValueType(0);
15286 SDNode *Elt = N->getOperand(1).getNode();
15287 if (VT.getVectorElementType() != MVT::i64 ||
15288 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15289 return SDValue();
15290
15291 SelectionDAG &DAG = DCI.DAG;
15292 SDLoc dl(N);
15293 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
15294                                  VT.getVectorNumElements());
15295   SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
15296 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15297 // Make the DAGCombiner fold the bitcasts.
15298 DCI.AddToWorklist(Vec.getNode());
15299 DCI.AddToWorklist(V.getNode());
15300 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
15301 Vec, V, N->getOperand(2));
15302 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
15303}
15304
15305// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15306// directly or bitcast to an integer if the original is a float vector.
15307// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15308// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
15309static SDValue
15310 PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15311   EVT VT = N->getValueType(0);
15312 SDLoc dl(N);
15313
15314 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15315 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15316 return SDValue();
15317
15318 SDValue Ext = SDValue(N, 0);
15319 if (Ext.getOpcode() == ISD::BITCAST &&
15320 Ext.getOperand(0).getValueType() == MVT::f32)
15321 Ext = Ext.getOperand(0);
15322 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15323       !isa<ConstantSDNode>(Ext.getOperand(1)) ||
15324       Ext.getConstantOperandVal(1) % 2 != 0)
15325 return SDValue();
15326 if (Ext->hasOneUse() && (Ext->user_begin()->getOpcode() == ISD::SINT_TO_FP ||
15327 Ext->user_begin()->getOpcode() == ISD::UINT_TO_FP))
15328 return SDValue();
15329
15330 SDValue Op0 = Ext.getOperand(0);
15331 EVT VecVT = Op0.getValueType();
15332 unsigned ResNo = Op0.getResNo();
15333 unsigned Lane = Ext.getConstantOperandVal(1);
15334 if (VecVT.getVectorNumElements() != 4)
15335 return SDValue();
15336
15337 // Find another extract, of Lane + 1
15338 auto OtherIt = find_if(Op0->users(), [&](SDNode *V) {
15339 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15340 isa<ConstantSDNode>(V->getOperand(1)) &&
15341 V->getConstantOperandVal(1) == Lane + 1 &&
15342 V->getOperand(0).getResNo() == ResNo;
15343 });
15344 if (OtherIt == Op0->users().end())
15345 return SDValue();
15346
15347 // For float extracts, we need to be converting to a i32 for both vector
15348 // lanes.
15349 SDValue OtherExt(*OtherIt, 0);
15350 if (OtherExt.getValueType() != MVT::i32) {
15351 if (!OtherExt->hasOneUse() ||
15352 OtherExt->user_begin()->getOpcode() != ISD::BITCAST ||
15353 OtherExt->user_begin()->getValueType(0) != MVT::i32)
15354 return SDValue();
15355 OtherExt = SDValue(*OtherExt->user_begin(), 0);
15356 }
15357
15358 // Convert the type to a f64 and extract with a VMOVRRD.
15359 SDValue F64 = DCI.DAG.getNode(
15360 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15361 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15362 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15363 SDValue VMOVRRD =
15364 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15365
15366 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
15367 return VMOVRRD;
15368}
15369
15370 static SDValue PerformExtractEltCombine(SDNode *N,
15371                                         TargetLowering::DAGCombinerInfo &DCI,
15372                                         const ARMSubtarget *ST) {
15373 SDValue Op0 = N->getOperand(0);
15374 EVT VT = N->getValueType(0);
15375 SDLoc dl(N);
15376
15377 // extract (vdup x) -> x
15378 if (Op0->getOpcode() == ARMISD::VDUP) {
15379 SDValue X = Op0->getOperand(0);
15380 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15381 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
15382 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15383 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
15384 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15385 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
15386
15387 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15388 X = X->getOperand(0);
15389 if (X.getValueType() == VT)
15390 return X;
15391 }
15392
15393 // extract ARM_BUILD_VECTOR -> x
15394 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15395 isa<ConstantSDNode>(N->getOperand(1)) &&
15396 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
15397 return Op0.getOperand(N->getConstantOperandVal(1));
15398 }
15399
15400 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15401 if (Op0.getValueType() == MVT::v4i32 &&
15402 isa<ConstantSDNode>(N->getOperand(1)) &&
15403 Op0.getOpcode() == ISD::BITCAST &&
15404       Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
15405       Op0.getOperand(0).getValueType() == MVT::v2f64) {
15406 SDValue BV = Op0.getOperand(0);
15407 unsigned Offset = N->getConstantOperandVal(1);
15408 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
15409 if (MOV.getOpcode() == ARMISD::VMOVDRR)
15410 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15411 }
15412
15413 // extract x, n; extract x, n+1 -> VMOVRRD x
15414 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15415 return R;
15416
15417 // extract (MVETrunc(x)) -> extract x
15418 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
15419 unsigned Idx = N->getConstantOperandVal(1);
15420 unsigned Vec =
15421         Idx / Op0->getOperand(0).getValueType().getVectorNumElements();
15422     unsigned SubIdx =
15423         Idx % Op0->getOperand(0).getValueType().getVectorNumElements();
15424     return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15425 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15426 }
15427
15428 return SDValue();
15429}
15430
15431 static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) {
15432   SDValue Op = N->getOperand(0);
15433 EVT VT = N->getValueType(0);
15434
15435 // sext_inreg(VGETLANEu) -> VGETLANEs
15436 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15437 cast<VTSDNode>(N->getOperand(1))->getVT() ==
15438 Op.getOperand(0).getValueType().getScalarType())
15439 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15440 Op.getOperand(1));
15441
15442 return SDValue();
15443}
15444
15445static SDValue
15446 PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15447   SDValue Vec = N->getOperand(0);
15448 SDValue SubVec = N->getOperand(1);
15449 uint64_t IdxVal = N->getConstantOperandVal(2);
15450 EVT VecVT = Vec.getValueType();
15451 EVT SubVT = SubVec.getValueType();
15452
15453 // Only do this for legal fixed vector types.
15454 if (!VecVT.isFixedLengthVector() ||
15455 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15457 return SDValue();
15458
15459 // Ignore widening patterns.
15460 if (IdxVal == 0 && Vec.isUndef())
15461 return SDValue();
15462
15463 // Subvector must be half the width and an "aligned" insertion.
15464 unsigned NumSubElts = SubVT.getVectorNumElements();
15465 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15466 (IdxVal != 0 && IdxVal != NumSubElts))
15467 return SDValue();
15468
15469 // Fold insert_subvector -> concat_vectors
15470 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15471 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15472 SDLoc DL(N);
15473 SDValue Lo, Hi;
15474 if (IdxVal == 0) {
15475 Lo = SubVec;
15476 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15477 DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15478 } else {
15479 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15480 DCI.DAG.getVectorIdxConstant(0, DL));
15481 Hi = SubVec;
15482 }
15483 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15484}
15485
15486// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
15487 static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,
15488                                           SelectionDAG &DAG) {
15489 SDValue Trunc = N->getOperand(0);
15490 EVT VT = Trunc.getValueType();
15491 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15492 return SDValue();
15493
15494 SDLoc DL(Trunc);
15495 if (isVMOVNTruncMask(N->getMask(), VT, false))
15496 return DAG.getNode(
15497 ARMISD::VMOVN, DL, VT,
15498 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15499 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15500 DAG.getConstant(1, DL, MVT::i32));
15501 else if (isVMOVNTruncMask(N->getMask(), VT, true))
15502 return DAG.getNode(
15503 ARMISD::VMOVN, DL, VT,
15504 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15505 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15506 DAG.getConstant(1, DL, MVT::i32));
15507 return SDValue();
15508}
15509
15510/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15511/// ISD::VECTOR_SHUFFLE.
15512 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
15513   if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG))
15514     return R;
15515
15516 // The LLVM shufflevector instruction does not require the shuffle mask
15517 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15518 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15519 // operands do not match the mask length, they are extended by concatenating
15520 // them with undef vectors. That is probably the right thing for other
15521 // targets, but for NEON it is better to concatenate two double-register
15522 // size vector operands into a single quad-register size vector. Do that
15523 // transformation here:
15524 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15525 // shuffle(concat(v1, v2), undef)
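  // For example, with v4i16 operands v1 and v2:
  //   shuffle(concat(v1, undef), concat(v2, undef), <0,1,2,3,8,9,10,11>)
  // becomes
  //   shuffle(concat(v1, v2), undef, <0,1,2,3,4,5,6,7>)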
15526 SDValue Op0 = N->getOperand(0);
15527 SDValue Op1 = N->getOperand(1);
15528 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15529 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15530 Op0.getNumOperands() != 2 ||
15531 Op1.getNumOperands() != 2)
15532 return SDValue();
15533 SDValue Concat0Op1 = Op0.getOperand(1);
15534 SDValue Concat1Op1 = Op1.getOperand(1);
15535 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15536 return SDValue();
15537 // Skip the transformation if any of the types are illegal.
15538 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15539 EVT VT = N->getValueType(0);
15540 if (!TLI.isTypeLegal(VT) ||
15541 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15542 !TLI.isTypeLegal(Concat1Op1.getValueType()))
15543 return SDValue();
15544
15545 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15546 Op0.getOperand(0), Op1.getOperand(0));
15547 // Translate the shuffle mask.
15548 SmallVector<int, 16> NewMask;
15549 unsigned NumElts = VT.getVectorNumElements();
15550 unsigned HalfElts = NumElts/2;
15551   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
15552   for (unsigned n = 0; n < NumElts; ++n) {
15553 int MaskElt = SVN->getMaskElt(n);
15554 int NewElt = -1;
15555 if (MaskElt < (int)HalfElts)
15556 NewElt = MaskElt;
15557 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15558 NewElt = HalfElts + MaskElt - NumElts;
15559 NewMask.push_back(NewElt);
15560 }
15561 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15562 DAG.getUNDEF(VT), NewMask);
15563}
15564
15565/// Load/store instruction that can be merged with a base address
15566/// update
15567 struct BaseUpdateTarget {
15568   SDNode *N;
15569   bool isIntrinsic;
15570   bool isStore;
15571   unsigned AddrOpIdx;
15572};
15573
15574 struct BaseUpdateUser {
15575   /// Instruction that updates a pointer
15576   SDNode *N;
15577   /// Pointer increment operand
15578   SDValue Inc;
15579   /// Pointer increment value if it is a constant, or 0 otherwise
15580 unsigned ConstInc;
15581};
15582
15583 static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
15584   // Check that the add is independent of the load/store.
15585 // Otherwise, folding it would create a cycle. Search through Addr
15586 // as well, since the User may not be a direct user of Addr and
15587 // only share a base pointer.
15588   SmallPtrSet<const SDNode *, 32> Visited;
15589   SmallVector<const SDNode *, 16> Worklist;
15590   Worklist.push_back(N);
15591 Worklist.push_back(User);
15592 const unsigned MaxSteps = 1024;
15593 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
15594 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
15595 return false;
15596 return true;
15597}
15598
15599 static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
15600                                  struct BaseUpdateUser &User,
15601                                  bool SimpleConstIncOnly,
15602                                  TargetLowering::DAGCombinerInfo &DCI) {
15603   SelectionDAG &DAG = DCI.DAG;
15604 SDNode *N = Target.N;
15605 MemSDNode *MemN = cast<MemSDNode>(N);
15606 SDLoc dl(N);
15607
15608 // Find the new opcode for the updating load/store.
15609 bool isLoadOp = true;
15610 bool isLaneOp = false;
15611 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15612 // as an operand.
15613 bool hasAlignment = true;
15614 unsigned NewOpc = 0;
15615 unsigned NumVecs = 0;
15616 if (Target.isIntrinsic) {
15617 unsigned IntNo = N->getConstantOperandVal(1);
15618 switch (IntNo) {
15619 default:
15620 llvm_unreachable("unexpected intrinsic for Neon base update");
15621 case Intrinsic::arm_neon_vld1:
15622 NewOpc = ARMISD::VLD1_UPD;
15623 NumVecs = 1;
15624 break;
15625 case Intrinsic::arm_neon_vld2:
15626 NewOpc = ARMISD::VLD2_UPD;
15627 NumVecs = 2;
15628 break;
15629 case Intrinsic::arm_neon_vld3:
15630 NewOpc = ARMISD::VLD3_UPD;
15631 NumVecs = 3;
15632 break;
15633 case Intrinsic::arm_neon_vld4:
15634 NewOpc = ARMISD::VLD4_UPD;
15635 NumVecs = 4;
15636 break;
15637 case Intrinsic::arm_neon_vld1x2:
15638 NewOpc = ARMISD::VLD1x2_UPD;
15639 NumVecs = 2;
15640 hasAlignment = false;
15641 break;
15642 case Intrinsic::arm_neon_vld1x3:
15643 NewOpc = ARMISD::VLD1x3_UPD;
15644 NumVecs = 3;
15645 hasAlignment = false;
15646 break;
15647 case Intrinsic::arm_neon_vld1x4:
15648 NewOpc = ARMISD::VLD1x4_UPD;
15649 NumVecs = 4;
15650 hasAlignment = false;
15651 break;
15652 case Intrinsic::arm_neon_vld2dup:
15653 NewOpc = ARMISD::VLD2DUP_UPD;
15654 NumVecs = 2;
15655 break;
15656 case Intrinsic::arm_neon_vld3dup:
15657 NewOpc = ARMISD::VLD3DUP_UPD;
15658 NumVecs = 3;
15659 break;
15660 case Intrinsic::arm_neon_vld4dup:
15661 NewOpc = ARMISD::VLD4DUP_UPD;
15662 NumVecs = 4;
15663 break;
15664 case Intrinsic::arm_neon_vld2lane:
15665 NewOpc = ARMISD::VLD2LN_UPD;
15666 NumVecs = 2;
15667 isLaneOp = true;
15668 break;
15669 case Intrinsic::arm_neon_vld3lane:
15670 NewOpc = ARMISD::VLD3LN_UPD;
15671 NumVecs = 3;
15672 isLaneOp = true;
15673 break;
15674 case Intrinsic::arm_neon_vld4lane:
15675 NewOpc = ARMISD::VLD4LN_UPD;
15676 NumVecs = 4;
15677 isLaneOp = true;
15678 break;
15679 case Intrinsic::arm_neon_vst1:
15680 NewOpc = ARMISD::VST1_UPD;
15681 NumVecs = 1;
15682 isLoadOp = false;
15683 break;
15684 case Intrinsic::arm_neon_vst2:
15685 NewOpc = ARMISD::VST2_UPD;
15686 NumVecs = 2;
15687 isLoadOp = false;
15688 break;
15689 case Intrinsic::arm_neon_vst3:
15690 NewOpc = ARMISD::VST3_UPD;
15691 NumVecs = 3;
15692 isLoadOp = false;
15693 break;
15694 case Intrinsic::arm_neon_vst4:
15695 NewOpc = ARMISD::VST4_UPD;
15696 NumVecs = 4;
15697 isLoadOp = false;
15698 break;
15699 case Intrinsic::arm_neon_vst2lane:
15700 NewOpc = ARMISD::VST2LN_UPD;
15701 NumVecs = 2;
15702 isLoadOp = false;
15703 isLaneOp = true;
15704 break;
15705 case Intrinsic::arm_neon_vst3lane:
15706 NewOpc = ARMISD::VST3LN_UPD;
15707 NumVecs = 3;
15708 isLoadOp = false;
15709 isLaneOp = true;
15710 break;
15711 case Intrinsic::arm_neon_vst4lane:
15712 NewOpc = ARMISD::VST4LN_UPD;
15713 NumVecs = 4;
15714 isLoadOp = false;
15715 isLaneOp = true;
15716 break;
15717 case Intrinsic::arm_neon_vst1x2:
15718 NewOpc = ARMISD::VST1x2_UPD;
15719 NumVecs = 2;
15720 isLoadOp = false;
15721 hasAlignment = false;
15722 break;
15723 case Intrinsic::arm_neon_vst1x3:
15724 NewOpc = ARMISD::VST1x3_UPD;
15725 NumVecs = 3;
15726 isLoadOp = false;
15727 hasAlignment = false;
15728 break;
15729 case Intrinsic::arm_neon_vst1x4:
15730 NewOpc = ARMISD::VST1x4_UPD;
15731 NumVecs = 4;
15732 isLoadOp = false;
15733 hasAlignment = false;
15734 break;
15735 }
15736 } else {
15737 isLaneOp = true;
15738 switch (N->getOpcode()) {
15739 default:
15740 llvm_unreachable("unexpected opcode for Neon base update");
15741 case ARMISD::VLD1DUP:
15742 NewOpc = ARMISD::VLD1DUP_UPD;
15743 NumVecs = 1;
15744 break;
15745 case ARMISD::VLD2DUP:
15746 NewOpc = ARMISD::VLD2DUP_UPD;
15747 NumVecs = 2;
15748 break;
15749 case ARMISD::VLD3DUP:
15750 NewOpc = ARMISD::VLD3DUP_UPD;
15751 NumVecs = 3;
15752 break;
15753 case ARMISD::VLD4DUP:
15754 NewOpc = ARMISD::VLD4DUP_UPD;
15755 NumVecs = 4;
15756 break;
15757 case ISD::LOAD:
15758 NewOpc = ARMISD::VLD1_UPD;
15759 NumVecs = 1;
15760 isLaneOp = false;
15761 break;
15762 case ISD::STORE:
15763 NewOpc = ARMISD::VST1_UPD;
15764 NumVecs = 1;
15765 isLaneOp = false;
15766 isLoadOp = false;
15767 break;
15768 }
15769 }
15770
15771 // Find the size of memory referenced by the load/store.
15772 EVT VecTy;
15773 if (isLoadOp) {
15774 VecTy = N->getValueType(0);
15775 } else if (Target.isIntrinsic) {
15776 VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
15777 } else {
15778 assert(Target.isStore &&
15779 "Node has to be a load, a store, or an intrinsic!");
15780 VecTy = N->getOperand(1).getValueType();
15781 }
15782
15783 bool isVLDDUPOp =
15784 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
15785 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
15786
15787 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15788 if (isLaneOp || isVLDDUPOp)
15789 NumBytes /= VecTy.getVectorNumElements();
15790
15791 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
15792 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
15793 // separate instructions that make it harder to use a non-constant update.
15794 return false;
15795 }
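// As an illustration of the check above: a vld3 of three v4i32 registers
// references 48 bytes, so only an update of exactly 48 is folded for it;
// register increments and other constants are rejected for these wide cases.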
15796
15797 if (SimpleConstIncOnly && User.ConstInc != NumBytes)
15798 return false;
15799
15800 if (!isValidBaseUpdate(N, User.N))
15801 return false;
15802
15803 // OK, we found an ADD we can fold into the base update.
15804 // Now, create a _UPD node, taking care of not breaking alignment.
15805
15806 EVT AlignedVecTy = VecTy;
15807 Align Alignment = MemN->getAlign();
15808
15809 // If this is a less-than-standard-aligned load/store, change the type to
15810 // match the standard alignment.
15811 // The alignment is overlooked when selecting _UPD variants; and it's
15812 // easier to introduce bitcasts here than fix that.
15813 // There are 3 ways to get to this base-update combine:
15814 // - intrinsics: they are assumed to be properly aligned (to the standard
15815 // alignment of the memory type), so we don't need to do anything.
15816 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
15817 // intrinsics, so, likewise, there's nothing to do.
15818 // - generic load/store instructions: the alignment is specified as an
15819 // explicit operand, rather than implicitly as the standard alignment
15820 // of the memory type (like the intrinsics). We need to change the
15821 // memory type to match the explicit alignment. That way, we don't
15822 // generate non-standard-aligned ARMISD::VLDx nodes.
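// As a concrete (illustrative) case: a <2 x i64> load with only 4-byte
// alignment is retyped below to <4 x i32>, the VLD1_UPD is built with that
// type, and the loaded value is bitcast back to <2 x i64> further down.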
15823 if (isa<LSBaseSDNode>(N)) {
15824 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
15825 MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
15826 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
15827 assert(!isLaneOp && "Unexpected generic load/store lane.");
15828 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
15829 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
15830 }
15831 // Don't set an explicit alignment on regular load/stores that we want
15832 // to transform to VLD/VST 1_UPD nodes.
15833 // This matches the behavior of regular load/stores, which only get an
15834 // explicit alignment if the MMO alignment is larger than the standard
15835 // alignment of the memory type.
15836 // Intrinsics, however, always get an explicit alignment, set to the
15837 // alignment of the MMO.
15838 Alignment = Align(1);
15839 }
15840
15841 // Create the new updating load/store node.
15842 // First, create an SDVTList for the new updating node's results.
15843 EVT Tys[6];
15844 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
15845 unsigned n;
15846 for (n = 0; n < NumResultVecs; ++n)
15847 Tys[n] = AlignedVecTy;
15848 Tys[n++] = MVT::i32;
15849 Tys[n] = MVT::Other;
15850 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
15851
15852 // Then, gather the new node's operands.
15853 SmallVector<SDValue, 8> Ops;
15854 Ops.push_back(N->getOperand(0)); // incoming chain
15855 Ops.push_back(N->getOperand(Target.AddrOpIdx));
15856 Ops.push_back(User.Inc);
15857
15858 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
15859 // Try to match the intrinsic's signature
15860 Ops.push_back(StN->getValue());
15861 } else {
15862 // Loads (and of course intrinsics) match the intrinsics' signature,
15863 // so just add all but the alignment operand.
15864 unsigned LastOperand =
15865 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
15866 for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
15867 Ops.push_back(N->getOperand(i));
15868 }
15869
15870 // For all node types, the alignment operand is always the last one.
15871 Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
15872
15873 // If this is a non-standard-aligned STORE, the penultimate operand is the
15874 // stored value. Bitcast it to the aligned type.
15875 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
15876 SDValue &StVal = Ops[Ops.size() - 2];
15877 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
15878 }
15879
15880 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
15881 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
15882 MemN->getMemOperand());
15883
15884 // Update the uses.
15885 SmallVector<SDValue, 5> NewResults;
15886 for (unsigned i = 0; i < NumResultVecs; ++i)
15887 NewResults.push_back(SDValue(UpdN.getNode(), i));
15888
15889 // If this is a non-standard-aligned LOAD, the first result is the loaded
15890 // value. Bitcast it to the expected result type.
15891 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
15892 SDValue &LdVal = NewResults[0];
15893 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
15894 }
15895
15896 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
15897 DCI.CombineTo(N, NewResults);
15898 DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
15899
15900 return true;
15901}
15902
15903 // If (opcode ptr inc) is an ADD-like instruction, return the
15904// increment value. Otherwise return 0.
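// For example, (ADD ptr, 16) yields 16, and (OR ptr, 4) yields 4 only when
// the pointer is known to have no bits in common with the increment, so the
// OR behaves like an ADD.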
15905static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
15906 SDValue Inc, const SelectionDAG &DAG) {
15907 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc);
15908 if (!CInc)
15909 return 0;
15910
15911 switch (Opcode) {
15912 case ARMISD::VLD1_UPD:
15913 case ISD::ADD:
15914 return CInc->getZExtValue();
15915 case ISD::OR: {
15916 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
15917 // (OR ptr inc) is the same as (ADD ptr inc)
15918 return CInc->getZExtValue();
15919 }
15920 return 0;
15921 }
15922 default:
15923 return 0;
15924 }
15925}
15926
15927 static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
15928 switch (N->getOpcode()) {
15929 case ISD::ADD:
15930 case ISD::OR: {
15931 if (isa<ConstantSDNode>(N->getOperand(1))) {
15932 *Ptr = N->getOperand(0);
15933 *CInc = N->getOperand(1);
15934 return true;
15935 }
15936 return false;
15937 }
15938 case ARMISD::VLD1_UPD: {
15939 if (isa<ConstantSDNode>(N->getOperand(2))) {
15940 *Ptr = N->getOperand(1);
15941 *CInc = N->getOperand(2);
15942 return true;
15943 }
15944 return false;
15945 }
15946 default:
15947 return false;
15948 }
15949}
15950
15951/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
15952/// NEON load/store intrinsics, and generic vector load/stores, to merge
15953/// base address updates.
15954/// For generic load/stores, the memory type is assumed to be a vector.
15955/// The caller is assumed to have checked legality.
15956 static SDValue CombineBaseUpdate(SDNode *N,
15957 TargetLowering::DAGCombinerInfo &DCI) {
15958 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
15959 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
15960 const bool isStore = N->getOpcode() == ISD::STORE;
15961 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
15962 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
15963
15964 // Limit the number of possible base-updates we look at to prevent degenerate
15965 // cases.
15966 unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;
15967
15968 SDValue Addr = N->getOperand(AddrOpIdx);
15969
15971
15972 // Search for a use of the address operand that is an increment.
15973 for (SDUse &Use : Addr->uses()) {
15974 SDNode *User = Use.getUser();
15975 if (Use.getResNo() != Addr.getResNo() || User->getNumOperands() != 2)
15976 continue;
15977
15978 SDValue Inc = User->getOperand(Use.getOperandNo() == 1 ? 0 : 1);
15979 unsigned ConstInc =
15980 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
15981
15982 if (ConstInc || User->getOpcode() == ISD::ADD) {
15983 BaseUpdates.push_back({User, Inc, ConstInc});
15984 if (BaseUpdates.size() >= MaxBaseUpdates)
15985 break;
15986 }
15987 }
15988
15989 // If the address is a constant pointer increment itself, find
15990 // another constant increment that has the same base operand
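// For example, if Addr is (add Base, 8) and another user of Base computes
// (add Base, 24), that user can be folded in as a post-increment of Addr
// by 16.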
15991 SDValue Base;
15992 SDValue CInc;
15993 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
15994 unsigned Offset =
15995 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
15996 for (SDUse &Use : Base->uses()) {
15997
15998 SDNode *User = Use.getUser();
15999 if (Use.getResNo() != Base.getResNo() || User == Addr.getNode() ||
16000 User->getNumOperands() != 2)
16001 continue;
16002
16003 SDValue UserInc = User->getOperand(Use.getOperandNo() == 0 ? 1 : 0);
16004 unsigned UserOffset =
16005 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
16006
16007 if (!UserOffset || UserOffset <= Offset)
16008 continue;
16009
16010 unsigned NewConstInc = UserOffset - Offset;
16011 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16012 BaseUpdates.push_back({User, NewInc, NewConstInc});
16013 if (BaseUpdates.size() >= MaxBaseUpdates)
16014 break;
16015 }
16016 }
16017
16018 // Try to fold the load/store with an update that matches memory
16019 // access size. This should work well for sequential loads.
16020 unsigned NumValidUpd = BaseUpdates.size();
16021 for (unsigned I = 0; I < NumValidUpd; I++) {
16022 BaseUpdateUser &User = BaseUpdates[I];
16023 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16024 return SDValue();
16025 }
16026
16027 // Try to fold with other users. Non-constant updates are considered
16028 // first, and constant updates are sorted to not break a sequence of
16029 // strided accesses (if there is any).
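// For example, with candidate increments {register, 32, 16} the register
// update is tried first, then 16 before 32, keeping a sequence of strided
// accesses intact.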
16030 llvm::stable_sort(BaseUpdates,
16031 [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16032 return LHS.ConstInc < RHS.ConstInc;
16033 });
16034 for (BaseUpdateUser &User : BaseUpdates) {
16035 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16036 return SDValue();
16037 }
16038 return SDValue();
16039}
16040
16041 static SDValue PerformVLDCombine(SDNode *N,
16042 TargetLowering::DAGCombinerInfo &DCI) {
16043 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16044 return SDValue();
16045
16046 return CombineBaseUpdate(N, DCI);
16047}
16048
16049 static SDValue PerformMVEVLDCombine(SDNode *N,
16050 TargetLowering::DAGCombinerInfo &DCI) {
16051 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16052 return SDValue();
16053
16054 SelectionDAG &DAG = DCI.DAG;
16055 SDValue Addr = N->getOperand(2);
16056 MemSDNode *MemN = cast<MemSDNode>(N);
16057 SDLoc dl(N);
16058
16059 // For the stores, where there are multiple intrinsics, we only actually want
16060 // to post-inc the last of them.
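// For example, an arm_mve_vst2q is emitted as two intrinsic calls (stage
// operand 0 and 1); only the call writing the final stage is converted to a
// VST2_UPD below.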
16061 unsigned IntNo = N->getConstantOperandVal(1);
16062 if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16063 return SDValue();
16064 if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16065 return SDValue();
16066
16067 // Search for a use of the address operand that is an increment.
16068 for (SDUse &Use : Addr->uses()) {
16069 SDNode *User = Use.getUser();
16070 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
16071 continue;
16072
16073 // Check that the add is independent of the load/store. Otherwise, folding
16074 // it would create a cycle. We can avoid searching through Addr as it's a
16075 // predecessor to both.
16076 SmallPtrSet<const SDNode *, 32> Visited;
16077 SmallVector<const SDNode *, 16> Worklist;
16078 Visited.insert(Addr.getNode());
16079 Worklist.push_back(N);
16080 Worklist.push_back(User);
16081 const unsigned MaxSteps = 1024;
16082 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
16083 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
16084 continue;
16085
16086 // Find the new opcode for the updating load/store.
16087 bool isLoadOp = true;
16088 unsigned NewOpc = 0;
16089 unsigned NumVecs = 0;
16090 switch (IntNo) {
16091 default:
16092 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16093 case Intrinsic::arm_mve_vld2q:
16094 NewOpc = ARMISD::VLD2_UPD;
16095 NumVecs = 2;
16096 break;
16097 case Intrinsic::arm_mve_vld4q:
16098 NewOpc = ARMISD::VLD4_UPD;
16099 NumVecs = 4;
16100 break;
16101 case Intrinsic::arm_mve_vst2q:
16102 NewOpc = ARMISD::VST2_UPD;
16103 NumVecs = 2;
16104 isLoadOp = false;
16105 break;
16106 case Intrinsic::arm_mve_vst4q:
16107 NewOpc = ARMISD::VST4_UPD;
16108 NumVecs = 4;
16109 isLoadOp = false;
16110 break;
16111 }
16112
16113 // Find the size of memory referenced by the load/store.
16114 EVT VecTy;
16115 if (isLoadOp) {
16116 VecTy = N->getValueType(0);
16117 } else {
16118 VecTy = N->getOperand(3).getValueType();
16119 }
16120
16121 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16122
16123 // If the increment is a constant, it must match the memory ref size.
16124 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
16125 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc);
16126 if (!CInc || CInc->getZExtValue() != NumBytes)
16127 continue;
16128
16129 // Create the new updating load/store node.
16130 // First, create an SDVTList for the new updating node's results.
16131 EVT Tys[6];
16132 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16133 unsigned n;
16134 for (n = 0; n < NumResultVecs; ++n)
16135 Tys[n] = VecTy;
16136 Tys[n++] = MVT::i32;
16137 Tys[n] = MVT::Other;
16138 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16139
16140 // Then, gather the new node's operands.
16141 SmallVector<SDValue, 8> Ops;
16142 Ops.push_back(N->getOperand(0)); // incoming chain
16143 Ops.push_back(N->getOperand(2)); // ptr
16144 Ops.push_back(Inc);
16145
16146 for (unsigned i = 3; i < N->getNumOperands(); ++i)
16147 Ops.push_back(N->getOperand(i));
16148
16149 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
16150 MemN->getMemOperand());
16151
16152 // Update the uses.
16153 SmallVector<SDValue, 5> NewResults;
16154 for (unsigned i = 0; i < NumResultVecs; ++i)
16155 NewResults.push_back(SDValue(UpdN.getNode(), i));
16156
16157 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16158 DCI.CombineTo(N, NewResults);
16159 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
16160
16161 break;
16162 }
16163
16164 return SDValue();
16165}
16166
16167/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16168/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16169/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16170/// return true.
16171 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
16172 SelectionDAG &DAG = DCI.DAG;
16173 EVT VT = N->getValueType(0);
16174 // vldN-dup instructions only support 64-bit vectors for N > 1.
16175 if (!VT.is64BitVector())
16176 return false;
16177
16178 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
16179 SDNode *VLD = N->getOperand(0).getNode();
16180 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16181 return false;
16182 unsigned NumVecs = 0;
16183 unsigned NewOpc = 0;
16184 unsigned IntNo = VLD->getConstantOperandVal(1);
16185 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16186 NumVecs = 2;
16187 NewOpc = ARMISD::VLD2DUP;
16188 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16189 NumVecs = 3;
16190 NewOpc = ARMISD::VLD3DUP;
16191 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16192 NumVecs = 4;
16193 NewOpc = ARMISD::VLD4DUP;
16194 } else {
16195 return false;
16196 }
16197
16198 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16199 // numbers match the load.
16200 unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16201 for (SDUse &Use : VLD->uses()) {
16202 // Ignore uses of the chain result.
16203 if (Use.getResNo() == NumVecs)
16204 continue;
16205 SDNode *User = Use.getUser();
16206 if (User->getOpcode() != ARMISD::VDUPLANE ||
16207 VLDLaneNo != User->getConstantOperandVal(1))
16208 return false;
16209 }
16210
16211 // Create the vldN-dup node.
16212 EVT Tys[5];
16213 unsigned n;
16214 for (n = 0; n < NumVecs; ++n)
16215 Tys[n] = VT;
16216 Tys[n] = MVT::Other;
16217 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
16218 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
16219 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
16220 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16221 Ops, VLDMemInt->getMemoryVT(),
16222 VLDMemInt->getMemOperand());
16223
16224 // Update the uses.
16225 for (SDUse &Use : VLD->uses()) {
16226 unsigned ResNo = Use.getResNo();
16227 // Ignore uses of the chain result.
16228 if (ResNo == NumVecs)
16229 continue;
16230 DCI.CombineTo(Use.getUser(), SDValue(VLDDup.getNode(), ResNo));
16231 }
16232
16233 // Now the vldN-lane intrinsic is dead except for its chain result.
16234 // Update uses of the chain.
16235 std::vector<SDValue> VLDDupResults;
16236 for (unsigned n = 0; n < NumVecs; ++n)
16237 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16238 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16239 DCI.CombineTo(VLD, VLDDupResults);
16240
16241 return true;
16242}
16243
16244/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16245/// ARMISD::VDUPLANE.
16246 static SDValue PerformVDUPLANECombine(SDNode *N,
16247 TargetLowering::DAGCombinerInfo &DCI,
16248 const ARMSubtarget *Subtarget) {
16249 SDValue Op = N->getOperand(0);
16250 EVT VT = N->getValueType(0);
16251
16252 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16253 if (Subtarget->hasMVEIntegerOps()) {
16254 EVT ExtractVT = VT.getVectorElementType();
16255 // We need to ensure we are creating a legal type.
16256 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16257 ExtractVT = MVT::i32;
16258 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
16259 N->getOperand(0), N->getOperand(1));
16260 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
16261 }
16262
16263 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16264 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
16265 if (CombineVLDDUP(N, DCI))
16266 return SDValue(N, 0);
16267
16268 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16269 // redundant. Ignore bit_converts for now; element sizes are checked below.
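// For example, VDUPLANE of a lane of (VMOVIMM imm) is already a splat of
// that immediate, so a bitcast of the VMOVIMM itself suffices.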
16270 while (Op.getOpcode() == ISD::BITCAST)
16271 Op = Op.getOperand(0);
16272 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16273 return SDValue();
16274
16275 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16276 unsigned EltSize = Op.getScalarValueSizeInBits();
16277 // The canonical VMOV for a zero vector uses a 32-bit element size.
16278 unsigned Imm = Op.getConstantOperandVal(0);
16279 unsigned EltBits;
16280 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
16281 EltSize = 8;
16282 if (EltSize > VT.getScalarSizeInBits())
16283 return SDValue();
16284
16285 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
16286}
16287
16288/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
16289 static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
16290 const ARMSubtarget *Subtarget) {
16291 SDValue Op = N->getOperand(0);
16292 SDLoc dl(N);
16293
16294 if (Subtarget->hasMVEIntegerOps()) {
16295 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16296 // need to come from a GPR.
16297 if (Op.getValueType() == MVT::f32)
16298 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16299 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
16300 else if (Op.getValueType() == MVT::f16)
16301 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16302 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16303 }
16304
16305 if (!Subtarget->hasNEON())
16306 return SDValue();
16307
16308 // Match VDUP(LOAD) -> VLD1DUP.
16309 // We match this pattern here rather than waiting for isel because the
16310 // transform is only legal for unindexed loads.
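// For example, (v4i16 VDUP (i16 load ptr)) becomes a VLD1DUP from ptr, and
// the load's chain users are redirected to the new node's chain below.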
16311 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
16312 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16313 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
16314 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16315 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16316 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16317 SDValue VLDDup =
16318 DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
16319 LD->getMemoryVT(), LD->getMemOperand());
16320 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
16321 return VLDDup;
16322 }
16323
16324 return SDValue();
16325}
16326
16327 static SDValue PerformLOADCombine(SDNode *N,
16328 TargetLowering::DAGCombinerInfo &DCI,
16329 const ARMSubtarget *Subtarget) {
16330 EVT VT = N->getValueType(0);
16331
16332 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16333 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16334 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16335 return CombineBaseUpdate(N, DCI);
16336
16337 return SDValue();
16338}
16339
16340// Optimize trunc store (of multiple scalars) to shuffle and store. First,
16341// pack all of the elements in one place. Next, store to memory in fewer
16342// chunks.
16343 static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
16344 SelectionDAG &DAG) {
16345 SDValue StVal = St->getValue();
16346 EVT VT = StVal.getValueType();
16347 if (!St->isTruncatingStore() || !VT.isVector())
16348 return SDValue();
16349 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16350 EVT StVT = St->getMemoryVT();
16351 unsigned NumElems = VT.getVectorNumElements();
16352 assert(StVT != VT && "Cannot truncate to the same type");
16353 unsigned FromEltSz = VT.getScalarSizeInBits();
16354 unsigned ToEltSz = StVT.getScalarSizeInBits();
16355
16356 // The From and To element sizes and the element count must be powers of two
16357 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
16358 return SDValue();
16359
16360 // We are going to use the original vector elt for storing.
16361 // Accumulated smaller vector elements must be a multiple of the store size.
16362 if (0 != (NumElems * FromEltSz) % ToEltSz)
16363 return SDValue();
16364
16365 unsigned SizeRatio = FromEltSz / ToEltSz;
16366 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16367
16368 // Create a type on which we perform the shuffle.
16369 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
16370 NumElems * SizeRatio);
16371 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16372
16373 SDLoc DL(St);
16374 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
16375 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16376 for (unsigned i = 0; i < NumElems; ++i)
16377 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16378 : i * SizeRatio;
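// For example, truncating v4i32 to v4i8 gives SizeRatio = 4 and a v16i8
// WideVecVT; on little-endian the mask is <0,4,8,12,-1,...>, gathering the
// low byte of each element into the bottom lanes before the wide stores.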
16379
16380 // Can't shuffle using an illegal type.
16381 if (!TLI.isTypeLegal(WideVecVT))
16382 return SDValue();
16383
16384 SDValue Shuff = DAG.getVectorShuffle(
16385 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
16386 // At this point all of the data is stored at the bottom of the
16387 // register. We now need to save it to mem.
16388
16389 // Find the largest store unit
16390 MVT StoreType = MVT::i8;
16391 for (MVT Tp : MVT::integer_valuetypes()) {
16392 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16393 StoreType = Tp;
16394 }
16395 // Didn't find a legal store type.
16396 if (!TLI.isTypeLegal(StoreType))
16397 return SDValue();
16398
16399 // Bitcast the original vector into a vector of store-size units
16400 EVT StoreVecVT =
16401 EVT::getVectorVT(*DAG.getContext(), StoreType,
16402 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16403 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16404 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
16405 SmallVector<SDValue, 8> Chains;
16406 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
16407 TLI.getPointerTy(DAG.getDataLayout()));
16408 SDValue BasePtr = St->getBasePtr();
16409
16410 // Perform one or more big stores into memory.
16411 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16412 for (unsigned I = 0; I < E; I++) {
16413 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
16414 ShuffWide, DAG.getIntPtrConstant(I, DL));
16415 SDValue Ch =
16416 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
16417 St->getAlign(), St->getMemOperand()->getFlags());
16418 BasePtr =
16419 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
16420 Chains.push_back(Ch);
16421 }
16422 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16423}
16424
16425// Try taking a single vector store from an fpround (which would otherwise turn
16426// into an expensive buildvector) and splitting it into a series of narrowing
16427// stores.
16428 static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
16429 SelectionDAG &DAG) {
16430 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16431 return SDValue();
16432 SDValue Trunc = St->getValue();
16433 if (Trunc->getOpcode() != ISD::FP_ROUND)
16434 return SDValue();
16435 EVT FromVT = Trunc->getOperand(0).getValueType();
16436 EVT ToVT = Trunc.getValueType();
16437 if (!ToVT.isVector())
16438 return SDValue();
16440 EVT ToEltVT = ToVT.getVectorElementType();
16441 EVT FromEltVT = FromVT.getVectorElementType();
16442
16443 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16444 return SDValue();
16445
16446 unsigned NumElements = 4;
16447 if (FromVT.getVectorNumElements() % NumElements != 0)
16448 return SDValue();
16449
16450 // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
16451 // use the VMOVN over splitting the store. We are looking for patterns of:
16452 // !rev: 0 N 1 N+1 2 N+2 ...
16453 // rev: N 0 N+1 1 N+2 2 ...
16454 // The shuffle may either be a single source (in which case N = NumElts/2) or
16455 // two inputs extended with concat to the same size (in which case N =
16456 // NumElts).
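// For example, for an 8-element result built from two concatenated inputs
// the !rev mask is <0,8,1,9,2,10,3,11>.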
16457 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16458 ArrayRef<int> M = SVN->getMask();
16459 unsigned NumElts = ToVT.getVectorNumElements();
16460 if (SVN->getOperand(1).isUndef())
16461 NumElts /= 2;
16462
16463 unsigned Off0 = Rev ? NumElts : 0;
16464 unsigned Off1 = Rev ? 0 : NumElts;
16465
16466 for (unsigned I = 0; I < NumElts; I += 2) {
16467 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16468 return false;
16469 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16470 return false;
16471 }
16472
16473 return true;
16474 };
16475
16476 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
16477 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16478 return SDValue();
16479
16480 LLVMContext &C = *DAG.getContext();
16481 SDLoc DL(St);
16482 // Details about the old store
16483 SDValue Ch = St->getChain();
16484 SDValue BasePtr = St->getBasePtr();
16485 Align Alignment = St->getBaseAlign();
16486 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16487 AAMDNodes AAInfo = St->getAAInfo();
16488
16489 // We split the store into slices of NumElements. fp16 trunc stores are first
16490 // converted with a vcvt and then stored as truncating integer stores.
16491 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
16492 EVT NewToVT = EVT::getVectorVT(
16493 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
16494
16495 SmallVector<SDValue, 4> Stores;
16496 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16497 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16498 SDValue NewPtr =
16499 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16500
16501 SDValue Extract =
16502 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16503 DAG.getConstant(i * NumElements, DL, MVT::i32));
16504
16505 SDValue FPTrunc =
16506 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16507 Extract, DAG.getConstant(0, DL, MVT::i32));
16508 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16509
16510 SDValue Store = DAG.getTruncStore(
16511 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16512 NewToVT, Alignment, MMOFlags, AAInfo);
16513 Stores.push_back(Store);
16514 }
16515 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16516}
16517
16518// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16519// into an expensive buildvector) and splitting it into a series of narrowing
16520// stores.
16521 static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
16522 SelectionDAG &DAG) {
16523 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16524 return SDValue();
16525 SDValue Trunc = St->getValue();
16526 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16527 return SDValue();
16528 EVT FromVT = Trunc->getOperand(0).getValueType();
16529 EVT ToVT = Trunc.getValueType();
16530
16531 LLVMContext &C = *DAG.getContext();
16532 SDLoc DL(St);
16533 // Details about the old store
16534 SDValue Ch = St->getChain();
16535 SDValue BasePtr = St->getBasePtr();
16536 Align Alignment = St->getBaseAlign();
16537 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16538 AAMDNodes AAInfo = St->getAAInfo();
16539
16540 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
16541 FromVT.getVectorNumElements());
16542
16543 SmallVector<SDValue, 4> Stores;
16544 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
16545 unsigned NewOffset =
16546 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16547 SDValue NewPtr =
16548 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16549
16550 SDValue Extract = Trunc.getOperand(i);
16551 SDValue Store = DAG.getTruncStore(
16552 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16553 NewToVT, Alignment, MMOFlags, AAInfo);
16554 Stores.push_back(Store);
16555 }
16556 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16557}
16558
16559// Given a floating point store from an extracted vector, with an integer
16560// VGETLANE that already exists, store the existing VGETLANEu directly. This can
16561// help reduce fp register pressure, doesn't require the fp extract and allows
16562// use of more integer post-inc stores not available with vstr.
16563 static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
16564 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16565 return SDValue();
16566 SDValue Extract = St->getValue();
16567 EVT VT = Extract.getValueType();
16568 // For now only uses f16. This may be useful for f32 too, but that will
16569 // be bitcast(extract), not the VGETLANEu we currently check here.
16570 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16571 return SDValue();
16572
16573 SDNode *GetLane =
16574 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16575 {Extract.getOperand(0), Extract.getOperand(1)});
16576 if (!GetLane)
16577 return SDValue();
16578
16579 LLVMContext &C = *DAG.getContext();
16580 SDLoc DL(St);
16581 // Create a new integer store to replace the existing floating point version.
16582 SDValue Ch = St->getChain();
16583 SDValue BasePtr = St->getBasePtr();
16584 Align Alignment = St->getBaseAlign();
16585 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16586 AAMDNodes AAInfo = St->getAAInfo();
16587 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
16588 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
16589 St->getPointerInfo(), NewToVT, Alignment,
16590 MMOFlags, AAInfo);
16591
16592 return Store;
16593}
16594
16595/// PerformSTORECombine - Target-specific dag combine xforms for
16596/// ISD::STORE.
16597 static SDValue PerformSTORECombine(SDNode *N,
16598 TargetLowering::DAGCombinerInfo &DCI,
16599 const ARMSubtarget *Subtarget) {
16600 StoreSDNode *St = cast<StoreSDNode>(N);
16601 if (St->isVolatile())
16602 return SDValue();
16603 SDValue StVal = St->getValue();
16604 EVT VT = StVal.getValueType();
16605
16606 if (Subtarget->hasNEON())
16607 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16608 return Store;
16609
16610 if (Subtarget->hasMVEFloatOps())
16611 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16612 return NewToken;
16613
16614 if (Subtarget->hasMVEIntegerOps()) {
16615 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16616 return NewChain;
16617 if (SDValue NewToken =
16618 PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))
16619 return NewToken;
16620 }
16621
16622 if (!ISD::isNormalStore(St))
16623 return SDValue();
16624
16625 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16626 // ARM stores of arguments in the same cache line.
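// That is, (store (VMOVDRR lo, hi), ptr) becomes a store of lo at ptr and a
// store of hi at ptr+4 (the two halves are swapped for big-endian).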
16627 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16628 StVal.getNode()->hasOneUse()) {
16629 SelectionDAG &DAG = DCI.DAG;
16630 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16631 SDLoc DL(St);
16632 SDValue BasePtr = St->getBasePtr();
16633 SDValue NewST1 = DAG.getStore(
16634 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
16635 BasePtr, St->getPointerInfo(), St->getBaseAlign(),
16636 St->getMemOperand()->getFlags());
16637
16638 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16639 DAG.getConstant(4, DL, MVT::i32));
16640 return DAG.getStore(NewST1.getValue(0), DL,
16641 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
16642 OffsetPtr, St->getPointerInfo().getWithOffset(4),
16643 St->getBaseAlign(), St->getMemOperand()->getFlags());
16644 }
16645
16646 if (StVal.getValueType() == MVT::i64 &&
16647 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
16648
16649 // Bitcast an i64 store extracted from a vector to f64.
16650 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16651 SelectionDAG &DAG = DCI.DAG;
16652 SDLoc dl(StVal);
16653 SDValue IntVec = StVal.getOperand(0);
16654 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16655 IntVec.getValueType().getVectorNumElements());
16656 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16657 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16658 Vec, StVal.getOperand(1));
16659 dl = SDLoc(N);
16660 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16661 // Make the DAGCombiner fold the bitcasts.
16662 DCI.AddToWorklist(Vec.getNode());
16663 DCI.AddToWorklist(ExtElt.getNode());
16664 DCI.AddToWorklist(V.getNode());
16665 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16666 St->getPointerInfo(), St->getAlign(),
16667 St->getMemOperand()->getFlags(), St->getAAInfo());
16668 }
16669
16670 // If this is a legal vector store, try to combine it into a VST1_UPD.
16671 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16672 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16673 return CombineBaseUpdate(N, DCI);
16674
16675 return SDValue();
16676}
16677
16678/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16679/// can replace combinations of VMUL and VCVT (floating-point to integer)
16680/// when the VMUL has a constant operand that is a power of 2.
16681///
16682/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16683/// vmul.f32 d16, d17, d16
16684/// vcvt.s32.f32 d16, d16
16685/// becomes:
16686/// vcvt.s32.f32 d16, d16, #3
16687 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
16688 const ARMSubtarget *Subtarget) {
16689 if (!Subtarget->hasNEON())
16690 return SDValue();
16691
16692 SDValue Op = N->getOperand(0);
16693 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16694 Op.getOpcode() != ISD::FMUL)
16695 return SDValue();
16696
16697 SDValue ConstVec = Op->getOperand(1);
16698 if (!isa<BuildVectorSDNode>(ConstVec))
16699 return SDValue();
16700
16701 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16702 uint32_t FloatBits = FloatTy.getSizeInBits();
16703 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
16704 uint32_t IntBits = IntTy.getSizeInBits();
16705 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16706 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16707 // These instructions only exist converting from f32 to i32. We can handle
16708 // smaller integers by generating an extra truncate, but larger ones would
16709 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16710 // these instructions only support v2i32/v4i32 types.
16711 return SDValue();
16712 }
16713
16714 BitVector UndefElements;
16715 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
16716 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
16717 if (C == -1 || C == 0 || C > 32)
16718 return SDValue();
16719
16720 SDLoc dl(N);
16721 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16722 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16723 Intrinsic::arm_neon_vcvtfp2fxu;
16724 SDValue FixConv = DAG.getNode(
16725 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16726 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16727 DAG.getConstant(C, dl, MVT::i32));
16728
16729 if (IntBits < FloatBits)
16730 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
16731
16732 return FixConv;
16733}
16734
16735 static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG,
16736 const ARMSubtarget *Subtarget) {
16737 if (!Subtarget->hasMVEFloatOps())
16738 return SDValue();
16739
16740 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
16741 // The second form can be more easily turned into a predicated vadd, and
16742 // possibly combined into a fma to become a predicated vfma.
16743 SDValue Op0 = N->getOperand(0);
16744 SDValue Op1 = N->getOperand(1);
16745 EVT VT = N->getValueType(0);
16746 SDLoc DL(N);
16747
16748 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
16749 // which these VMOV's represent.
16750 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
16751 if (Op.getOpcode() != ISD::BITCAST ||
16752 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
16753 return false;
16754 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
16755 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
16756 return true;
16757 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
16758 return true;
16759 return false;
16760 };
16761
16762 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
16763 std::swap(Op0, Op1);
16764
16765 if (Op1.getOpcode() != ISD::VSELECT)
16766 return SDValue();
16767
16768 SDNodeFlags FaddFlags = N->getFlags();
16769 bool NSZ = FaddFlags.hasNoSignedZeros();
16770 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
16771 return SDValue();
16772
16773 SDValue FAdd =
16774 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
16775 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
16776}
16777
16778 static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG) {
16779 SDValue LHS = N->getOperand(0);
16780 SDValue RHS = N->getOperand(1);
16781 EVT VT = N->getValueType(0);
16782 SDLoc DL(N);
16783
16784 if (!N->getFlags().hasAllowReassociation())
16785 return SDValue();
16786
16787 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
16788 auto ReassocComplex = [&](SDValue A, SDValue B) {
16789 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
16790 return SDValue();
16791 unsigned Opc = A.getConstantOperandVal(0);
16792 if (Opc != Intrinsic::arm_mve_vcmlaq)
16793 return SDValue();
16794 SDValue VCMLA = DAG.getNode(
16795 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
16796 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
16797 A.getOperand(3), A.getOperand(4));
16798 VCMLA->setFlags(A->getFlags());
16799 return VCMLA;
16800 };
16801 if (SDValue R = ReassocComplex(LHS, RHS))
16802 return R;
16803 if (SDValue R = ReassocComplex(RHS, LHS))
16804 return R;
16805
16806 return SDValue();
16807}
16808
16809 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
16810 const ARMSubtarget *Subtarget) {
16811 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
16812 return S;
16813 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
16814 return S;
16815 return SDValue();
16816}
16817
16818/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
16819/// can replace combinations of VCVT (integer to floating-point) and VMUL
16820 /// when the VMUL has a constant operand whose reciprocal is a power of 2.
16821///
16822/// Example (assume d17 = <float 0.125, float 0.125>):
16823/// vcvt.f32.s32 d16, d16
16824/// vmul.f32 d16, d16, d17
16825/// becomes:
16826/// vcvt.f32.s32 d16, d16, #3
16827 static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG,
16828 const ARMSubtarget *Subtarget) {
16829 if (!Subtarget->hasNEON())
16830 return SDValue();
16831
16832 SDValue Op = N->getOperand(0);
16833 unsigned OpOpcode = Op.getNode()->getOpcode();
16834 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
16835 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
16836 return SDValue();
16837
16838 SDValue ConstVec = N->getOperand(1);
16839 if (!isa<BuildVectorSDNode>(ConstVec))
16840 return SDValue();
16841
16842 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
16843 uint32_t FloatBits = FloatTy.getSizeInBits();
16844 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
16845 uint32_t IntBits = IntTy.getSizeInBits();
16846 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16847 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16848 // These instructions only exist converting from i32 to f32. We can handle
16849 // smaller integers by generating an extra extend, but larger ones would
16850 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16851 // these instructions only support v2i32/v4i32 types.
16852 return SDValue();
16853 }
16854
16855 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
16856 APFloat Recip(0.0f);
16857 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
16858 return SDValue();
16859
16860 bool IsExact;
16861 APSInt IntVal(33);
16862 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
16863 APFloat::opOK ||
16864 !IsExact)
16865 return SDValue();
16866
16867 int32_t C = IntVal.exactLogBase2();
16868 if (C == -1 || C == 0 || C > 32)
16869 return SDValue();
16870
16871 SDLoc DL(N);
16872 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
16873 SDValue ConvInput = Op.getOperand(0);
16874 if (IntBits < FloatBits)
16875 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
16876 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
16877
16878 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
16879 : Intrinsic::arm_neon_vcvtfxu2fp;
16880 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
16881 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
16882 DAG.getConstant(C, DL, MVT::i32));
16883}
16884
16885 static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
16886 const ARMSubtarget *ST) {
16887 if (!ST->hasMVEIntegerOps())
16888 return SDValue();
16889
16890 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
16891 EVT ResVT = N->getValueType(0);
16892 SDValue N0 = N->getOperand(0);
16893 SDLoc dl(N);
16894
16895 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
16896 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
16897 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
16898 N0.getValueType() == MVT::v16i8)) {
16899 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
16900 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
16901 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
16902 }
16903
16904 // We are looking for something that will have illegal types if left alone,
16905 // but that we can convert to a single instruction under MVE. For example
16906 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
16907 // or
16908 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
16909
16910 // The legal cases are:
16911 // VADDV u/s 8/16/32
16912 // VMLAV u/s 8/16/32
16913 // VADDLV u/s 32
16914 // VMLALV u/s 16/32
16915
16916 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
16917 // extend it and use v4i32 instead.
16918 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
16919 EVT AVT = A.getValueType();
16920 return any_of(ExtTypes, [&](MVT Ty) {
16921 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
16922 AVT.bitsLE(Ty);
16923 });
16924 };
16925 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
16926 EVT AVT = A.getValueType();
16927 if (!AVT.is128BitVector())
16928 A = DAG.getNode(ExtendCode, dl,
16929 AVT.changeVectorElementType(MVT::getIntegerVT(
16930 128 / AVT.getVectorMinNumElements())),
16931 A);
16932 return A;
16933 };
16934 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
16935 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
16936 return SDValue();
16937 SDValue A = N0->getOperand(0);
16938 if (ExtTypeMatches(A, ExtTypes))
16939 return ExtendIfNeeded(A, ExtendCode);
16940 return SDValue();
16941 };
16942 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
16943 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
16944 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
16945 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
16946 return SDValue();
16947 Mask = N0->getOperand(0);
16948 SDValue Ext = N0->getOperand(1);
16949 if (Ext->getOpcode() != ExtendCode)
16950 return SDValue();
16951 SDValue A = Ext->getOperand(0);
16952 if (ExtTypeMatches(A, ExtTypes))
16953 return ExtendIfNeeded(A, ExtendCode);
16954 return SDValue();
16955 };
16956 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
16957 SDValue &A, SDValue &B) {
16958 // For a vmla we are trying to match a larger pattern:
16959 // ExtA = sext/zext A
16960 // ExtB = sext/zext B
16961 // Mul = mul ExtA, ExtB
16962 // vecreduce.add Mul
16963 // There might also be an extra extend between the mul and the addreduce, so
16964 // long as the bitwidth is high enough to make them equivalent (for example
16965 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
16966 if (ResVT != RetTy)
16967 return false;
16968 SDValue Mul = N0;
16969 if (Mul->getOpcode() == ExtendCode &&
16970 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
16971 ResVT.getScalarSizeInBits())
16972 Mul = Mul->getOperand(0);
16973 if (Mul->getOpcode() != ISD::MUL)
16974 return false;
16975 SDValue ExtA = Mul->getOperand(0);
16976 SDValue ExtB = Mul->getOperand(1);
16977 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
16978 return false;
16979 A = ExtA->getOperand(0);
16980 B = ExtB->getOperand(0);
16981 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
16982 A = ExtendIfNeeded(A, ExtendCode);
16983 B = ExtendIfNeeded(B, ExtendCode);
16984 return true;
16985 }
16986 return false;
16987 };
16988 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
16989 SDValue &A, SDValue &B, SDValue &Mask) {
16990 // Same as the pattern above with a select for the zero predicated lanes
16991 // ExtA = sext/zext A
16992 // ExtB = sext/zext B
16993 // Mul = mul ExtA, ExtB
16994 // N0 = select Mask, Mul, 0
16995 // vecreduce.add N0
16996 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
16997 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
16998 return false;
16999 Mask = N0->getOperand(0);
17000 SDValue Mul = N0->getOperand(1);
17001 if (Mul->getOpcode() == ExtendCode &&
17002 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17003 ResVT.getScalarSizeInBits())
17004 Mul = Mul->getOperand(0);
17005 if (Mul->getOpcode() != ISD::MUL)
17006 return false;
17007 SDValue ExtA = Mul->getOperand(0);
17008 SDValue ExtB = Mul->getOperand(1);
17009 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17010 return false;
17011 A = ExtA->getOperand(0);
17012 B = ExtB->getOperand(0);
17013 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17014 A = ExtendIfNeeded(A, ExtendCode);
17015 B = ExtendIfNeeded(B, ExtendCode);
17016 return true;
17017 }
17018 return false;
17019 };
17020 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17021 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17022 // reductions. The operands are extended with MVEEXT, but as they are
17023 // reductions the lane orders do not matter. MVEEXT may be combined with
17024 // loads to produce two extending loads, or else they will be expanded to
17025 // VREV/VMOVL.
17026 EVT VT = Ops[0].getValueType();
17027 if (VT == MVT::v16i8) {
17028 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17029 "Unexpected illegal long reduction opcode");
17030 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17031
17032 SDValue Ext0 =
17033 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17034 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17035 SDValue Ext1 =
17036 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17037 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17038
17039 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17040 Ext0, Ext1);
17041 SDValue MLA1 =
17042 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17043 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17044 Ext0.getValue(1), Ext1.getValue(1));
17045 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17046 }
17047 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17048 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17049 SDValue(Node.getNode(), 1));
17050 };
17051
17052 SDValue A, B;
17053 SDValue Mask;
17054 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17055 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17056 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17057 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17058 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17059 A, B))
17060 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17061 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17062 A, B))
17063 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17064 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17065 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17066 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17067 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17068 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17069 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17070
17071 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17072 Mask))
17073 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17074 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17075 Mask))
17076 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17077 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17078 Mask))
17079 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17080 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17081 Mask))
17082 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17083 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17084 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17085 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17086 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17087 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17088 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17089
17090 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17091 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17092 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17093 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17094 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17095 return Create64bitNode(ARMISD::VADDLVs, {A});
17096 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17097 return Create64bitNode(ARMISD::VADDLVu, {A});
17098 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17099 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17100 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17101 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17102 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17103 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17104
17105 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17106 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17107 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17108 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17109 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17110 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17111 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17112 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17113 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17114 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17115 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17116 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17117 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17118 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17119
17120 // Some complications. We can get a case where the two inputs of the mul are
17121 // the same, then the output sext will have been helpfully converted to a
17122 // zext. Turn it back.
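// For example, in vecreduce_add(zext(mul(sext(A), sext(A)))) the product of
// two equal sign-extended values is non-negative, so the outer sext may have
// been rewritten to a zext; recreating the sext lets the signed patterns
// above match again.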
17123 SDValue Op = N0;
17124 if (Op->getOpcode() == ISD::VSELECT)
17125 Op = Op->getOperand(1);
17126 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17127 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17128 SDValue Mul = Op->getOperand(0);
17129 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17130 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
17131 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
17132 if (Op != N0)
17133 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17134 N0->getOperand(0), Ext, N0->getOperand(2));
17135 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17136 }
17137 }
17138
17139 return SDValue();
17140}
17141
17142// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17143// the lanes are used. Due to the reduction being commutative the shuffle can be
17144// removed.
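// For example, vaddv(shuffle x, undef, <3,2,1,0>) adds up exactly the same
// lanes as vaddv(x), so the shuffle can be dropped (and likewise for both
// operands of a vmlav when they use the same mask).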
17146 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17147 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17148 if (!Shuf || !Shuf->getOperand(1).isUndef())
17149 return SDValue();
17150
17151 // Check all elements are used once in the mask.
17152 ArrayRef<int> Mask = Shuf->getMask();
17153 APInt SetElts(Mask.size(), 0);
17154 for (int E : Mask) {
17155 if (E < 0 || E >= (int)Mask.size())
17156 return SDValue();
17157 SetElts.setBit(E);
17158 }
17159 if (!SetElts.isAllOnes())
17160 return SDValue();
17161
17162 if (N->getNumOperands() != VecOp + 1) {
17163 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17164 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17165 return SDValue();
17166 }
17167
17168 SmallVector<SDValue> Ops;
17169 for (SDValue Op : N->ops()) {
17170 if (Op.getValueType().isVector())
17171 Ops.push_back(Op.getOperand(0));
17172 else
17173 Ops.push_back(Op);
17174 }
17175 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17176}
17177
17178 static SDValue PerformVMOVNCombine(SDNode *N,
17179 TargetLowering::DAGCombinerInfo &DCI) {
17180 SDValue Op0 = N->getOperand(0);
17181 SDValue Op1 = N->getOperand(1);
17182 unsigned IsTop = N->getConstantOperandVal(2);
17183
17184 // VMOVNT a undef -> a
17185 // VMOVNB a undef -> a
17186 // VMOVNB undef a -> a
17187 if (Op1->isUndef())
17188 return Op0;
17189 if (Op0->isUndef() && !IsTop)
17190 return Op1;
17191
17192 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17193 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
17194 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17195 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17196 Op1->getConstantOperandVal(2) == 0)
17197 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17198 Op0, Op1->getOperand(1), N->getOperand(2));
17199
17200 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17201 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17202 // into the top or bottom lanes.
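// For example, for a v8i16 VMOVNt only the even lanes of Qm are read and
// only the even lanes of Qd are kept, so the remaining lanes of both
// operands can be simplified away.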
17203 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17204 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17205 APInt Op0DemandedElts =
17206 IsTop ? Op1DemandedElts
17207 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17208
17209 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17210 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17211 return SDValue(N, 0);
17212 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17213 return SDValue(N, 0);
17214
17215 return SDValue();
17216}
17217
17218 static SDValue PerformVQMOVNCombine(SDNode *N,
17219 TargetLowering::DAGCombinerInfo &DCI) {
17220 SDValue Op0 = N->getOperand(0);
17221 unsigned IsTop = N->getConstantOperandVal(2);
17222
17223 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17224 APInt Op0DemandedElts =
17225 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17226 : APInt::getHighBitsSet(2, 1));
17227
17228 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17229 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17230 return SDValue(N, 0);
17231 return SDValue();
17232}
17233
17234 static SDValue PerformVQDMULHCombine(SDNode *N,
17235 TargetLowering::DAGCombinerInfo &DCI) {
17236 EVT VT = N->getValueType(0);
17237 SDValue LHS = N->getOperand(0);
17238 SDValue RHS = N->getOperand(1);
17239
17240 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17241 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17242 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
17243 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17244 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17245 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17246 SDLoc DL(N);
17247 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17248 LHS.getOperand(0), RHS.getOperand(0));
17249 SDValue UndefV = LHS.getOperand(1);
17250 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17251 }
17252 return SDValue();
17253}
17254
17255static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
17256 SDLoc DL(N);
17257 SDValue Op0 = N->getOperand(0);
17258 SDValue Op1 = N->getOperand(1);
17259
17260 // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
17261 // uses of the intrinsics.
17262 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17263 int ShiftAmt = C->getSExtValue();
17264 if (ShiftAmt == 0) {
17265 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17266 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17267 return SDValue();
17268 }
17269
17270 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17271 unsigned NewOpcode =
17272 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17273 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17274 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17275 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17276 return NewShift;
17277 }
17278 }
17279
17280 return SDValue();
17281}
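// A minimal scalar model of the negative-shift rewrite above (illustrative only;
// LongShiftModel is not an LLVM helper). The (lo, hi) pair handled by LSLL/LSRL is
// treated as a single 64-bit value, and the intrinsic convention that a negative
// amount shifts in the opposite direction is assumed, which is what lets an LSLL
// by -C be re-expressed as an LSRL by C (and vice versa).
static uint64_t LongShiftModel(uint64_t X, int Amt) { // requires |Amt| < 64
  if (Amt >= 0)
    return X << Amt; // LSLL by Amt
  return X >> -Amt;  // rewritten as LSRL by -Amt
}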
17282
17283/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
17284SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
17285 DAGCombinerInfo &DCI) const {
17286 SelectionDAG &DAG = DCI.DAG;
17287 unsigned IntNo = N->getConstantOperandVal(0);
17288 switch (IntNo) {
17289 default:
17290 // Don't do anything for most intrinsics.
17291 break;
17292
17293 // Vector shifts: check for immediate versions and lower them.
17294 // Note: This is done during DAG combining instead of DAG legalizing because
17295 // the build_vectors for 64-bit vector element shift counts are generally
17296 // not legal, and it is hard to see their values after they get legalized to
17297 // loads from a constant pool.
17298 case Intrinsic::arm_neon_vshifts:
17299 case Intrinsic::arm_neon_vshiftu:
17300 case Intrinsic::arm_neon_vrshifts:
17301 case Intrinsic::arm_neon_vrshiftu:
17302 case Intrinsic::arm_neon_vrshiftn:
17303 case Intrinsic::arm_neon_vqshifts:
17304 case Intrinsic::arm_neon_vqshiftu:
17305 case Intrinsic::arm_neon_vqshiftsu:
17306 case Intrinsic::arm_neon_vqshiftns:
17307 case Intrinsic::arm_neon_vqshiftnu:
17308 case Intrinsic::arm_neon_vqshiftnsu:
17309 case Intrinsic::arm_neon_vqrshiftns:
17310 case Intrinsic::arm_neon_vqrshiftnu:
17311 case Intrinsic::arm_neon_vqrshiftnsu: {
17312 EVT VT = N->getOperand(1).getValueType();
17313 int64_t Cnt;
17314 unsigned VShiftOpc = 0;
17315
17316 switch (IntNo) {
17317 case Intrinsic::arm_neon_vshifts:
17318 case Intrinsic::arm_neon_vshiftu:
17319 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17320 VShiftOpc = ARMISD::VSHLIMM;
17321 break;
17322 }
17323 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17324 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17325 : ARMISD::VSHRuIMM);
17326 break;
17327 }
17328 return SDValue();
17329
17330 case Intrinsic::arm_neon_vrshifts:
17331 case Intrinsic::arm_neon_vrshiftu:
17332 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17333 break;
17334 return SDValue();
17335
17336 case Intrinsic::arm_neon_vqshifts:
17337 case Intrinsic::arm_neon_vqshiftu:
17338 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17339 break;
17340 return SDValue();
17341
17342 case Intrinsic::arm_neon_vqshiftsu:
17343 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17344 break;
17345 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17346
17347 case Intrinsic::arm_neon_vrshiftn:
17348 case Intrinsic::arm_neon_vqshiftns:
17349 case Intrinsic::arm_neon_vqshiftnu:
17350 case Intrinsic::arm_neon_vqshiftnsu:
17351 case Intrinsic::arm_neon_vqrshiftns:
17352 case Intrinsic::arm_neon_vqrshiftnu:
17353 case Intrinsic::arm_neon_vqrshiftnsu:
17354 // Narrowing shifts require an immediate right shift.
17355 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17356 break;
17357 llvm_unreachable("invalid shift count for narrowing vector shift "
17358 "intrinsic");
17359
17360 default:
17361 llvm_unreachable("unhandled vector shift");
17362 }
17363
17364 switch (IntNo) {
17365 case Intrinsic::arm_neon_vshifts:
17366 case Intrinsic::arm_neon_vshiftu:
17367 // Opcode already set above.
17368 break;
17369 case Intrinsic::arm_neon_vrshifts:
17370 VShiftOpc = ARMISD::VRSHRsIMM;
17371 break;
17372 case Intrinsic::arm_neon_vrshiftu:
17373 VShiftOpc = ARMISD::VRSHRuIMM;
17374 break;
17375 case Intrinsic::arm_neon_vrshiftn:
17376 VShiftOpc = ARMISD::VRSHRNIMM;
17377 break;
17378 case Intrinsic::arm_neon_vqshifts:
17379 VShiftOpc = ARMISD::VQSHLsIMM;
17380 break;
17381 case Intrinsic::arm_neon_vqshiftu:
17382 VShiftOpc = ARMISD::VQSHLuIMM;
17383 break;
17384 case Intrinsic::arm_neon_vqshiftsu:
17385 VShiftOpc = ARMISD::VQSHLsuIMM;
17386 break;
17387 case Intrinsic::arm_neon_vqshiftns:
17388 VShiftOpc = ARMISD::VQSHRNsIMM;
17389 break;
17390 case Intrinsic::arm_neon_vqshiftnu:
17391 VShiftOpc = ARMISD::VQSHRNuIMM;
17392 break;
17393 case Intrinsic::arm_neon_vqshiftnsu:
17394 VShiftOpc = ARMISD::VQSHRNsuIMM;
17395 break;
17396 case Intrinsic::arm_neon_vqrshiftns:
17397 VShiftOpc = ARMISD::VQRSHRNsIMM;
17398 break;
17399 case Intrinsic::arm_neon_vqrshiftnu:
17400 VShiftOpc = ARMISD::VQRSHRNuIMM;
17401 break;
17402 case Intrinsic::arm_neon_vqrshiftnsu:
17403 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17404 break;
17405 }
17406
17407 SDLoc dl(N);
17408 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17409 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17410 }
17411
17412 case Intrinsic::arm_neon_vshiftins: {
17413 EVT VT = N->getOperand(1).getValueType();
17414 int64_t Cnt;
17415 unsigned VShiftOpc = 0;
17416
17417 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17418 VShiftOpc = ARMISD::VSLIIMM;
17419 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17420 VShiftOpc = ARMISD::VSRIIMM;
17421 else {
17422 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17423 }
17424
17425 SDLoc dl(N);
17426 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17427 N->getOperand(1), N->getOperand(2),
17428 DAG.getConstant(Cnt, dl, MVT::i32));
17429 }
17430
17431 case Intrinsic::arm_neon_vqrshifts:
17432 case Intrinsic::arm_neon_vqrshiftu:
17433 // No immediate versions of these to check for.
17434 break;
17435
17436 case Intrinsic::arm_neon_vbsl: {
17437 SDLoc dl(N);
17438 return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
17439 N->getOperand(2), N->getOperand(3));
17440 }
17441 case Intrinsic::arm_mve_vqdmlah:
17442 case Intrinsic::arm_mve_vqdmlash:
17443 case Intrinsic::arm_mve_vqrdmlah:
17444 case Intrinsic::arm_mve_vqrdmlash:
17445 case Intrinsic::arm_mve_vmla_n_predicated:
17446 case Intrinsic::arm_mve_vmlas_n_predicated:
17447 case Intrinsic::arm_mve_vqdmlah_predicated:
17448 case Intrinsic::arm_mve_vqdmlash_predicated:
17449 case Intrinsic::arm_mve_vqrdmlah_predicated:
17450 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17451 // These intrinsics all take an i32 scalar operand which is narrowed to the
17452 // size of a single lane of the vector type they return. So we don't need
17453 // any bits of that operand above that point, which allows us to eliminate
17454 // uxth/sxth.
17455 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17456 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17457 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17458 return SDValue();
17459 break;
17460 }
17461
17462 case Intrinsic::arm_mve_minv:
17463 case Intrinsic::arm_mve_maxv:
17464 case Intrinsic::arm_mve_minav:
17465 case Intrinsic::arm_mve_maxav:
17466 case Intrinsic::arm_mve_minv_predicated:
17467 case Intrinsic::arm_mve_maxv_predicated:
17468 case Intrinsic::arm_mve_minav_predicated:
17469 case Intrinsic::arm_mve_maxav_predicated: {
17470 // These intrinsics all take an i32 scalar operand which is narrowed to the
17471 // size of a single lane of the vector type they take as the other input.
17472 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17473 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17474 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17475 return SDValue();
17476 break;
17477 }
17478
17479 case Intrinsic::arm_mve_addv: {
17480 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17481 // which allows PerformADDVecReduce to turn it into VADDLV when possible.
17482 bool Unsigned = N->getConstantOperandVal(2);
17483 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17484 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17485 }
17486
17487 case Intrinsic::arm_mve_addlv:
17488 case Intrinsic::arm_mve_addlv_predicated: {
17489 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17490 // which recombines the two outputs into an i64
17491 bool Unsigned = N->getConstantOperandVal(2);
17492 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17493 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
17494 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
17495
17496 SmallVector<SDValue, 4> Ops;
17497 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17498 if (i != 2) // skip the unsigned flag
17499 Ops.push_back(N->getOperand(i));
17500
17501 SDLoc dl(N);
17502 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17503 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17504 val.getValue(1));
17505 }
17506 }
17507
17508 return SDValue();
17509}
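// Sketch of the demanded-bits reasoning used for the MVE scalar operands above
// (illustrative only; the helper is hypothetical). If a consumer only reads the
// low lane-width bits of the i32 scalar, a preceding sxth/uxth is redundant,
// because sign or zero extension never changes the low bits.
static bool LowBitsUnchangedBySxth(uint32_t X) {
  uint32_t Sext = (uint32_t)(int32_t)(int16_t)X; // what sxth would produce
  return (Sext & 0xFFFFu) == (X & 0xFFFFu);      // low 16 bits are identical
}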
17510
17511/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17512/// lowers them. As with the vector shift intrinsics, this is done during DAG
17513/// combining instead of DAG legalizing because the build_vectors for 64-bit
17514/// vector element shift counts are generally not legal, and it is hard to see
17515/// their values after they get legalized to loads from a constant pool.
17516static SDValue PerformShiftCombine(SDNode *N,
17517 TargetLowering::DAGCombinerInfo &DCI,
17518 const ARMSubtarget *ST) {
17519 SelectionDAG &DAG = DCI.DAG;
17520 EVT VT = N->getValueType(0);
17521
17522 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17523 N->getOperand(0)->getOpcode() == ISD::AND &&
17524 N->getOperand(0)->hasOneUse()) {
17525 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17526 return SDValue();
17527 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17528 // usually show up because instcombine prefers to canonicalize it to
17529 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17530 // out of GEP lowering in some cases.
17531 SDValue N0 = N->getOperand(0);
17532 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17533 if (!ShiftAmtNode)
17534 return SDValue();
17535 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17536 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17537 if (!AndMaskNode)
17538 return SDValue();
17539 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17540 // Don't transform uxtb/uxth.
17541 if (AndMask == 255 || AndMask == 65535)
17542 return SDValue();
17543 if (isMask_32(AndMask)) {
17544 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17545 if (MaskedBits > ShiftAmt) {
17546 SDLoc DL(N);
17547 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17548 DAG.getConstant(MaskedBits, DL, MVT::i32));
17549 return DAG.getNode(
17550 ISD::SRL, DL, MVT::i32, SHL,
17551 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17552 }
17553 }
17554 }
17555
17556 // Nothing to be done for scalar shifts.
17557 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17558 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17559 return SDValue();
17560 if (ST->hasMVEIntegerOps())
17561 return SDValue();
17562
17563 int64_t Cnt;
17564
17565 switch (N->getOpcode()) {
17566 default: llvm_unreachable("unexpected shift opcode");
17567
17568 case ISD::SHL:
17569 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17570 SDLoc dl(N);
17571 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17572 DAG.getConstant(Cnt, dl, MVT::i32));
17573 }
17574 break;
17575
17576 case ISD::SRA:
17577 case ISD::SRL:
17578 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17579 unsigned VShiftOpc =
17580 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17581 SDLoc dl(N);
17582 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17583 DAG.getConstant(Cnt, dl, MVT::i32));
17584 }
17585 }
17586 return SDValue();
17587}
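// Scalar check of the Thumb1 rewrite above (illustrative only; the helper is
// hypothetical). With a low-bit mask M = (1 << K) - 1, 1 <= K < 32, and
// MaskedBits = 32 - K, the combine relies on the identity
//   ((x & M) << S) == ((x << MaskedBits) >> (MaskedBits - S))
// whenever MaskedBits > S: the left shift discards exactly the bits the mask
// would have cleared, and the right shift moves the rest back to bit S.
static bool ShlOfMaskIdentityHolds(uint32_t X, unsigned K, unsigned S) {
  uint32_t Mask = (1u << K) - 1;
  unsigned MaskedBits = 32 - K;
  if (MaskedBits <= S)
    return true; // the combine is not applied in this case
  return ((X & Mask) << S) == ((X << MaskedBits) >> (MaskedBits - S));
}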
17588
17589// Look for a sign/zero/fp extend of a larger-than-legal load. This can be
17590// split into multiple extending loads, which are simpler to deal with than an
17591// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17592// to convert the type to an f32.
17593static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
17594 SDValue N0 = N->getOperand(0);
17595 if (N0.getOpcode() != ISD::LOAD)
17596 return SDValue();
17597 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
17598 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17599 LD->getExtensionType() != ISD::NON_EXTLOAD)
17600 return SDValue();
17601 EVT FromVT = LD->getValueType(0);
17602 EVT ToVT = N->getValueType(0);
17603 if (!ToVT.isVector())
17604 return SDValue();
17606 EVT ToEltVT = ToVT.getVectorElementType();
17607 EVT FromEltVT = FromVT.getVectorElementType();
17608
17609 unsigned NumElements = 0;
17610 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17611 NumElements = 4;
17612 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17613 NumElements = 4;
17614 if (NumElements == 0 ||
17615 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17616 FromVT.getVectorNumElements() % NumElements != 0 ||
17617 !isPowerOf2_32(NumElements))
17618 return SDValue();
17619
17620 LLVMContext &C = *DAG.getContext();
17621 SDLoc DL(LD);
17622 // Details about the old load
17623 SDValue Ch = LD->getChain();
17624 SDValue BasePtr = LD->getBasePtr();
17625 Align Alignment = LD->getBaseAlign();
17626 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17627 AAMDNodes AAInfo = LD->getAAInfo();
17628
17629 ISD::LoadExtType NewExtType =
17630 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17631 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17632 EVT NewFromVT = EVT::getVectorVT(
17633 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17634 EVT NewToVT = EVT::getVectorVT(
17635 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17636
17637 SmallVector<SDValue, 4> Loads;
17638 SmallVector<SDValue, 4> Chains;
17639 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17640 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17641 SDValue NewPtr =
17642 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17643
17644 SDValue NewLoad =
17645 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17646 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17647 Alignment, MMOFlags, AAInfo);
17648 Loads.push_back(NewLoad);
17649 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17650 }
17651
17652 // Float truncs need to be extended with VCVTLs into their floating point types.
17653 if (FromEltVT == MVT::f16) {
17654 SmallVector<SDValue, 4> Extends;
17655
17656 for (unsigned i = 0; i < Loads.size(); i++) {
17657 SDValue LoadBC =
17658 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17659 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17660 DAG.getConstant(0, DL, MVT::i32));
17661 Extends.push_back(FPExt);
17662 }
17663
17664 Loads = Extends;
17665 }
17666
17667 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17668 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17669 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
17670}
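// Scalar model of one case handled above (illustrative only; the helper is
// hypothetical): a sign extend of a wider-than-legal i8 vector load to i32,
// e.g. v8i8 -> v8i32, becomes two four-element extending loads that read
// consecutive 4-byte chunks, so both lane order and the loaded bytes are kept.
static void SplitSextLoadModel(const int8_t *Mem, int32_t Out[8]) {
  for (unsigned I = 0; I < 4; ++I)
    Out[I] = Mem[I];         // first extending load, at offset 0
  for (unsigned I = 0; I < 4; ++I)
    Out[4 + I] = Mem[4 + I]; // second extending load, at offset 4 bytes
}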
17671
17672/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
17673/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
17674static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
17675 const ARMSubtarget *ST) {
17676 SDValue N0 = N->getOperand(0);
17677
17678 // Check for sign- and zero-extensions of vector extract operations of 8- and
17679 // 16-bit vector elements. NEON and MVE support these directly. They are
17680 // handled during DAG combining because type legalization will promote them
17681 // to 32-bit types and it is messy to recognize the operations after that.
17682 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
17683 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
17684 SDValue Vec = N0.getOperand(0);
17685 SDValue Lane = N0.getOperand(1);
17686 EVT VT = N->getValueType(0);
17687 EVT EltVT = N0.getValueType();
17688 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17689
17690 if (VT == MVT::i32 &&
17691 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
17692 TLI.isTypeLegal(Vec.getValueType()) &&
17693 isa<ConstantSDNode>(Lane)) {
17694
17695 unsigned Opc = 0;
17696 switch (N->getOpcode()) {
17697 default: llvm_unreachable("unexpected opcode");
17698 case ISD::SIGN_EXTEND:
17699 Opc = ARMISD::VGETLANEs;
17700 break;
17701 case ISD::ZERO_EXTEND:
17702 case ISD::ANY_EXTEND:
17703 Opc = ARMISD::VGETLANEu;
17704 break;
17705 }
17706 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
17707 }
17708 }
17709
17710 if (ST->hasMVEIntegerOps())
17711 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17712 return NewLoad;
17713
17714 return SDValue();
17715}
17716
17717static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
17718 const ARMSubtarget *ST) {
17719 if (ST->hasMVEFloatOps())
17720 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17721 return NewLoad;
17722
17723 return SDValue();
17724}
17725
17726// Lower smin(smax(x, C1), C2) to ssat or usat when C1 and C2 form saturating
17727// constant bounds.
17728static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG,
17729 const ARMSubtarget *Subtarget) {
17730 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
17731 !Subtarget->isThumb2())
17732 return SDValue();
17733
17734 EVT VT = Op.getValueType();
17735 SDValue Op0 = Op.getOperand(0);
17736
17737 if (VT != MVT::i32 ||
17738 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
17739 !isa<ConstantSDNode>(Op.getOperand(1)) ||
17740 !isa<ConstantSDNode>(Op0.getOperand(1)))
17741 return SDValue();
17742
17743 SDValue Min = Op;
17744 SDValue Max = Op0;
17745 SDValue Input = Op0.getOperand(0);
17746 if (Min.getOpcode() == ISD::SMAX)
17747 std::swap(Min, Max);
17748
17749 APInt MinC = Min.getConstantOperandAPInt(1);
17750 APInt MaxC = Max.getConstantOperandAPInt(1);
17751
17752 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
17753 !(MinC + 1).isPowerOf2())
17754 return SDValue();
17755
17756 SDLoc DL(Op);
17757 if (MinC == ~MaxC)
17758 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
17759 DAG.getConstant(MinC.countr_one(), DL, VT));
17760 if (MaxC == 0)
17761 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
17762 DAG.getConstant(MinC.countr_one(), DL, VT));
17763
17764 return SDValue();
17765}
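// Scalar models of the saturating bounds recognised above (illustrative only;
// the helpers are hypothetical). smin(smax(x, -2^(N-1)), 2^(N-1)-1) clamps x to
// a signed N-bit range and smin(smax(x, 0), 2^N-1) to an unsigned N-bit range;
// the number of trailing ones in the upper bound MinC is N-1 in the signed case
// and N in the unsigned case.
static int32_t SsatModel(int32_t X, unsigned N) { // 1 <= N <= 32
  int64_t Hi = (int64_t(1) << (N - 1)) - 1;
  int64_t Lo = ~Hi;
  return (int32_t)std::clamp<int64_t>(X, Lo, Hi);
}
static int32_t UsatModel(int32_t X, unsigned N) { // 0 <= N <= 31
  int64_t Hi = (int64_t(1) << N) - 1;
  return (int32_t)std::clamp<int64_t>(X, 0, Hi);
}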
17766
17767/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
17768/// saturates.
17769static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
17770 const ARMSubtarget *ST) {
17771 EVT VT = N->getValueType(0);
17772 SDValue N0 = N->getOperand(0);
17773
17774 if (VT == MVT::i32)
17775 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
17776
17777 if (!ST->hasMVEIntegerOps())
17778 return SDValue();
17779
17780 if (SDValue V = PerformVQDMULHCombine(N, DAG))
17781 return V;
17782
17783 if (VT != MVT::v4i32 && VT != MVT::v8i16)
17784 return SDValue();
17785
17786 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
17787 // Check one is a smin and the other is a smax
17788 if (Min->getOpcode() != ISD::SMIN)
17789 std::swap(Min, Max);
17790 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
17791 return false;
17792
17793 APInt SaturateC;
17794 if (VT == MVT::v4i32)
17795 SaturateC = APInt(32, (1 << 15) - 1, true);
17796 else //if (VT == MVT::v8i16)
17797 SaturateC = APInt(16, (1 << 7) - 1, true);
17798
17799 APInt MinC, MaxC;
17800 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
17801 MinC != SaturateC)
17802 return false;
17803 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
17804 MaxC != ~SaturateC)
17805 return false;
17806 return true;
17807 };
17808
17809 if (IsSignedSaturate(N, N0.getNode())) {
17810 SDLoc DL(N);
17811 MVT ExtVT, HalfVT;
17812 if (VT == MVT::v4i32) {
17813 HalfVT = MVT::v8i16;
17814 ExtVT = MVT::v4i16;
17815 } else { // if (VT == MVT::v8i16)
17816 HalfVT = MVT::v16i8;
17817 ExtVT = MVT::v8i8;
17818 }
17819
17820 // Create a VQMOVNB with undef top lanes, then sign-extend it into the top
17821 // half. That extend will hopefully be removed if only the bottom bits are
17822 // demanded (through a truncating store, for example).
17823 SDValue VQMOVN =
17824 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
17825 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
17826 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
17827 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
17828 DAG.getValueType(ExtVT));
17829 }
17830
17831 auto IsUnsignedSaturate = [&](SDNode *Min) {
17832 // For unsigned, we just need to check for <= 0xffff
17833 if (Min->getOpcode() != ISD::UMIN)
17834 return false;
17835
17836 APInt SaturateC;
17837 if (VT == MVT::v4i32)
17838 SaturateC = APInt(32, (1 << 16) - 1, true);
17839 else //if (VT == MVT::v8i16)
17840 SaturateC = APInt(16, (1 << 8) - 1, true);
17841
17842 APInt MinC;
17843 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
17844 MinC != SaturateC)
17845 return false;
17846 return true;
17847 };
17848
17849 if (IsUnsignedSaturate(N)) {
17850 SDLoc DL(N);
17851 MVT HalfVT;
17852 unsigned ExtConst;
17853 if (VT == MVT::v4i32) {
17854 HalfVT = MVT::v8i16;
17855 ExtConst = 0x0000FFFF;
17856 } else { //if (VT == MVT::v8i16)
17857 HalfVT = MVT::v16i8;
17858 ExtConst = 0x00FF;
17859 }
17860
17861 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
17862 // an AND. That extend will hopefully be removed if only the bottom bits are
17863 // demanded (through a truncating store, for example).
17864 SDValue VQMOVN =
17865 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
17866 DAG.getConstant(0, DL, MVT::i32));
17867 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
17868 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
17869 DAG.getConstant(ExtConst, DL, VT));
17870 }
17871
17872 return SDValue();
17873}
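// Scalar model of the signed truncating saturate matched above (illustrative
// only; the helper is hypothetical): per element, clamping an i32 to the i16
// range and truncating is what the bottom-lane VQMOVN produces, so the
// smin/smax pair with bounds 0x7fff / ~0x7fff can be folded into it.
static int16_t SignedTruncSatModel(int32_t X) {
  return (int16_t)std::clamp(X, -32768, 32767);
}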
17874
17875static const APInt *isPowerOf2Constant(SDValue V) {
17876 const auto *C = dyn_cast<ConstantSDNode>(V);
17877 if (!C)
17878 return nullptr;
17879 const APInt *CV = &C->getAPIntValue();
17880 return CV->isPowerOf2() ? CV : nullptr;
17881}
17882
17883SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
17884 // If we have a CMOV, OR and AND combination such as:
17885 // if (x & CN)
17886 // y |= CM;
17887 //
17888 // And:
17889 // * CN is a single bit;
17890 // * All bits covered by CM are known zero in y
17891 //
17892 // Then we can convert this into a sequence of BFI instructions. This will
17893 // always be a win if CM is a single bit, will always be no worse than the
17894 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
17895 // three bits (due to the extra IT instruction).
17896
17897 SDValue Op0 = CMOV->getOperand(0);
17898 SDValue Op1 = CMOV->getOperand(1);
17899 auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
17900 SDValue CmpZ = CMOV->getOperand(3);
17901
17902 // The compare must be against zero.
17903 if (!isNullConstant(CmpZ->getOperand(1)))
17904 return SDValue();
17905
17906 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
17907 SDValue And = CmpZ->getOperand(0);
17908 if (And->getOpcode() != ISD::AND)
17909 return SDValue();
17910 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
17911 if (!AndC)
17912 return SDValue();
17913 SDValue X = And->getOperand(0);
17914
17915 if (CC == ARMCC::EQ) {
17916 // We're performing an "equal to zero" compare. Swap the operands so we
17917 // canonicalize on a "not equal to zero" compare.
17918 std::swap(Op0, Op1);
17919 } else {
17920 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
17921 }
17922
17923 if (Op1->getOpcode() != ISD::OR)
17924 return SDValue();
17925
17926 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
17927 if (!OrC)
17928 return SDValue();
17929 SDValue Y = Op1->getOperand(0);
17930
17931 if (Op0 != Y)
17932 return SDValue();
17933
17934 // Now, is it profitable to continue?
17935 APInt OrCI = OrC->getAPIntValue();
17936 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
17937 if (OrCI.popcount() > Heuristic)
17938 return SDValue();
17939
17940 // Lastly, can we determine that the bits defined by OrCI
17941 // are zero in Y?
17942 KnownBits Known = DAG.computeKnownBits(Y);
17943 if ((OrCI & Known.Zero) != OrCI)
17944 return SDValue();
17945
17946 // OK, we can do the combine.
17947 SDValue V = Y;
17948 SDLoc dl(X);
17949 EVT VT = X.getValueType();
17950 unsigned BitInX = AndC->logBase2();
17951
17952 if (BitInX != 0) {
17953 // We must shift X first.
17954 X = DAG.getNode(ISD::SRL, dl, VT, X,
17955 DAG.getConstant(BitInX, dl, VT));
17956 }
17957
17958 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
17959 BitInY < NumActiveBits; ++BitInY) {
17960 if (OrCI[BitInY] == 0)
17961 continue;
17962 APInt Mask(VT.getSizeInBits(), 0);
17963 Mask.setBit(BitInY);
17964 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
17965 // Confusingly, the operand is an *inverted* mask.
17966 DAG.getConstant(~Mask, dl, VT));
17967 }
17968
17969 return V;
17970}
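// Scalar sketch of the transformation above (illustrative only; BfiModel is
// not an LLVM helper). Inserting the tested bit of X into every set position
// of CM reproduces "if (x & CN) y |= CM" whenever those positions are already
// zero in Y, which is what the computeKnownBits check guarantees.
static uint32_t BfiModel(uint32_t X, uint32_t Y, unsigned BitInX, uint32_t CM) {
  uint32_t Bit = (X >> BitInX) & 1;        // the single tested bit of x
  uint32_t V = Y;
  for (unsigned I = 0; I < 32; ++I)
    if ((CM >> I) & 1)
      V = (V & ~(1u << I)) | (Bit << I);   // one BFI per set bit of CM
  return V; // == ((X >> BitInX) & 1) ? (Y | CM) : Y, given (Y & CM) == 0
}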
17971
17972// Given N, the value controlling the conditional branch, search for the loop
17973// intrinsic, returning it, along with how the value is used. We need to handle
17974// patterns such as the following:
17975// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
17976// (brcond (setcc (loop.decrement), 0, eq), exit)
17977// (brcond (setcc (loop.decrement), 0, ne), header)
17978static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
17979 bool &Negate) {
17980 switch (N->getOpcode()) {
17981 default:
17982 break;
17983 case ISD::XOR: {
17984 if (!isa<ConstantSDNode>(N.getOperand(1)))
17985 return SDValue();
17986 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
17987 return SDValue();
17988 Negate = !Negate;
17989 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
17990 }
17991 case ISD::SETCC: {
17992 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
17993 if (!Const)
17994 return SDValue();
17995 if (Const->isZero())
17996 Imm = 0;
17997 else if (Const->isOne())
17998 Imm = 1;
17999 else
18000 return SDValue();
18001 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
18002 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
18003 }
18004 case ISD::INTRINSIC_W_CHAIN: {
18005 unsigned IntOp = N.getConstantOperandVal(1);
18006 if (IntOp != Intrinsic::test_start_loop_iterations &&
18007 IntOp != Intrinsic::loop_decrement_reg)
18008 return SDValue();
18009 return N;
18010 }
18011 }
18012 return SDValue();
18013}
18014
18015static SDValue PerformHWLoopCombine(SDNode *N,
18016 TargetLowering::DAGCombinerInfo &DCI,
18017 const ARMSubtarget *ST) {
18018
18019 // The hwloop intrinsics that we're interested in are used for control flow,
18020 // either for entering or exiting the loop:
18021 // - test.start.loop.iterations will test whether its operand is zero. If it
18022 // is zero, the following branch should not enter the loop.
18023 // - loop.decrement.reg also tests whether its operand is zero. If it is
18024 // zero, the following branch should not branch back to the beginning of
18025 // the loop.
18026 // So here, we need to check how the brcond is using the result of each of
18027 // the intrinsics to ensure that we're branching to the right place at the
18028 // right time.
18029
18030 ISD::CondCode CC;
18031 SDValue Cond;
18032 int Imm = 1;
18033 bool Negate = false;
18034 SDValue Chain = N->getOperand(0);
18035 SDValue Dest;
18036
18037 if (N->getOpcode() == ISD::BRCOND) {
18038 CC = ISD::SETEQ;
18039 Cond = N->getOperand(1);
18040 Dest = N->getOperand(2);
18041 } else {
18042 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18043 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
18044 Cond = N->getOperand(2);
18045 Dest = N->getOperand(4);
18046 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
18047 if (!Const->isOne() && !Const->isZero())
18048 return SDValue();
18049 Imm = Const->getZExtValue();
18050 } else
18051 return SDValue();
18052 }
18053
18054 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
18055 if (!Int)
18056 return SDValue();
18057
18058 if (Negate)
18059 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18060
18061 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18062 return (CC == ISD::SETEQ && Imm == 0) ||
18063 (CC == ISD::SETNE && Imm == 1) ||
18064 (CC == ISD::SETLT && Imm == 1) ||
18065 (CC == ISD::SETULT && Imm == 1);
18066 };
18067
18068 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18069 return (CC == ISD::SETEQ && Imm == 1) ||
18070 (CC == ISD::SETNE && Imm == 0) ||
18071 (CC == ISD::SETGT && Imm == 0) ||
18072 (CC == ISD::SETUGT && Imm == 0) ||
18073 (CC == ISD::SETGE && Imm == 1) ||
18074 (CC == ISD::SETUGE && Imm == 1);
18075 };
18076
18077 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18078 "unsupported condition");
18079
18080 SDLoc dl(Int);
18081 SelectionDAG &DAG = DCI.DAG;
18082 SDValue Elements = Int.getOperand(2);
18083 unsigned IntOp = Int->getConstantOperandVal(1);
18084 assert((N->hasOneUse() && N->user_begin()->getOpcode() == ISD::BR) &&
18085 "expected single br user");
18086 SDNode *Br = *N->user_begin();
18087 SDValue OtherTarget = Br->getOperand(1);
18088
18089 // Update the unconditional branch to branch to the given Dest.
18090 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18091 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
18092 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18093 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
18094 };
18095
18096 if (IntOp == Intrinsic::test_start_loop_iterations) {
18097 SDValue Res;
18098 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18099 // We expect this 'instruction' to branch when the counter is zero.
18100 if (IsTrueIfZero(CC, Imm)) {
18101 SDValue Ops[] = {Chain, Setup, Dest};
18102 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18103 } else {
18104 // The logic is the reverse of what we need for WLS, so find the other
18105 // basic block target: the target of the following br.
18106 UpdateUncondBr(Br, Dest, DAG);
18107
18108 SDValue Ops[] = {Chain, Setup, OtherTarget};
18109 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18110 }
18111 // Update LR count to the new value
18112 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
18113 // Update chain
18114 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
18115 return Res;
18116 } else {
18117 SDValue Size =
18118 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18119 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
18120 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18121 DAG.getVTList(MVT::i32, MVT::Other), Args);
18122 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
18123
18124 // We expect this instruction to branch when the count is not zero.
18125 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18126
18127 // Update the unconditional branch to target the loop preheader if we've
18128 // found the condition has been reversed.
18129 if (Target == OtherTarget)
18130 UpdateUncondBr(Br, Dest, DAG);
18131
18132 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18133 SDValue(LoopDec.getNode(), 1), Chain);
18134
18135 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18136 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18137 }
18138 return SDValue();
18139}
18140
18141/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
18142SDValue
18143ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
18144 SDValue Cmp = N->getOperand(3);
18145 if (Cmp.getOpcode() != ARMISD::CMPZ)
18146 // Only looking at NE cases.
18147 return SDValue();
18148
18149 SDLoc dl(N);
18150 SDValue LHS = Cmp.getOperand(0);
18151 SDValue RHS = Cmp.getOperand(1);
18152 SDValue Chain = N->getOperand(0);
18153 SDValue BB = N->getOperand(1);
18154 SDValue ARMcc = N->getOperand(2);
18156
18157 // (brcond Chain BB ne (cmpz (and (cmov 0 1 CC Flags) 1) 0))
18158 // -> (brcond Chain BB CC Flags)
18159 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18160 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
18161 LHS->getOperand(0)->hasOneUse() &&
18162 isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
18163 isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
18164 isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
18165 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, BB,
18166 LHS->getOperand(0)->getOperand(2),
18167 LHS->getOperand(0)->getOperand(3));
18168 }
18169
18170 return SDValue();
18171}
18172
18173/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
18174SDValue
18175ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
18176 SDValue Cmp = N->getOperand(3);
18177 if (Cmp.getOpcode() != ARMISD::CMPZ)
18178 // Only looking at EQ and NE cases.
18179 return SDValue();
18180
18181 EVT VT = N->getValueType(0);
18182 SDLoc dl(N);
18183 SDValue LHS = Cmp.getOperand(0);
18184 SDValue RHS = Cmp.getOperand(1);
18185 SDValue FalseVal = N->getOperand(0);
18186 SDValue TrueVal = N->getOperand(1);
18187 SDValue ARMcc = N->getOperand(2);
18189
18190 // BFI is only available on V6T2+.
18191 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
18192 SDValue R = PerformCMOVToBFICombine(N, DAG);
18193 if (R)
18194 return R;
18195 }
18196
18197 // Simplify
18198 // mov r1, r0
18199 // cmp r1, x
18200 // mov r0, y
18201 // moveq r0, x
18202 // to
18203 // cmp r0, x
18204 // movne r0, y
18205 //
18206 // mov r1, r0
18207 // cmp r1, x
18208 // mov r0, x
18209 // movne r0, y
18210 // to
18211 // cmp r0, x
18212 // movne r0, y
18213 /// FIXME: Turn this into a target neutral optimization?
18214 SDValue Res;
18215 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18216 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, Cmp);
18217 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
18218 SDValue ARMcc;
18219 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
18220 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, NewCmp);
18221 }
18222
18223 // (cmov F T ne (cmpz (cmov 0 1 CC Flags) 0))
18224 // -> (cmov F T CC Flags)
18225 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18226 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18227 isNullConstant(RHS)) {
18228 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
18229 LHS->getOperand(2), LHS->getOperand(3));
18230 }
18231
18232 if (!VT.isInteger())
18233 return SDValue();
18234
18235 // Fold away an unnecessary CMPZ/CMOV
18236 // CMOV A, B, C1, (CMPZ (CMOV 1, 0, C2, D), 0) ->
18237 // if C1==EQ -> CMOV A, B, C2, D
18238 // if C1==NE -> CMOV A, B, NOT(C2), D
18239 if (N->getConstantOperandVal(2) == ARMCC::EQ ||
18240 N->getConstantOperandVal(2) == ARMCC::NE) {
18241 ARMCC::CondCodes Cond;
18242 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
18243 if (N->getConstantOperandVal(2) == ARMCC::NE)
18244 Cond = ARMCC::getOppositeCondition(Cond);
18245 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18246 N->getOperand(1),
18247 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
18248 }
18249 }
18250
18251 // Materialize a boolean comparison for integers so we can avoid branching.
18252 if (isNullConstant(FalseVal)) {
18253 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
18254 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
18255 // If x == y then x - y == 0 and ARM's CLZ will return 32; shifting that
18256 // right by 5 bits gives 1, otherwise it gives 0.
18257 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
18258 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18259 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18260 DAG.getConstant(5, dl, MVT::i32));
18261 } else {
18262 // CMOV 0, 1, ==, (CMPZ x, y) ->
18263 // (UADDO_CARRY (SUB x, y), t:0, t:1)
18264 // where t = (USUBO_CARRY 0, (SUB x, y), 0)
18265 //
18266 // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
18267 // x != y. In other words, a carry C == 1 when x == y, C == 0
18268 // otherwise.
18269 // The final UADDO_CARRY computes
18270 // x - y + (0 - (x - y)) + C == C
18271 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18272 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18273 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
18274 // ISD::USUBO_CARRY returns a borrow, but what we actually want here is
18275 // the carry.
18276 SDValue Carry =
18277 DAG.getNode(ISD::SUB, dl, MVT::i32,
18278 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18279 Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
18280 }
18281 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
18282 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
18283 // This seems pointless but will allow us to combine it further below.
18284 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18285 SDValue Sub =
18286 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18287 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
18288 Sub.getValue(1));
18289 FalseVal = Sub;
18290 }
18291 } else if (isNullConstant(TrueVal)) {
18292 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
18293 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
18294 // This seems pointless but will allow us to combine it further below.
18295 // Note that we change == to != as this is the dual of the case above.
18296 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18297 SDValue Sub =
18298 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18299 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18300 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18301 Sub.getValue(1));
18302 FalseVal = Sub;
18303 }
18304 }
18305
18306 // On Thumb1, the DAG above may be further combined if z is a power of 2
18307 // (z == 2 ^ K).
18308 // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
18309 // t1 = (USUBO (SUB x, y), 1)
18310 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18311 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18312 //
18313 // This also handles the special case of comparing against zero; it's
18314 // essentially the same pattern, except there's no SUBC:
18315 // CMOV x, z, !=, (CMPZ x, 0) ->
18316 // t1 = (USUBO x, 1)
18317 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18318 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18319 const APInt *TrueConst;
18320 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18321 ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
18322 FalseVal.getOperand(1) == RHS) ||
18323 (FalseVal == LHS && isNullConstant(RHS))) &&
18324 (TrueConst = isPowerOf2Constant(TrueVal))) {
18325 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18326 unsigned ShiftAmount = TrueConst->logBase2();
18327 if (ShiftAmount)
18328 TrueVal = DAG.getConstant(1, dl, VT);
18329 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
18330 Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
18331 Subc.getValue(1));
18332
18333 if (ShiftAmount)
18334 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18335 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18336 }
18337
18338 if (Res.getNode()) {
18339 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
18340 // Capture demanded bits information that would otherwise be lost.
18341 if (Known.Zero == 0xfffffffe)
18342 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18343 DAG.getValueType(MVT::i1));
18344 else if (Known.Zero == 0xffffff00)
18345 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18346 DAG.getValueType(MVT::i8));
18347 else if (Known.Zero == 0xffff0000)
18348 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18349 DAG.getValueType(MVT::i16));
18350 }
18351
18352 return Res;
18353}
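// Scalar sketches of the two branchless "x == y" idioms used in the combine
// above (illustrative only; EqViaClz and EqViaCarry are hypothetical helpers,
// not LLVM APIs).
static uint32_t EqViaClz(uint32_t X, uint32_t Y) {
  uint32_t Diff = X - Y;
  unsigned Clz = 0;                          // count leading zeros of Diff
  for (uint32_t M = 0x80000000u; M && !(Diff & M); M >>= 1)
    ++Clz;                                   // Clz == 32 exactly when Diff == 0
  return Clz >> 5;                           // 1 if x == y, else 0
}
static uint32_t EqViaCarry(uint32_t X, uint32_t Y) {
  uint32_t Sub = X - Y;
  uint32_t Carry = (Sub == 0);               // 0 - Sub borrows iff Sub != 0
  return Sub + (0u - Sub) + Carry;           // wraps to Carry, i.e. (x == y)
}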
18354
18355static SDValue PerformBITCASTCombine(SDNode *N,
18356 TargetLowering::DAGCombinerInfo &DCI,
18357 const ARMSubtarget *ST) {
18358 SelectionDAG &DAG = DCI.DAG;
18359 SDValue Src = N->getOperand(0);
18360 EVT DstVT = N->getValueType(0);
18361
18362 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18363 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18364 EVT SrcVT = Src.getValueType();
18365 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18366 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
18367 }
18368
18369 // We may have a bitcast of something that has already had this bitcast
18370 // combine performed on it, so skip past any VECTOR_REG_CASTs.
18371 if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
18372 Src.getOperand(0).getValueType().getScalarSizeInBits() <=
18373 Src.getValueType().getScalarSizeInBits())
18374 Src = Src.getOperand(0);
18375
18376 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18377 // would be generated is at least the width of the element type.
18378 EVT SrcVT = Src.getValueType();
18379 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18380 Src.getOpcode() == ARMISD::VMVNIMM ||
18381 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18382 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18383 DAG.getDataLayout().isBigEndian())
18384 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
18385
18386 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18387 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18388 return R;
18389
18390 return SDValue();
18391}
18392
18393// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
18394// node into stack operations after legalizeOps.
18395static SDValue PerformMVETruncCombine(SDNode *N,
18396 TargetLowering::DAGCombinerInfo &DCI) {
18397 SelectionDAG &DAG = DCI.DAG;
18398 EVT VT = N->getValueType(0);
18399 SDLoc DL(N);
18400
18401 // MVETrunc(Undef, Undef) -> Undef
18402 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
18403 return DAG.getUNDEF(VT);
18404
18405 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
18406 if (N->getNumOperands() == 2 &&
18407 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
18408 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
18409 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
18410 N->getOperand(0).getOperand(1),
18411 N->getOperand(1).getOperand(0),
18412 N->getOperand(1).getOperand(1));
18413
18414 // MVETrunc(shuffle, shuffle) -> VMOVN
18415 if (N->getNumOperands() == 2 &&
18416 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18417 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18418 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
18419 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
18420
18421 if (S0->getOperand(0) == S1->getOperand(0) &&
18422 S0->getOperand(1) == S1->getOperand(1)) {
18423 // Construct complete shuffle mask
18424 SmallVector<int, 8> Mask(S0->getMask());
18425 Mask.append(S1->getMask().begin(), S1->getMask().end());
18426
18427 if (isVMOVNTruncMask(Mask, VT, false))
18428 return DAG.getNode(
18429 ARMISD::VMOVN, DL, VT,
18430 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18431 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18432 DAG.getConstant(1, DL, MVT::i32));
18433 if (isVMOVNTruncMask(Mask, VT, true))
18434 return DAG.getNode(
18435 ARMISD::VMOVN, DL, VT,
18436 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18437 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18438 DAG.getConstant(1, DL, MVT::i32));
18439 }
18440 }
18441
18442 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18443 // truncate to a buildvector to allow the generic optimisations to kick in.
18444 if (all_of(N->ops(), [](SDValue Op) {
18445 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18446 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18447 (Op.getOpcode() == ISD::BITCAST &&
18448 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
18449 })) {
18450 SmallVector<SDValue, 8> Extracts;
18451 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18452 SDValue O = N->getOperand(Op);
18453 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18454 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18455 DAG.getConstant(i, DL, MVT::i32));
18456 Extracts.push_back(Ext);
18457 }
18458 }
18459 return DAG.getBuildVector(VT, DL, Extracts);
18460 }
18461
18462 // If we are late in the legalization process and nothing has optimised
18463 // the trunc to anything better, lower it to a stack store and reload,
18464 // performing the truncation whilst keeping the lanes in the correct order:
18465 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
18466 if (!DCI.isAfterLegalizeDAG())
18467 return SDValue();
18468
18469 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18470 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18471 int NumIns = N->getNumOperands();
18472 assert((NumIns == 2 || NumIns == 4) &&
18473 "Expected 2 or 4 inputs to an MVETrunc");
18474 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18475 if (N->getNumOperands() == 4)
18476 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
18477
18478 SmallVector<SDValue> Chains;
18479 for (int I = 0; I < NumIns; I++) {
18480 SDValue Ptr = DAG.getNode(
18481 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18482 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
18483 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18484 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
18485 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
18486 Ptr, MPI, StoreVT, Align(4));
18487 Chains.push_back(Ch);
18488 }
18489
18490 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18491 MachinePointerInfo MPI =
18492 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
18493 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
18494}
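// Scalar model of the stack lowering above (illustrative only; the helper is
// hypothetical): truncating each input lane to half width and storing the two
// inputs back to back yields, on reload, a single vector whose lanes keep
// their original order.
static void MveTruncModel(const uint32_t A[4], const uint32_t B[4],
                          uint16_t Out[8]) {
  for (unsigned I = 0; I < 4; ++I)
    Out[I] = (uint16_t)A[I];     // truncating store of a at the stack base
  for (unsigned I = 0; I < 4; ++I)
    Out[4 + I] = (uint16_t)B[I]; // truncating store of b at base + 8 bytes
}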
18495
18496// Take an MVEEXT(load x) and split that into (extload x, extload x+8)
18497static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,
18498 SelectionDAG &DAG) {
18499 SDValue N0 = N->getOperand(0);
18500 LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());
18501 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18502 return SDValue();
18503
18504 EVT FromVT = LD->getMemoryVT();
18505 EVT ToVT = N->getValueType(0);
18506 if (!ToVT.isVector())
18507 return SDValue();
18508 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18509 EVT ToEltVT = ToVT.getVectorElementType();
18510 EVT FromEltVT = FromVT.getVectorElementType();
18511
18512 unsigned NumElements = 0;
18513 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18514 NumElements = 4;
18515 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18516 NumElements = 8;
18517 assert(NumElements != 0);
18518
18519 ISD::LoadExtType NewExtType =
18520 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18521 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18522 LD->getExtensionType() != ISD::EXTLOAD &&
18523 LD->getExtensionType() != NewExtType)
18524 return SDValue();
18525
18526 LLVMContext &C = *DAG.getContext();
18527 SDLoc DL(LD);
18528 // Details about the old load
18529 SDValue Ch = LD->getChain();
18530 SDValue BasePtr = LD->getBasePtr();
18531 Align Alignment = LD->getBaseAlign();
18532 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18533 AAMDNodes AAInfo = LD->getAAInfo();
18534
18535 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
18536 EVT NewFromVT = EVT::getVectorVT(
18537 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
18538 EVT NewToVT = EVT::getVectorVT(
18539 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
18540
18541 SmallVector<SDValue, 4> Loads;
18542 SmallVector<SDValue, 4> Chains;
18543 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
18544 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18545 SDValue NewPtr =
18546 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
18547
18548 SDValue NewLoad =
18549 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
18550 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
18551 Alignment, MMOFlags, AAInfo);
18552 Loads.push_back(NewLoad);
18553 Chains.push_back(SDValue(NewLoad.getNode(), 1));
18554 }
18555
18556 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18557 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18558 return DAG.getMergeValues(Loads, DL);
18559}
18560
18561// Perform combines for MVEEXT. If it has not been optimized to anything better
18562// before lowering, it gets converted to a stack store and extloads performing
18563// the extend whilst still keeping the same lane ordering.
18564static SDValue PerformMVEExtCombine(SDNode *N,
18565 TargetLowering::DAGCombinerInfo &DCI) {
18566 SelectionDAG &DAG = DCI.DAG;
18567 EVT VT = N->getValueType(0);
18568 SDLoc DL(N);
18569 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18570 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18571
18572 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18573 *DAG.getContext());
18574 auto Extend = [&](SDValue V) {
18575 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
18576 return N->getOpcode() == ARMISD::MVESEXT
18577 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
18578 DAG.getValueType(ExtVT))
18579 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
18580 };
18581
18582 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
18583 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
18584 SDValue Ext = Extend(N->getOperand(0));
18585 return DAG.getMergeValues({Ext, Ext}, DL);
18586 }
18587
18588 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18589 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
18590 ArrayRef<int> Mask = SVN->getMask();
18591 assert(Mask.size() == 2 * VT.getVectorNumElements());
18592 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18593 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18594 SDValue Op0 = SVN->getOperand(0);
18595 SDValue Op1 = SVN->getOperand(1);
18596
18597 auto CheckInregMask = [&](int Start, int Offset) {
18598 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18599 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18600 return false;
18601 return true;
18602 };
18603 SDValue V0 = SDValue(N, 0);
18604 SDValue V1 = SDValue(N, 1);
18605 if (CheckInregMask(0, 0))
18606 V0 = Extend(Op0);
18607 else if (CheckInregMask(0, 1))
18608 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18609 else if (CheckInregMask(0, Mask.size()))
18610 V0 = Extend(Op1);
18611 else if (CheckInregMask(0, Mask.size() + 1))
18612 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18613
18614 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
18615 V1 = Extend(Op1);
18616 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
18617 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18618 else if (CheckInregMask(VT.getVectorNumElements(), 0))
18619 V1 = Extend(Op0);
18620 else if (CheckInregMask(VT.getVectorNumElements(), 1))
18621 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18622
18623 if (V0.getNode() != N || V1.getNode() != N)
18624 return DAG.getMergeValues({V0, V1}, DL);
18625 }
18626
18627 // MVEEXT(load) -> extload, extload
18628 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
18629 if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))
18630 return L;
18631
18632 if (!DCI.isAfterLegalizeDAG())
18633 return SDValue();
18634
18635 // Lower to a stack store and reload:
18636 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
18637 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18638 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18639 int NumOuts = N->getNumValues();
18640 assert((NumOuts == 2 || NumOuts == 4) &&
18641 "Expected 2 or 4 outputs to an MVEEXT");
18642 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18643 *DAG.getContext());
18644 if (N->getNumOperands() == 4)
18645 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
18646
18647 MachinePointerInfo MPI =
18648 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
18649 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
18650 StackPtr, MPI, Align(4));
18651
18652 SmallVector<SDValue> Loads;
18653 for (int I = 0; I < NumOuts; I++) {
18654 SDValue Ptr = DAG.getNode(
18655 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18656 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
18657 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18658 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
18659 SDValue Load = DAG.getExtLoad(
18660 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
18661 VT, Chain, Ptr, MPI, LoadVT, Align(4));
18662 Loads.push_back(Load);
18663 }
18664
18665 return DAG.getMergeValues(Loads, DL);
18666}
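// Matching scalar model for the signed case of the MVEEXT stack lowering above
// (illustrative only; the helper is hypothetical): a full-width store followed
// by two half-width extending loads produces the two output vectors with the
// lanes still in their original order.
static void MveSextModel(const int16_t In[8], int32_t Lo[4], int32_t Hi[4]) {
  for (unsigned I = 0; I < 4; ++I) {
    Lo[I] = In[I];     // extending load from the stack base
    Hi[I] = In[4 + I]; // extending load from base + 8 bytes
  }
}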
18667
18668SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
18669 DAGCombinerInfo &DCI) const {
18670 switch (N->getOpcode()) {
18671 default: break;
18672 case ISD::SELECT_CC:
18673 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
18674 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
18675 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
18676 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
18677 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
18678 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
18679 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
18680 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
18681 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
18682 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
18683 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
18684 case ISD::BRCOND:
18685 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
18686 case ARMISD::ADDC:
18687 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
18688 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
18689 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
18690 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
18691 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
18692 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
18693 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
18694 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
18695 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
18696 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
18697 case ISD::EXTRACT_VECTOR_ELT:
18698 return PerformExtractEltCombine(N, DCI, Subtarget);
18702 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
18703 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
18704 case ISD::FP_TO_SINT:
18705 case ISD::FP_TO_UINT:
18706 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
18707 case ISD::FADD:
18708 return PerformFADDCombine(N, DCI.DAG, Subtarget);
18709 case ISD::FMUL:
18710 return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
18711 case ISD::INTRINSIC_WO_CHAIN:
18712 return PerformIntrinsicCombine(N, DCI);
18713 case ISD::SHL:
18714 case ISD::SRA:
18715 case ISD::SRL:
18716 return PerformShiftCombine(N, DCI, Subtarget);
18717 case ISD::SIGN_EXTEND:
18718 case ISD::ZERO_EXTEND:
18719 case ISD::ANY_EXTEND:
18720 return PerformExtendCombine(N, DCI.DAG, Subtarget);
18721 case ISD::FP_EXTEND:
18722 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
18723 case ISD::SMIN:
18724 case ISD::UMIN:
18725 case ISD::SMAX:
18726 case ISD::UMAX:
18727 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
18728 case ARMISD::CMOV:
18729 return PerformCMOVCombine(N, DCI.DAG);
18730 case ARMISD::BRCOND:
18731 return PerformBRCONDCombine(N, DCI.DAG);
18732 case ARMISD::CMPZ:
18733 return PerformCMPZCombine(N, DCI.DAG);
18734 case ARMISD::CSINC:
18735 case ARMISD::CSINV:
18736 case ARMISD::CSNEG:
18737 return PerformCSETCombine(N, DCI.DAG);
18738 case ISD::LOAD:
18739 return PerformLOADCombine(N, DCI, Subtarget);
18740 case ARMISD::VLD1DUP:
18741 case ARMISD::VLD2DUP:
18742 case ARMISD::VLD3DUP:
18743 case ARMISD::VLD4DUP:
18744 return PerformVLDCombine(N, DCI);
18745 case ARMISD::BUILD_VECTOR:
18746 return PerformARMBUILD_VECTORCombine(N, DCI);
18747 case ISD::BITCAST:
18748 return PerformBITCASTCombine(N, DCI, Subtarget);
18749 case ARMISD::PREDICATE_CAST:
18750 return PerformPREDICATE_CASTCombine(N, DCI);
18751 case ARMISD::VECTOR_REG_CAST:
18752 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
18753 case ARMISD::MVETRUNC:
18754 return PerformMVETruncCombine(N, DCI);
18755 case ARMISD::MVESEXT:
18756 case ARMISD::MVEZEXT:
18757 return PerformMVEExtCombine(N, DCI);
18758 case ARMISD::VCMP:
18759 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
18760 case ISD::VECREDUCE_ADD:
18761 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
18762 case ARMISD::VADDVs:
18763 case ARMISD::VADDVu:
18764 case ARMISD::VADDLVs:
18765 case ARMISD::VADDLVu:
18766 case ARMISD::VADDLVAs:
18767 case ARMISD::VADDLVAu:
18768 case ARMISD::VMLAVs:
18769 case ARMISD::VMLAVu:
18770 case ARMISD::VMLALVs:
18771 case ARMISD::VMLALVu:
18772 case ARMISD::VMLALVAs:
18773 case ARMISD::VMLALVAu:
18774 return PerformReduceShuffleCombine(N, DCI.DAG);
18775 case ARMISD::VMOVN:
18776 return PerformVMOVNCombine(N, DCI);
18777 case ARMISD::VQMOVNs:
18778 case ARMISD::VQMOVNu:
18779 return PerformVQMOVNCombine(N, DCI);
18780 case ARMISD::VQDMULH:
18781 return PerformVQDMULHCombine(N, DCI);
18782 case ARMISD::ASRL:
18783 case ARMISD::LSRL:
18784 case ARMISD::LSLL:
18785 return PerformLongShiftCombine(N, DCI.DAG);
18786 case ARMISD::SMULWB: {
18787 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18788 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
18789 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
18790 return SDValue();
18791 break;
18792 }
18793 case ARMISD::SMULWT: {
18794 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18795 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
18796 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
18797 return SDValue();
18798 break;
18799 }
18800 case ARMISD::SMLALBB:
18801 case ARMISD::QADD16b:
18802 case ARMISD::QSUB16b:
18803 case ARMISD::UQADD16b:
18804 case ARMISD::UQSUB16b: {
18805 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18806 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
18807 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
18808 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
18809 return SDValue();
18810 break;
18811 }
18812 case ARMISD::SMLALBT: {
18813 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
18814 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
18815 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
18816 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
18817 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
18818 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
18819 return SDValue();
18820 break;
18821 }
18822 case ARMISD::SMLALTB: {
18823 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
18824 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
18825 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
18826 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
18827 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
18828 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
18829 return SDValue();
18830 break;
18831 }
18832 case ARMISD::SMLALTT: {
18833 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18834 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
18835 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
18836 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
18837 return SDValue();
18838 break;
18839 }
18840 case ARMISD::QADD8b:
18841 case ARMISD::QSUB8b:
18842 case ARMISD::UQADD8b:
18843 case ARMISD::UQSUB8b: {
18844 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18845 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
18846 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
18847 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
18848 return SDValue();
18849 break;
18850 }
18851 case ARMISD::VBSP:
18852 if (N->getOperand(1) == N->getOperand(2))
18853 return N->getOperand(1);
18854 return SDValue();
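  // The NEON structured load/store intrinsics (vld1..vld4, vst1..vst4 and
  // their lane/dup variants) and the MVE vld2q/vld4q/vst2q/vst4q intrinsics
  // below are routed through the VLD/MVE-VLD combines, whose main job is to
  // fold a following pointer increment into the write-back (post-indexed)
  // form of the instruction, e.g. turning a vld1 plus "add r0, r0, #16" into
  // "vld1.32 {d16,d17}, [r0]!" (illustrative).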
18855  case ISD::INTRINSIC_VOID:
18856  case ISD::INTRINSIC_W_CHAIN:
18857    switch (N->getConstantOperandVal(1)) {
18858 case Intrinsic::arm_neon_vld1:
18859 case Intrinsic::arm_neon_vld1x2:
18860 case Intrinsic::arm_neon_vld1x3:
18861 case Intrinsic::arm_neon_vld1x4:
18862 case Intrinsic::arm_neon_vld2:
18863 case Intrinsic::arm_neon_vld3:
18864 case Intrinsic::arm_neon_vld4:
18865 case Intrinsic::arm_neon_vld2lane:
18866 case Intrinsic::arm_neon_vld3lane:
18867 case Intrinsic::arm_neon_vld4lane:
18868 case Intrinsic::arm_neon_vld2dup:
18869 case Intrinsic::arm_neon_vld3dup:
18870 case Intrinsic::arm_neon_vld4dup:
18871 case Intrinsic::arm_neon_vst1:
18872 case Intrinsic::arm_neon_vst1x2:
18873 case Intrinsic::arm_neon_vst1x3:
18874 case Intrinsic::arm_neon_vst1x4:
18875 case Intrinsic::arm_neon_vst2:
18876 case Intrinsic::arm_neon_vst3:
18877 case Intrinsic::arm_neon_vst4:
18878 case Intrinsic::arm_neon_vst2lane:
18879 case Intrinsic::arm_neon_vst3lane:
18880 case Intrinsic::arm_neon_vst4lane:
18881 return PerformVLDCombine(N, DCI);
18882 case Intrinsic::arm_mve_vld2q:
18883 case Intrinsic::arm_mve_vld4q:
18884 case Intrinsic::arm_mve_vst2q:
18885 case Intrinsic::arm_mve_vst4q:
18886 return PerformMVEVLDCombine(N, DCI);
18887 default: break;
18888 }
18889 break;
18890 }
18891 return SDValue();
18892}
18893
18894 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
18895                                                           EVT VT) const {
18896 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
18897}
18898
18899 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
18900                                                        Align Alignment,
18901                                                        MachineMemOperand::Flags,
18902                                                        unsigned *Fast) const {
18903 // Depends what it gets converted into if the type is weird.
18904 if (!VT.isSimple())
18905 return false;
18906
18907 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
18908 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
18909 auto Ty = VT.getSimpleVT().SimpleTy;
18910
18911 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
18912     // Unaligned access can use (for example) LDRB, LDRH, LDR
18913 if (AllowsUnaligned) {
18914 if (Fast)
18915 *Fast = Subtarget->hasV7Ops();
18916 return true;
18917 }
18918 }
18919
18920 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
18921     // For any little-endian target with NEON, we can support unaligned ld/st
18922     // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
18923     // A big-endian target may also explicitly support unaligned accesses.
18924 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
18925 if (Fast)
18926 *Fast = 1;
18927 return true;
18928 }
18929 }
18930
18931 if (!Subtarget->hasMVEIntegerOps())
18932 return false;
18933
18934 // These are for predicates
18935 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
18936 Ty == MVT::v2i1)) {
18937 if (Fast)
18938 *Fast = 1;
18939 return true;
18940 }
18941
18942 // These are for truncated stores/narrowing loads. They are fine so long as
18943 // the alignment is at least the size of the item being loaded
18944 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
18945 Alignment >= VT.getScalarSizeInBits() / 8) {
18946 if (Fast)
18947 *Fast = true;
18948 return true;
18949 }
18950
18951 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
18952 // VSTRW.U32 all store the vector register in exactly the same format, and
18953 // differ only in the range of their immediate offset field and the required
18954 // alignment. So there is always a store that can be used, regardless of
18955 // actual type.
18956 //
18957   // For big endian, that is not the case, but we can still emit a
18958   // (VSTRB.U8; VREV64.8) pair and get the same effect. This will likely be
18959   // better than aligning the vector through the stack.
18960 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
18961 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
18962 Ty == MVT::v2f64) {
18963 if (Fast)
18964 *Fast = 1;
18965 return true;
18966 }
18967
18968 return false;
18969}
18970
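// Note on the memop type choice below: with NEON available (and implicit FP
// use allowed), a memcpy/zero-memset of 16 bytes or more is given type v2f64
// so the inline expansion uses 128-bit vector loads/stores, falling back to
// f64 (64-bit D-register) copies for 8..15 bytes. E.g. a 32-byte aligned copy
// would typically become two vector load/store pairs rather than a run of
// word-sized LDR/STR (illustrative sketch of the effect).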
18971 EVT ARMTargetLowering::getOptimalMemOpType(
18972     LLVMContext &Context, const MemOp &Op,
18973 const AttributeList &FuncAttributes) const {
18974 // See if we can use NEON instructions for this...
18975 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
18976 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
18977 unsigned Fast;
18978 if (Op.size() >= 16 &&
18979 (Op.isAligned(Align(16)) ||
18980 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
18981                                            MachineMemOperand::MONone, &Fast) &&
18982           Fast))) {
18983 return MVT::v2f64;
18984 } else if (Op.size() >= 8 &&
18985 (Op.isAligned(Align(8)) ||
18986                (allowsMisalignedMemoryAccesses(
18987                     MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
18988 Fast))) {
18989 return MVT::f64;
18990 }
18991 }
18992
18993 // Let the target-independent logic figure it out.
18994 return MVT::Other;
18995}
18996
18997// 64-bit integers are split into their high and low parts and held in two
18998// different registers, so the trunc is free since the low register can just
18999// be used.
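// For example, truncating an i64 that lives in a GPR pair down to i32 simply
// keeps using the register that already holds the low word; no instruction
// needs to be emitted.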
19000bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19001 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19002 return false;
19003 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19004 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19005 return (SrcBits == 64 && DestBits == 32);
19006}
19007
19008 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
19009   if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19010 !DstVT.isInteger())
19011 return false;
19012 unsigned SrcBits = SrcVT.getSizeInBits();
19013 unsigned DestBits = DstVT.getSizeInBits();
19014 return (SrcBits == 64 && DestBits == 32);
19015}
19016
19017 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
19018   if (Val.getOpcode() != ISD::LOAD)
19019 return false;
19020
19021 EVT VT1 = Val.getValueType();
19022 if (!VT1.isSimple() || !VT1.isInteger() ||
19023 !VT2.isSimple() || !VT2.isInteger())
19024 return false;
19025
19026 switch (VT1.getSimpleVT().SimpleTy) {
19027 default: break;
19028 case MVT::i1:
19029 case MVT::i8:
19030 case MVT::i16:
19031 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19032 return true;
19033 }
19034
19035 return false;
19036}
19037
19038 bool ARMTargetLowering::isFNegFree(EVT VT) const {
19039   if (!VT.isSimple())
19040 return false;
19041
19042   // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19043   // can negate values directly (fneg is free), so we don't want the DAG
19044   // combiner to rewrite fneg into xors and other instructions. For f16 and
19045   // FullFP16 argument passing, some bitcast nodes may be introduced,
19046   // triggering this DAG combine rewrite, so we avoid that here.
19047 switch (VT.getSimpleVT().SimpleTy) {
19048 default: break;
19049 case MVT::f16:
19050 return Subtarget->hasFullFP16();
19051 }
19052
19053 return false;
19054}
19055
19056 Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
19057   if (!Subtarget->hasMVEIntegerOps())
19058 return nullptr;
19059 Type *SVIType = SVI->getType();
19060 Type *ScalarType = SVIType->getScalarType();
19061
19062 if (ScalarType->isFloatTy())
19063 return Type::getInt32Ty(SVIType->getContext());
19064 if (ScalarType->isHalfTy())
19065 return Type::getInt16Ty(SVIType->getContext());
19066 return nullptr;
19067}
19068
19069 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
19070   EVT VT = ExtVal.getValueType();
19071
19072 if (!isTypeLegal(VT))
19073 return false;
19074
19075 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19076 if (Ld->isExpandingLoad())
19077 return false;
19078 }
19079
19080 if (Subtarget->hasMVEIntegerOps())
19081 return true;
19082
19083 // Don't create a loadext if we can fold the extension into a wide/long
19084 // instruction.
19085 // If there's more than one user instruction, the loadext is desirable no
19086 // matter what. There can be two uses by the same instruction.
19087 if (ExtVal->use_empty() ||
19088 !ExtVal->user_begin()->isOnlyUserOf(ExtVal.getNode()))
19089 return true;
19090
19091 SDNode *U = *ExtVal->user_begin();
19092 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19093 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19094 return false;
19095
19096 return true;
19097}
19098
19099 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
19100   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19101 return false;
19102
19103 if (!isTypeLegal(EVT::getEVT(Ty1)))
19104 return false;
19105
19106 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19107
19108 // Assuming the caller doesn't have a zeroext or signext return parameter,
19109 // truncation all the way down to i1 is valid.
19110 return true;
19111}
19112
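// For example, on an MVE+FP target an @llvm.fmuladd.v4f32 call is expected to
// select to a single vfma.f32 on a Q register rather than a separate
// vmul/vadd pair (illustrative; the exact per-type conditions are in the hook
// below).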
19113/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19114/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19115/// expanded to FMAs when this method returns true, otherwise fmuladd is
19116/// expanded to fmul + fadd.
19117///
19118/// ARM supports both fused and unfused multiply-add operations; we already
19119/// lower a pair of fmul and fadd to the latter so it's not clear that there
19120/// would be a gain or that the gain would be worthwhile enough to risk
19121/// correctness bugs.
19122///
19123/// For MVE, we set this to true as it helps simplify the need for some
19124/// patterns (and we don't have the non-fused floating point instruction).
19125bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19126 EVT VT) const {
19127 if (Subtarget->useSoftFloat())
19128 return false;
19129
19130 if (!VT.isSimple())
19131 return false;
19132
19133 switch (VT.getSimpleVT().SimpleTy) {
19134 case MVT::v4f32:
19135 case MVT::v8f16:
19136 return Subtarget->hasMVEFloatOps();
19137 case MVT::f16:
19138 return Subtarget->useFPVFMx16();
19139 case MVT::f32:
19140 return Subtarget->useFPVFMx();
19141 case MVT::f64:
19142 return Subtarget->useFPVFMx64();
19143 default:
19144 break;
19145 }
19146
19147 return false;
19148}
19149
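// Thumb1 load/store offsets are 5-bit immediates scaled by the access size:
// roughly #0..31 for byte accesses, #0..62 in steps of 2 for halfwords and
// #0..124 in steps of 4 for words, which is what the uimm5 * Scale check
// below models.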
19150static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19151 if (V < 0)
19152 return false;
19153
19154 unsigned Scale = 1;
19155 switch (VT.getSimpleVT().SimpleTy) {
19156 case MVT::i1:
19157 case MVT::i8:
19158 // Scale == 1;
19159 break;
19160 case MVT::i16:
19161 // Scale == 2;
19162 Scale = 2;
19163 break;
19164 default:
19165 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19166 // Scale == 4;
19167 Scale = 4;
19168 break;
19169 }
19170
19171 if ((V & (Scale - 1)) != 0)
19172 return false;
19173 return isUInt<5>(V / Scale);
19174}
19175
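// Rough summary of the Thumb2 ranges checked below: MVE loads/stores take a
// 7-bit immediate scaled by the element size (e.g. multiples of 4 up to 508
// for 32-bit elements), VLDR/LDRD take an 8-bit immediate scaled by 4 (up to
// 1020), and ordinary integer accesses take +imm12 (0..4095) or a negative
// offset down to -255 (imm8).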
19176static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19177 const ARMSubtarget *Subtarget) {
19178 if (!VT.isInteger() && !VT.isFloatingPoint())
19179 return false;
19180 if (VT.isVector() && Subtarget->hasNEON())
19181 return false;
19182 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19183 !Subtarget->hasMVEFloatOps())
19184 return false;
19185
19186 bool IsNeg = false;
19187 if (V < 0) {
19188 IsNeg = true;
19189 V = -V;
19190 }
19191
19192 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19193
19194 // MVE: size * imm7
19195 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19196 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19197 case MVT::i32:
19198 case MVT::f32:
19199 return isShiftedUInt<7,2>(V);
19200 case MVT::i16:
19201 case MVT::f16:
19202 return isShiftedUInt<7,1>(V);
19203 case MVT::i8:
19204 return isUInt<7>(V);
19205 default:
19206 return false;
19207 }
19208 }
19209
19210 // half VLDR: 2 * imm8
19211 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19212 return isShiftedUInt<8, 1>(V);
19213 // VLDR and LDRD: 4 * imm8
19214 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19215 return isShiftedUInt<8, 2>(V);
19216
19217 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19218 // + imm12 or - imm8
19219 if (IsNeg)
19220 return isUInt<8>(V);
19221 return isUInt<12>(V);
19222 }
19223
19224 return false;
19225}
19226
19227/// isLegalAddressImmediate - Return true if the integer value can be used
19228/// as the offset of the target addressing mode for load / store of the
19229/// given type.
19230static bool isLegalAddressImmediate(int64_t V, EVT VT,
19231 const ARMSubtarget *Subtarget) {
19232 if (V == 0)
19233 return true;
19234
19235 if (!VT.isSimple())
19236 return false;
19237
19238 if (Subtarget->isThumb1Only())
19239 return isLegalT1AddressImmediate(V, VT);
19240 else if (Subtarget->isThumb2())
19241 return isLegalT2AddressImmediate(V, VT, Subtarget);
19242
19243 // ARM mode.
19244 if (V < 0)
19245 V = - V;
19246 switch (VT.getSimpleVT().SimpleTy) {
19247 default: return false;
19248 case MVT::i1:
19249 case MVT::i8:
19250 case MVT::i32:
19251 // +- imm12
19252 return isUInt<12>(V);
19253 case MVT::i16:
19254 // +- imm8
19255 return isUInt<8>(V);
19256 case MVT::f32:
19257 case MVT::f64:
19258 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19259 return false;
19260 return isShiftedUInt<8, 2>(V);
19261 }
19262}
19263
19264 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
19265                                                       EVT VT) const {
19266 int Scale = AM.Scale;
19267 if (Scale < 0)
19268 return false;
19269
19270 switch (VT.getSimpleVT().SimpleTy) {
19271 default: return false;
19272 case MVT::i1:
19273 case MVT::i8:
19274 case MVT::i16:
19275 case MVT::i32:
19276 if (Scale == 1)
19277 return true;
19278 // r + r << imm
19279 Scale = Scale & ~1;
19280 return Scale == 2 || Scale == 4 || Scale == 8;
19281 case MVT::i64:
19282 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19283 // version in Thumb mode.
19284 // r + r
19285 if (Scale == 1)
19286 return true;
19287 // r * 2 (this can be lowered to r + r).
19288 if (!AM.HasBaseReg && Scale == 2)
19289 return true;
19290 return false;
19291 case MVT::isVoid:
19292 // Note, we allow "void" uses (basically, uses that aren't loads or
19293 // stores), because arm allows folding a scale into many arithmetic
19294 // operations. This should be made more precise and revisited later.
19295
19296 // Allow r << imm, but the imm has to be a multiple of two.
19297 if (Scale & 1) return false;
19298 return isPowerOf2_32(Scale);
19299 }
19300}
19301
19302 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
19303                                                       EVT VT) const {
19304 const int Scale = AM.Scale;
19305
19306 // Negative scales are not supported in Thumb1.
19307 if (Scale < 0)
19308 return false;
19309
19310 // Thumb1 addressing modes do not support register scaling excepting the
19311 // following cases:
19312 // 1. Scale == 1 means no scaling.
19313 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19314 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19315}
19316
19317/// isLegalAddressingMode - Return true if the addressing mode represented
19318/// by AM is legal for this target, for a load/store of the specified type.
19319 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
19320                                               const AddrMode &AM, Type *Ty,
19321 unsigned AS, Instruction *I) const {
19322 EVT VT = getValueType(DL, Ty, true);
19323 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19324 return false;
19325
19326 // Can never fold addr of global into load/store.
19327 if (AM.BaseGV)
19328 return false;
19329
19330 switch (AM.Scale) {
19331 case 0: // no scale reg, must be "r+i" or "r", or "i".
19332 break;
19333 default:
19334 // ARM doesn't support any R+R*scale+imm addr modes.
19335 if (AM.BaseOffs)
19336 return false;
19337
19338 if (!VT.isSimple())
19339 return false;
19340
19341 if (Subtarget->isThumb1Only())
19342 return isLegalT1ScaledAddressingMode(AM, VT);
19343
19344 if (Subtarget->isThumb2())
19345 return isLegalT2ScaledAddressingMode(AM, VT);
19346
19347 int Scale = AM.Scale;
19348 switch (VT.getSimpleVT().SimpleTy) {
19349 default: return false;
19350 case MVT::i1:
19351 case MVT::i8:
19352 case MVT::i32:
19353 if (Scale < 0) Scale = -Scale;
19354 if (Scale == 1)
19355 return true;
19356 // r + r << imm
19357 return isPowerOf2_32(Scale & ~1);
19358 case MVT::i16:
19359 case MVT::i64:
19360 // r +/- r
19361 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19362 return true;
19363 // r * 2 (this can be lowered to r + r).
19364 if (!AM.HasBaseReg && Scale == 2)
19365 return true;
19366 return false;
19367
19368 case MVT::isVoid:
19369 // Note, we allow "void" uses (basically, uses that aren't loads or
19370 // stores), because arm allows folding a scale into many arithmetic
19371 // operations. This should be made more precise and revisited later.
19372
19373 // Allow r << imm, but the imm has to be a multiple of two.
19374 if (Scale & 1) return false;
19375 return isPowerOf2_32(Scale);
19376 }
19377 }
19378 return true;
19379}
19380
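// For example, "cmp r0, #-10" has no encoding, but the same comparison can be
// emitted as "cmn r0, #10"; that is why the negated immediate is also tested
// below for ARM and Thumb2.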
19381/// isLegalICmpImmediate - Return true if the specified immediate is a legal
19382/// icmp immediate, that is, the target has icmp instructions which can compare
19383/// a register against the immediate without having to materialize the
19384/// immediate into a register.
19385 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
19386   // Thumb2 and ARM modes can use cmn for negative immediates.
19387 if (!Subtarget->isThumb())
19388 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19389 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19390 if (Subtarget->isThumb2())
19391 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19392 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19393 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19394 return Imm >= 0 && Imm <= 255;
19395}
19396
19397/// isLegalAddImmediate - Return true if the specified immediate is a legal add
19398/// *or sub* immediate, that is, the target has add or sub instructions which
19399/// can combine a register with the immediate without having to materialize the
19400/// immediate into a register.
19401 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
19402   // Same encoding for add/sub, just flip the sign.
19403 uint64_t AbsImm = AbsoluteValue(Imm);
19404 if (!Subtarget->isThumb())
19405 return ARM_AM::getSOImmVal(AbsImm) != -1;
19406 if (Subtarget->isThumb2())
19407 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19408 // Thumb1 only has 8-bit unsigned immediate.
19409 return AbsImm <= 255;
19410}
19411
19412// Return false to prevent folding
19413// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19414// if the folding leads to worse code.
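// For instance, if c0 is a legal ADD immediate but c0*c1 would need a
// movw+movt pair or a constant-pool load to materialize, keeping the original
// (mul (add r, c0), c1) form is usually cheaper; the materialization-cost
// check below is the precise test.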
19415 bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode,
19416                                                     SDValue ConstNode) const {
19417 // Let the DAGCombiner decide for vector types and large types.
19418 const EVT VT = AddNode.getValueType();
19419 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19420 return true;
19421
19422 // It is worse if c0 is legal add immediate, while c1*c0 is not
19423 // and has to be composed by at least two instructions.
19424 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19425 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19426 const int64_t C0 = C0Node->getSExtValue();
19427 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
19428   if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue()))
19429     return true;
19430 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19431 return false;
19432
19433 // Default to true and let the DAGCombiner decide.
19434 return true;
19435}
19436
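// The helpers below split "ptr +/- offset" into a base and an offset that fit
// one of the ARM indexed addressing modes, so that a load/store plus its
// pointer update can be selected as a single pre- or post-indexed
// instruction, e.g. "ldr r0, [r1, #4]!" (pre-indexed) or "ldr r0, [r1], #4"
// (post-indexed).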
19437 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
19438                                       bool isSEXTLoad, SDValue &Base,
19439 SDValue &Offset, bool &isInc,
19440 SelectionDAG &DAG) {
19441 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19442 return false;
19443
19444 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19445 // AddressingMode 3
19446 Base = Ptr->getOperand(0);
19447     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19448       int RHSC = (int)RHS->getZExtValue();
19449 if (RHSC < 0 && RHSC > -256) {
19450 assert(Ptr->getOpcode() == ISD::ADD);
19451 isInc = false;
19452 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19453 return true;
19454 }
19455 }
19456 isInc = (Ptr->getOpcode() == ISD::ADD);
19457 Offset = Ptr->getOperand(1);
19458 return true;
19459 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19460 // AddressingMode 2
19461     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19462       int RHSC = (int)RHS->getZExtValue();
19463 if (RHSC < 0 && RHSC > -0x1000) {
19464 assert(Ptr->getOpcode() == ISD::ADD);
19465 isInc = false;
19466 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19467 Base = Ptr->getOperand(0);
19468 return true;
19469 }
19470 }
19471
19472 if (Ptr->getOpcode() == ISD::ADD) {
19473 isInc = true;
19474 ARM_AM::ShiftOpc ShOpcVal=
19475         ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
19476       if (ShOpcVal != ARM_AM::no_shift) {
19477 Base = Ptr->getOperand(1);
19478 Offset = Ptr->getOperand(0);
19479 } else {
19480 Base = Ptr->getOperand(0);
19481 Offset = Ptr->getOperand(1);
19482 }
19483 return true;
19484 }
19485
19486 isInc = (Ptr->getOpcode() == ISD::ADD);
19487 Base = Ptr->getOperand(0);
19488 Offset = Ptr->getOperand(1);
19489 return true;
19490 }
19491
19492 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19493 return false;
19494}
19495
19496 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
19497                                      bool isSEXTLoad, SDValue &Base,
19498 SDValue &Offset, bool &isInc,
19499 SelectionDAG &DAG) {
19500 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19501 return false;
19502
19503 Base = Ptr->getOperand(0);
19504   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19505     int RHSC = (int)RHS->getZExtValue();
19506 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19507 assert(Ptr->getOpcode() == ISD::ADD);
19508 isInc = false;
19509 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19510 return true;
19511 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19512 isInc = Ptr->getOpcode() == ISD::ADD;
19513 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19514 return true;
19515 }
19516 }
19517
19518 return false;
19519}
19520
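// MVE write-back loads/stores take a 7-bit immediate scaled by the element
// size, e.g. offsets that are multiples of 4 in roughly [-508, 508] for a
// vldrw.u32; the IsInRange checks below (Limit 0x80 with a per-type scale)
// encode exactly that.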
19521static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19522 bool isSEXTLoad, bool IsMasked, bool isLE,
19523                                       SDValue &Base, SDValue &Offset,
19524                                       bool &isInc, SelectionDAG &DAG) {
19525 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19526 return false;
19527 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19528 return false;
19529
19530 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19531 // as opposed to a vldrw.32). This can allow extra addressing modes or
19532 // alignments for what is otherwise an equivalent instruction.
19533 bool CanChangeType = isLE && !IsMasked;
19534
19536 int RHSC = (int)RHS->getZExtValue();
19537
19538 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19539 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19540 assert(Ptr->getOpcode() == ISD::ADD);
19541 isInc = false;
19542 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19543 return true;
19544 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19545 isInc = Ptr->getOpcode() == ISD::ADD;
19546 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19547 return true;
19548 }
19549 return false;
19550 };
19551
19552 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19553 // (in BE/masked) type.
19554 Base = Ptr->getOperand(0);
19555 if (VT == MVT::v4i16) {
19556 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19557 return true;
19558 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19559 if (IsInRange(RHSC, 0x80, 1))
19560 return true;
19561 } else if (Alignment >= 4 &&
19562 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19563 IsInRange(RHSC, 0x80, 4))
19564 return true;
19565 else if (Alignment >= 2 &&
19566 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19567 IsInRange(RHSC, 0x80, 2))
19568 return true;
19569 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19570 return true;
19571 return false;
19572}
19573
19574/// getPreIndexedAddressParts - returns true by value, and the base pointer,
19575/// offset pointer and addressing mode by reference, if the node's address
19576/// can be legally represented as a pre-indexed load / store address.
19577bool
19578 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
19579                                              SDValue &Offset,
19580                                              ISD::MemIndexedMode &AM,
19581                                              SelectionDAG &DAG) const {
19582 if (Subtarget->isThumb1Only())
19583 return false;
19584
19585 EVT VT;
19586 SDValue Ptr;
19587 Align Alignment;
19588 bool isSEXTLoad = false;
19589 bool IsMasked = false;
19590 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19591 Ptr = LD->getBasePtr();
19592 VT = LD->getMemoryVT();
19593 Alignment = LD->getAlign();
19594 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19595 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19596 Ptr = ST->getBasePtr();
19597 VT = ST->getMemoryVT();
19598 Alignment = ST->getAlign();
19599 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19600 Ptr = LD->getBasePtr();
19601 VT = LD->getMemoryVT();
19602 Alignment = LD->getAlign();
19603 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19604 IsMasked = true;
19605   } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19606     Ptr = ST->getBasePtr();
19607 VT = ST->getMemoryVT();
19608 Alignment = ST->getAlign();
19609 IsMasked = true;
19610 } else
19611 return false;
19612
19613 bool isInc;
19614 bool isLegal = false;
19615 if (VT.isVector())
19616 isLegal = Subtarget->hasMVEIntegerOps() &&
19617               getMVEIndexedAddressParts(
19618                   Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19619 Subtarget->isLittle(), Base, Offset, isInc, DAG);
19620 else {
19621 if (Subtarget->isThumb2())
19622 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19623 Offset, isInc, DAG);
19624 else
19625 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19626 Offset, isInc, DAG);
19627 }
19628 if (!isLegal)
19629 return false;
19630
19631 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
19632 return true;
19633}
19634
19635/// getPostIndexedAddressParts - returns true by value, and the base pointer,
19636/// offset pointer and addressing mode by reference, if this node can be
19637/// combined with a load / store to form a post-indexed load / store.
19638 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
19639                                                    SDValue &Base,
19640                                                    SDValue &Offset,
19641                                                    ISD::MemIndexedMode &AM,
19642                                                    SelectionDAG &DAG) const {
19643 EVT VT;
19644 SDValue Ptr;
19645 Align Alignment;
19646 bool isSEXTLoad = false, isNonExt;
19647 bool IsMasked = false;
19648 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19649 VT = LD->getMemoryVT();
19650 Ptr = LD->getBasePtr();
19651 Alignment = LD->getAlign();
19652 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19653 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19654 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19655 VT = ST->getMemoryVT();
19656 Ptr = ST->getBasePtr();
19657 Alignment = ST->getAlign();
19658 isNonExt = !ST->isTruncatingStore();
19659 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19660 VT = LD->getMemoryVT();
19661 Ptr = LD->getBasePtr();
19662 Alignment = LD->getAlign();
19663 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19664 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19665 IsMasked = true;
19666   } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19667     VT = ST->getMemoryVT();
19668 Ptr = ST->getBasePtr();
19669 Alignment = ST->getAlign();
19670 isNonExt = !ST->isTruncatingStore();
19671 IsMasked = true;
19672 } else
19673 return false;
19674
19675 if (Subtarget->isThumb1Only()) {
19676 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
19677 // must be non-extending/truncating, i32, with an offset of 4.
19678 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
19679 if (Op->getOpcode() != ISD::ADD || !isNonExt)
19680 return false;
19681 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
19682 if (!RHS || RHS->getZExtValue() != 4)
19683 return false;
19684 if (Alignment < Align(4))
19685 return false;
19686
19687 Offset = Op->getOperand(1);
19688 Base = Op->getOperand(0);
19689 AM = ISD::POST_INC;
19690 return true;
19691 }
19692
19693 bool isInc;
19694 bool isLegal = false;
19695 if (VT.isVector())
19696 isLegal = Subtarget->hasMVEIntegerOps() &&
19697 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
19698 Subtarget->isLittle(), Base, Offset,
19699 isInc, DAG);
19700 else {
19701 if (Subtarget->isThumb2())
19702 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19703 isInc, DAG);
19704 else
19705 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19706 isInc, DAG);
19707 }
19708 if (!isLegal)
19709 return false;
19710
19711 if (Ptr != Base) {
19712 // Swap base ptr and offset to catch more post-index load / store when
19713 // it's legal. In Thumb2 mode, offset must be an immediate.
19714 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
19715 !Subtarget->isThumb2())
19716       std::swap(Base, Offset);
19717
19718 // Post-indexed load / store update the base pointer.
19719 if (Ptr != Base)
19720 return false;
19721 }
19722
19723 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
19724 return true;
19725}
19726
19727 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
19728                                                       KnownBits &Known,
19729 const APInt &DemandedElts,
19730 const SelectionDAG &DAG,
19731 unsigned Depth) const {
19732 unsigned BitWidth = Known.getBitWidth();
19733 Known.resetAll();
19734 switch (Op.getOpcode()) {
19735 default: break;
19736 case ARMISD::ADDC:
19737 case ARMISD::ADDE:
19738 case ARMISD::SUBC:
19739 case ARMISD::SUBE:
19740 // Special cases when we convert a carry to a boolean.
19741 if (Op.getResNo() == 0) {
19742 SDValue LHS = Op.getOperand(0);
19743 SDValue RHS = Op.getOperand(1);
19744 // (ADDE 0, 0, C) will give us a single bit.
19745 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
19746 isNullConstant(RHS)) {
19747         Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
19748         return;
19749 }
19750 }
19751 break;
19752 case ARMISD::CMOV: {
19753 // Bits are known zero/one if known on the LHS and RHS.
19754 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
19755 if (Known.isUnknown())
19756 return;
19757
19758 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
19759 Known = Known.intersectWith(KnownRHS);
19760 return;
19761 }
19762   case ISD::INTRINSIC_W_CHAIN: {
19763     Intrinsic::ID IntID =
19764 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
19765 switch (IntID) {
19766 default: return;
19767 case Intrinsic::arm_ldaex:
19768 case Intrinsic::arm_ldrex: {
19769 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
19770 unsigned MemBits = VT.getScalarSizeInBits();
19771 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
19772 return;
19773 }
19774 }
19775 }
19776 case ARMISD::BFI: {
19777 // Conservatively, we can recurse down the first operand
19778 // and just mask out all affected bits.
19779 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
19780
19781 // The operand to BFI is already a mask suitable for removing the bits it
19782 // sets.
19783 const APInt &Mask = Op.getConstantOperandAPInt(2);
19784 Known.Zero &= Mask;
19785 Known.One &= Mask;
19786 return;
19787 }
19788 case ARMISD::VGETLANEs:
19789 case ARMISD::VGETLANEu: {
19790 const SDValue &SrcSV = Op.getOperand(0);
19791 EVT VecVT = SrcSV.getValueType();
19792 assert(VecVT.isVector() && "VGETLANE expected a vector type");
19793 const unsigned NumSrcElts = VecVT.getVectorNumElements();
19794 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
19795 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
19796 "VGETLANE index out of bounds");
19797 unsigned Idx = Pos->getZExtValue();
19798 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
19799 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
19800
19801 EVT VT = Op.getValueType();
19802 const unsigned DstSz = VT.getScalarSizeInBits();
19803 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
19804 (void)SrcSz;
19805 assert(SrcSz == Known.getBitWidth());
19806 assert(DstSz > SrcSz);
19807 if (Op.getOpcode() == ARMISD::VGETLANEs)
19808 Known = Known.sext(DstSz);
19809 else {
19810 Known = Known.zext(DstSz);
19811 }
19812 assert(DstSz == Known.getBitWidth());
19813 break;
19814 }
19815 case ARMISD::VMOVrh: {
19816 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
19817 assert(KnownOp.getBitWidth() == 16);
19818 Known = KnownOp.zext(32);
19819 break;
19820 }
19821 case ARMISD::CSINC:
19822 case ARMISD::CSINV:
19823 case ARMISD::CSNEG: {
19824 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
19825 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
19826
19827 // The result is either:
19828 // CSINC: KnownOp0 or KnownOp1 + 1
19829 // CSINV: KnownOp0 or ~KnownOp1
19830 // CSNEG: KnownOp0 or KnownOp1 * -1
19831 if (Op.getOpcode() == ARMISD::CSINC)
19832 KnownOp1 =
19833 KnownBits::add(KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
19834 else if (Op.getOpcode() == ARMISD::CSINV)
19835 std::swap(KnownOp1.Zero, KnownOp1.One);
19836 else if (Op.getOpcode() == ARMISD::CSNEG)
19837 KnownOp1 = KnownBits::mul(KnownOp1,
19839
19840 Known = KnownOp0.intersectWith(KnownOp1);
19841 break;
19842 }
19843 case ARMISD::VORRIMM:
19844 case ARMISD::VBICIMM: {
19845 unsigned Encoded = Op.getConstantOperandVal(1);
19846 unsigned DecEltBits = 0;
19847 uint64_t DecodedVal = ARM_AM::decodeVMOVModImm(Encoded, DecEltBits);
19848
19849 unsigned EltBits = Op.getScalarValueSizeInBits();
19850 if (EltBits != DecEltBits) {
19851 // Be conservative: only update Known when EltBits == DecEltBits.
19852 // This is believed to always be true for VORRIMM/VBICIMM today, but if
19853 // that changes in the future, doing nothing here is safer than risking
19854 // subtle bugs.
19855 break;
19856 }
19857
19858 KnownBits KnownLHS = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
19859 bool IsVORR = Op.getOpcode() == ARMISD::VORRIMM;
19860 APInt Imm(DecEltBits, DecodedVal);
19861
19862 Known.One = IsVORR ? (KnownLHS.One | Imm) : (KnownLHS.One & ~Imm);
19863 Known.Zero = IsVORR ? (KnownLHS.Zero & ~Imm) : (KnownLHS.Zero | Imm);
19864 break;
19865 }
19866 }
19867}
19868
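// The hook below shrinks (or widens) an AND mask towards something cheap to
// encode. For example, if only the low 8 bits of (and x, 0x1ff) are demanded,
// the mask can be shrunk to 0xff, which can typically be selected as a single
// uxtb even on Thumb1, where 0x1ff itself would need extra instructions to
// materialize.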
19869 bool ARMTargetLowering::targetShrinkDemandedConstant(
19870     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
19871 TargetLoweringOpt &TLO) const {
19872 // Delay optimization, so we don't have to deal with illegal types, or block
19873 // optimizations.
19874 if (!TLO.LegalOps)
19875 return false;
19876
19877 // Only optimize AND for now.
19878 if (Op.getOpcode() != ISD::AND)
19879 return false;
19880
19881 EVT VT = Op.getValueType();
19882
19883 // Ignore vectors.
19884 if (VT.isVector())
19885 return false;
19886
19887 assert(VT == MVT::i32 && "Unexpected integer type");
19888
19889 // Make sure the RHS really is a constant.
19890 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
19891 if (!C)
19892 return false;
19893
19894 unsigned Mask = C->getZExtValue();
19895
19896 unsigned Demanded = DemandedBits.getZExtValue();
19897 unsigned ShrunkMask = Mask & Demanded;
19898 unsigned ExpandedMask = Mask | ~Demanded;
19899
19900 // If the mask is all zeros, let the target-independent code replace the
19901 // result with zero.
19902 if (ShrunkMask == 0)
19903 return false;
19904
19905 // If the mask is all ones, erase the AND. (Currently, the target-independent
19906 // code won't do this, so we have to do it explicitly to avoid an infinite
19907 // loop in obscure cases.)
19908 if (ExpandedMask == ~0U)
19909 return TLO.CombineTo(Op, Op.getOperand(0));
19910
19911 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
19912 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
19913 };
19914 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
19915 if (NewMask == Mask)
19916 return true;
19917 SDLoc DL(Op);
19918 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
19919 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
19920 return TLO.CombineTo(Op, NewOp);
19921 };
19922
19923 // Prefer uxtb mask.
19924 if (IsLegalMask(0xFF))
19925 return UseMask(0xFF);
19926
19927 // Prefer uxth mask.
19928 if (IsLegalMask(0xFFFF))
19929 return UseMask(0xFFFF);
19930
19931 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
19932 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
19933 if (ShrunkMask < 256)
19934 return UseMask(ShrunkMask);
19935
19936 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
19937 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
19938 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
19939 return UseMask(ExpandedMask);
19940
19941 // Potential improvements:
19942 //
19943 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
19944 // We could try to prefer Thumb1 immediates which can be lowered to a
19945 // two-instruction sequence.
19946 // We could try to recognize more legal ARM/Thumb2 immediates here.
19947
19948 return false;
19949}
19950
19951 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
19952     SDValue Op, const APInt &OriginalDemandedBits,
19953 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
19954 unsigned Depth) const {
19955 unsigned Opc = Op.getOpcode();
19956
19957 switch (Opc) {
19958 case ARMISD::ASRL:
19959 case ARMISD::LSRL: {
19960 // If this is result 0 and the other result is unused, see if the demand
19961 // bits allow us to shrink this long shift into a standard small shift in
19962 // the opposite direction.
19963 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
19964 isa<ConstantSDNode>(Op->getOperand(2))) {
19965 unsigned ShAmt = Op->getConstantOperandVal(2);
19966 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
19967 << (32 - ShAmt)))
19968 return TLO.CombineTo(
19969 Op, TLO.DAG.getNode(
19970 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
19971 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
19972 }
19973 break;
19974 }
19975 case ARMISD::VBICIMM: {
19976 SDValue Op0 = Op.getOperand(0);
19977 unsigned ModImm = Op.getConstantOperandVal(1);
19978 unsigned EltBits = 0;
19979 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
19980 if ((OriginalDemandedBits & Mask) == 0)
19981 return TLO.CombineTo(Op, Op0);
19982 }
19983 }
19984
19985   return TargetLowering::SimplifyDemandedBitsForTargetNode(
19986       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
19987}
19988
19989//===----------------------------------------------------------------------===//
19990// ARM Inline Assembly Support
19991//===----------------------------------------------------------------------===//
19992
19993const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
19994 // At this point, we have to lower this constraint to something else, so we
19995 // lower it to an "r" or "w". However, by doing this we will force the result
19996 // to be in register, while the X constraint is much more permissive.
19997 //
19998 // Although we are correct (we are free to emit anything, without
19999 // constraints), we might break use cases that would expect us to be more
20000 // efficient and emit something else.
20001 if (!Subtarget->hasVFP2Base())
20002 return "r";
20003 if (ConstraintVT.isFloatingPoint())
20004 return "w";
20005 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20006 (ConstraintVT.getSizeInBits() == 64 ||
20007 ConstraintVT.getSizeInBits() == 128))
20008 return "w";
20009
20010 return "r";
20011}
20012
20013/// getConstraintType - Given a constraint letter, return the type of
20014/// constraint it is for this target.
20015 ARMTargetLowering::ConstraintType
20016 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
20017   unsigned S = Constraint.size();
20018 if (S == 1) {
20019 switch (Constraint[0]) {
20020 default: break;
20021 case 'l': return C_RegisterClass;
20022 case 'w': return C_RegisterClass;
20023 case 'h': return C_RegisterClass;
20024 case 'x': return C_RegisterClass;
20025 case 't': return C_RegisterClass;
20026 case 'j': return C_Immediate; // Constant for movw.
20027 // An address with a single base register. Due to the way we
20028 // currently handle addresses it is the same as an 'r' memory constraint.
20029 case 'Q': return C_Memory;
20030 }
20031 } else if (S == 2) {
20032 switch (Constraint[0]) {
20033 default: break;
20034 case 'T': return C_RegisterClass;
20035 // All 'U+' constraints are addresses.
20036 case 'U': return C_Memory;
20037 }
20038 }
20039 return TargetLowering::getConstraintType(Constraint);
20040}
20041
20042/// Examine constraint type and operand type and determine a weight value.
20043/// This object must already have been set up with the operand type
20044/// and the current alternative constraint selected.
20045 TargetLowering::ConstraintWeight
20046 ARMTargetLowering::getSingleConstraintMatchWeight(
20047     AsmOperandInfo &info, const char *constraint) const {
20048   ConstraintWeight weight = CW_Invalid;
20049   Value *CallOperandVal = info.CallOperandVal;
20050 // If we don't have a value, we can't do a match,
20051 // but allow it at the lowest weight.
20052 if (!CallOperandVal)
20053 return CW_Default;
20054 Type *type = CallOperandVal->getType();
20055 // Look at the constraint type.
20056 switch (*constraint) {
20057 default:
20058     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
20059     break;
20060 case 'l':
20061 if (type->isIntegerTy()) {
20062 if (Subtarget->isThumb())
20063 weight = CW_SpecificReg;
20064 else
20065 weight = CW_Register;
20066 }
20067 break;
20068 case 'w':
20069 if (type->isFloatingPointTy())
20070 weight = CW_Register;
20071 break;
20072 }
20073 return weight;
20074}
20075
20076static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
20077 if (PR == 0 || VT == MVT::Other)
20078 return false;
20079 if (ARM::SPRRegClass.contains(PR))
20080 return VT != MVT::f32 && VT != MVT::f16 && VT != MVT::i32;
20081 if (ARM::DPRRegClass.contains(PR))
20082 return VT != MVT::f64 && !VT.is64BitVector();
20083 return false;
20084}
20085
20086using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20087
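// Example (GCC-style inline asm): "l" requests a low register (r0-r7) in
// Thumb mode, "w" a VFP/NEON register sized to the operand, and "t" an
// S register; e.g. asm("vadd.f32 %0, %1, %2" : "=t"(r) : "t"(a), "t"(b))
// constrains all three operands to the SPR class handled below (illustrative
// usage, not from this file).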
20088 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
20089     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20090 switch (Constraint.size()) {
20091 case 1:
20092 // GCC ARM Constraint Letters
20093 switch (Constraint[0]) {
20094 case 'l': // Low regs or general regs.
20095 if (Subtarget->isThumb())
20096 return RCPair(0U, &ARM::tGPRRegClass);
20097 return RCPair(0U, &ARM::GPRRegClass);
20098 case 'h': // High regs or no regs.
20099 if (Subtarget->isThumb())
20100 return RCPair(0U, &ARM::hGPRRegClass);
20101 break;
20102 case 'r':
20103 if (Subtarget->isThumb1Only())
20104 return RCPair(0U, &ARM::tGPRRegClass);
20105 return RCPair(0U, &ARM::GPRRegClass);
20106 case 'w':
20107 if (VT == MVT::Other)
20108 break;
20109 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20110 return RCPair(0U, &ARM::SPRRegClass);
20111 if (VT.getSizeInBits() == 64)
20112 return RCPair(0U, &ARM::DPRRegClass);
20113 if (VT.getSizeInBits() == 128)
20114 return RCPair(0U, &ARM::QPRRegClass);
20115 break;
20116 case 'x':
20117 if (VT == MVT::Other)
20118 break;
20119 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20120 return RCPair(0U, &ARM::SPR_8RegClass);
20121 if (VT.getSizeInBits() == 64)
20122 return RCPair(0U, &ARM::DPR_8RegClass);
20123 if (VT.getSizeInBits() == 128)
20124 return RCPair(0U, &ARM::QPR_8RegClass);
20125 break;
20126 case 't':
20127 if (VT == MVT::Other)
20128 break;
20129 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20130 return RCPair(0U, &ARM::SPRRegClass);
20131 if (VT.getSizeInBits() == 64)
20132 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20133 if (VT.getSizeInBits() == 128)
20134 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20135 break;
20136 }
20137 break;
20138
20139 case 2:
20140 if (Constraint[0] == 'T') {
20141 switch (Constraint[1]) {
20142 default:
20143 break;
20144 case 'e':
20145 return RCPair(0U, &ARM::tGPREvenRegClass);
20146 case 'o':
20147 return RCPair(0U, &ARM::tGPROddRegClass);
20148 }
20149 }
20150 break;
20151
20152 default:
20153 break;
20154 }
20155
20156 if (StringRef("{cc}").equals_insensitive(Constraint))
20157 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20158
20159 auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20160 if (isIncompatibleReg(RCP.first, VT))
20161 return {0, nullptr};
20162 return RCP;
20163}
20164
20165/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20166/// vector. If it is invalid, don't add anything to Ops.
20167 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
20168                                                      StringRef Constraint,
20169 std::vector<SDValue> &Ops,
20170 SelectionDAG &DAG) const {
20171 SDValue Result;
20172
20173 // Currently only support length 1 constraints.
20174 if (Constraint.size() != 1)
20175 return;
20176
20177 char ConstraintLetter = Constraint[0];
20178 switch (ConstraintLetter) {
20179 default: break;
20180 case 'j':
20181 case 'I': case 'J': case 'K': case 'L':
20182   case 'M': case 'N': case 'O': {
20183     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
20184     if (!C)
20185 return;
20186
20187 int64_t CVal64 = C->getSExtValue();
20188 int CVal = (int) CVal64;
20189 // None of these constraints allow values larger than 32 bits. Check
20190 // that the value fits in an int.
20191 if (CVal != CVal64)
20192 return;
20193
20194 switch (ConstraintLetter) {
20195 case 'j':
20196 // Constant suitable for movw, must be between 0 and
20197 // 65535.
20198 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20199 if (CVal >= 0 && CVal <= 65535)
20200 break;
20201 return;
20202 case 'I':
20203 if (Subtarget->isThumb1Only()) {
20204 // This must be a constant between 0 and 255, for ADD
20205 // immediates.
20206 if (CVal >= 0 && CVal <= 255)
20207 break;
20208 } else if (Subtarget->isThumb2()) {
20209 // A constant that can be used as an immediate value in a
20210 // data-processing instruction.
20211 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20212 break;
20213 } else {
20214 // A constant that can be used as an immediate value in a
20215 // data-processing instruction.
20216 if (ARM_AM::getSOImmVal(CVal) != -1)
20217 break;
20218 }
20219 return;
20220
20221 case 'J':
20222 if (Subtarget->isThumb1Only()) {
20223 // This must be a constant between -255 and -1, for negated ADD
20224 // immediates. This can be used in GCC with an "n" modifier that
20225 // prints the negated value, for use with SUB instructions. It is
20226 // not useful otherwise but is implemented for compatibility.
20227 if (CVal >= -255 && CVal <= -1)
20228 break;
20229 } else {
20230 // This must be a constant between -4095 and 4095. This is suitable
20231 // for use as the immediate offset field in LDR and STR instructions
20232 // such as LDR r0,[r1,#offset].
20233 if (CVal >= -4095 && CVal <= 4095)
20234 break;
20235 }
20236 return;
20237
20238 case 'K':
20239 if (Subtarget->isThumb1Only()) {
20240 // A 32-bit value where only one byte has a nonzero value. Exclude
20241 // zero to match GCC. This constraint is used by GCC internally for
20242 // constants that can be loaded with a move/shift combination.
20243 // It is not useful otherwise but is implemented for compatibility.
20244 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20245 break;
20246 } else if (Subtarget->isThumb2()) {
20247 // A constant whose bitwise inverse can be used as an immediate
20248 // value in a data-processing instruction. This can be used in GCC
20249 // with a "B" modifier that prints the inverted value, for use with
20250 // BIC and MVN instructions. It is not useful otherwise but is
20251 // implemented for compatibility.
20252 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20253 break;
20254 } else {
20255 // A constant whose bitwise inverse can be used as an immediate
20256 // value in a data-processing instruction. This can be used in GCC
20257 // with a "B" modifier that prints the inverted value, for use with
20258 // BIC and MVN instructions. It is not useful otherwise but is
20259 // implemented for compatibility.
20260 if (ARM_AM::getSOImmVal(~CVal) != -1)
20261 break;
20262 }
20263 return;
20264
20265 case 'L':
20266 if (Subtarget->isThumb1Only()) {
20267 // This must be a constant between -7 and 7,
20268 // for 3-operand ADD/SUB immediate instructions.
20269 if (CVal >= -7 && CVal < 7)
20270 break;
20271 } else if (Subtarget->isThumb2()) {
20272 // A constant whose negation can be used as an immediate value in a
20273 // data-processing instruction. This can be used in GCC with an "n"
20274 // modifier that prints the negated value, for use with SUB
20275 // instructions. It is not useful otherwise but is implemented for
20276 // compatibility.
20277 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20278 break;
20279 } else {
20280 // A constant whose negation can be used as an immediate value in a
20281 // data-processing instruction. This can be used in GCC with an "n"
20282 // modifier that prints the negated value, for use with SUB
20283 // instructions. It is not useful otherwise but is implemented for
20284 // compatibility.
20285 if (ARM_AM::getSOImmVal(-CVal) != -1)
20286 break;
20287 }
20288 return;
20289
20290 case 'M':
20291 if (Subtarget->isThumb1Only()) {
20292 // This must be a multiple of 4 between 0 and 1020, for
20293 // ADD sp + immediate.
20294 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20295 break;
20296 } else {
20297 // A power of two or a constant between 0 and 32. This is used in
20298 // GCC for the shift amount on shifted register operands, but it is
20299 // useful in general for any shift amounts.
20300 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20301 break;
20302 }
20303 return;
20304
20305 case 'N':
20306 if (Subtarget->isThumb1Only()) {
20307 // This must be a constant between 0 and 31, for shift amounts.
20308 if (CVal >= 0 && CVal <= 31)
20309 break;
20310 }
20311 return;
20312
20313 case 'O':
20314 if (Subtarget->isThumb1Only()) {
20315 // This must be a multiple of 4 between -508 and 508, for
20316 // ADD/SUB sp = sp + immediate.
20317 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20318 break;
20319 }
20320 return;
20321 }
20322 Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType());
20323 break;
20324 }
20325
20326 if (Result.getNode()) {
20327 Ops.push_back(Result);
20328 return;
20329 }
20330 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20331}
20332
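// The AEABI runtime provides combined division helpers (__aeabi_idivmod,
// __aeabi_uidivmod and their 64-bit variants) that return the quotient and
// remainder together (for the 32-bit case, quotient in r0 and remainder in
// r1), which is why the lowering below builds a {div, rem} struct return type
// and then extracts the piece it needs.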
20333static RTLIB::Libcall getDivRemLibcall(
20334 const SDNode *N, MVT::SimpleValueType SVT) {
20335 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20336 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20337 "Unhandled Opcode in getDivRemLibcall");
20338 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20339 N->getOpcode() == ISD::SREM;
20340 RTLIB::Libcall LC;
20341 switch (SVT) {
20342 default: llvm_unreachable("Unexpected request for libcall!");
20343 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20344 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20345 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20346 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20347 }
20348 return LC;
20349}
20350
20351 static TargetLowering::ArgListTy getDivRemArgList(
20352     const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20353 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20354 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20355 "Unhandled Opcode in getDivRemArgList");
20356 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20357 N->getOpcode() == ISD::SREM;
20358   TargetLowering::ArgListTy Args;
20359   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20360 EVT ArgVT = N->getOperand(i).getValueType();
20361 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20362 TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy);
20363 Entry.IsSExt = isSigned;
20364 Entry.IsZExt = !isSigned;
20365 Args.push_back(Entry);
20366 }
20367 if (Subtarget->isTargetWindows() && Args.size() >= 2)
20368 std::swap(Args[0], Args[1]);
20369 return Args;
20370}
20371
20372SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20373 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20374 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20375 Subtarget->isTargetFuchsia() || Subtarget->isTargetWindows()) &&
20376 "Register-based DivRem lowering only");
20377 unsigned Opcode = Op->getOpcode();
20378 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20379 "Invalid opcode for Div/Rem lowering");
20380 bool isSigned = (Opcode == ISD::SDIVREM);
20381 EVT VT = Op->getValueType(0);
20382 SDLoc dl(Op);
20383
20384 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20385     SmallVector<SDValue> Result;
20386     if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20387 SDValue Res0 =
20388 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20389 SDValue Res1 =
20390 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20391 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20392 {Res0, Res1});
20393 }
20394 }
20395
20396 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
20397
20398 // If the target has hardware divide, use divide + multiply + subtract:
20399 // div = a / b
20400 // rem = a - b * div
20401 // return {div, rem}
20402 // This should be lowered into UDIV/SDIV + MLS later on.
20403 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20404 : Subtarget->hasDivideInARMMode();
20405 if (hasDivide && Op->getValueType(0).isSimple() &&
20406 Op->getSimpleValueType(0) == MVT::i32) {
20407 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20408 const SDValue Dividend = Op->getOperand(0);
20409 const SDValue Divisor = Op->getOperand(1);
20410 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20411 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20412 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
20413
20414 SDValue Values[2] = {Div, Rem};
20415 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20416 }
20417
20418 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20419 VT.getSimpleVT().SimpleTy);
20420 SDValue InChain = DAG.getEntryNode();
20421
20422   TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
20423                                                     DAG.getContext(),
20424 Subtarget);
20425
20428
20429 Type *RetTy = StructType::get(Ty, Ty);
20430
20431 if (Subtarget->isTargetWindows())
20432 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
20433
20434 TargetLowering::CallLoweringInfo CLI(DAG);
20435 CLI.setDebugLoc(dl).setChain(InChain)
20436 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
20437     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
20438
20439 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20440 return CallInfo.first;
20441}
20442
20443// Lowers REM using divmod helpers
20444// see RTABI section 4.2/4.3
20445SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20446 EVT VT = N->getValueType(0);
20447
20448 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20449     SmallVector<SDValue> Result;
20450     if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20451 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20452 Result[0], Result[1]);
20453 }
20454
20455 // Build return types (div and rem)
20456 std::vector<Type*> RetTyParams;
20457 Type *RetTyElement;
20458
20459 switch (VT.getSimpleVT().SimpleTy) {
20460 default: llvm_unreachable("Unexpected request for libcall!");
20461 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20462 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20463 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20464 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20465 }
20466
20467 RetTyParams.push_back(RetTyElement);
20468 RetTyParams.push_back(RetTyElement);
20469 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20470 Type *RetTy = StructType::get(*DAG.getContext(), ret);
20471
20472 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20473 SimpleTy);
20474 SDValue InChain = DAG.getEntryNode();
20475 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
20476 Subtarget);
20477 bool isSigned = N->getOpcode() == ISD::SREM;
20478 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
20479 getPointerTy(DAG.getDataLayout()));
20480
20481 if (Subtarget->isTargetWindows())
20482 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20483
20484 // Lower call
20485 CallLoweringInfo CLI(DAG);
20486 CLI.setChain(InChain)
20487 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
20488 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
20489 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20490
20491 // Return second (rem) result operand (first contains div)
20492 SDNode *ResNode = CallResult.first.getNode();
20493 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20494 return ResNode->getOperand(1);
20495}
20496
20497SDValue
20498ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20499 assert(Subtarget->isTargetWindows() && "unsupported target platform");
20500 SDLoc DL(Op);
20501
20502 // Get the inputs.
20503 SDValue Chain = Op.getOperand(0);
20504 SDValue Size = Op.getOperand(1);
20505
20507 "no-stack-arg-probe")) {
20508 MaybeAlign Align =
20509 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
20510 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20511 Chain = SP.getValue(1);
20512 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
20513 if (Align)
20514 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20515 DAG.getSignedConstant(-Align->value(), DL, MVT::i32));
20516 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20517 SDValue Ops[2] = { SP, Chain };
20518 return DAG.getMergeValues(Ops, DL);
20519 }
20520
20521 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20522 DAG.getConstant(2, DL, MVT::i32));
20523
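// Windows' __chkstk expects the allocation size in 32-bit words in r4; the
// WIN__CHKSTK node below models that call, and the adjusted SP is read back
// afterwards.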
20524 SDValue Glue;
20525 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20526 Glue = Chain.getValue(1);
20527
20528 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20529 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
20530
20531 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20532 Chain = NewSP.getValue(1);
20533
20534 SDValue Ops[2] = { NewSP, Chain };
20535 return DAG.getMergeValues(Ops, DL);
20536}
20537
20538SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20539 bool IsStrict = Op->isStrictFPOpcode();
20540 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20541 const unsigned DstSz = Op.getValueType().getSizeInBits();
20542 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
20543 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
20544 "Unexpected type for custom-lowering FP_EXTEND");
20545
20546 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20547 "With both FP DP and 16, any FP conversion is legal!");
20548
20549 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
20550 "With FP16, 16 to 32 conversion is legal!");
20551
20552 // Converting from 32 -> 64 is valid if we have FP64.
20553 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
20554 // FIXME: Remove this when we have strict fp instruction selection patterns
20555 if (IsStrict) {
20556 SDLoc Loc(Op);
20557 SDValue Result = DAG.getNode(ISD::FP_EXTEND,
20558 Loc, Op.getValueType(), SrcVal);
20559 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
20560 }
20561 return Op;
20562 }
20563
20564 // Either we are converting from 16 -> 64 without FP16 and/or FP
20565 // double-precision (or without Armv8 FP at all), so we must do it in two
20566 // steps.
20567 // Or we are converting from 32 -> 64 without FP double-precision, or from
20568 // 16 -> 32 without FP16; in those cases we must make a libcall.
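// For example, an f16 -> f64 extend with FP16 but no FP64 is done as a
// hardware f16 -> f32 step followed by a libcall (typically __aeabi_f2d or
// __extendsfdf2, depending on the ABI) for the f32 -> f64 step.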
20569 SDLoc Loc(Op);
20570 RTLIB::Libcall LC;
20571 MakeLibCallOptions CallOptions;
20572 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20573 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
20574 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
20575 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
20576 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
20577 if (Supported) {
20578 if (IsStrict) {
20579 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
20580 {DstVT, MVT::Other}, {Chain, SrcVal});
20581 Chain = SrcVal.getValue(1);
20582 } else {
20583 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
20584 }
20585 } else {
20586 LC = RTLIB::getFPEXT(SrcVT, DstVT);
20587 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20588 "Unexpected type for custom-lowering FP_EXTEND");
20589 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20590 Loc, Chain);
20591 }
20592 }
20593
20594 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
20595}
20596
20597SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20598 bool IsStrict = Op->isStrictFPOpcode();
20599
20600 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20601 EVT SrcVT = SrcVal.getValueType();
20602 EVT DstVT = Op.getValueType();
20603 const unsigned DstSz = Op.getValueType().getSizeInBits();
20604 const unsigned SrcSz = SrcVT.getSizeInBits();
20605 (void)DstSz;
20606 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
20607 "Unexpected type for custom-lowering FP_ROUND");
20608
20609 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20610 "With both FP DP and 16, any FP conversion is legal!");
20611
20612 SDLoc Loc(Op);
20613
20614 // A single-instruction 32 -> 16 conversion is available if we have FP16.
20615 if (SrcSz == 32 && Subtarget->hasFP16())
20616 return Op;
20617
20618 // Lib call from 32 -> 16 / 64 -> [32, 16]
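// e.g. an f64 -> f16 round is typically lowered to a call such as
// __aeabi_d2h (AEABI) or __truncdfhf2, as selected by RTLIB::getFPROUND.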
20619 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
20620 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20621 "Unexpected type for custom-lowering FP_ROUND");
20622 MakeLibCallOptions CallOptions;
20623 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20624 SDValue Result;
20625 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20626 Loc, Chain);
20627 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
20628}
20629
20630bool
20631 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
20632 // The ARM target isn't yet aware of offsets.
20633 return false;
20634}
20635
20636 bool ARM::isBitFieldInvertedMask(unsigned v) {
20637 if (v == 0xffffffff)
20638 return false;
20639
20640 // there can be 1's on either or both "outsides", all the "inside"
20641 // bits must be 0's
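// e.g. 0xF000000F is a valid inverted bit-field mask (~v == 0x0FFFFFF0, a
// single contiguous run of ones), while 0xF00F000F is not.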
20642 return isShiftedMask_32(~v);
20643}
20644
20645/// isFPImmLegal - Returns true if the target can instruction select the
20646/// specified FP immediate natively. If false, the legalizer will
20647/// materialize the FP immediate as a load from a constant pool.
20648 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
20649 bool ForCodeSize) const {
20650 if (!Subtarget->hasVFP3Base())
20651 return false;
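// e.g. 1.0, 0.5 and -2.0 can be encoded as VMOV immediates, whereas 0.1
// cannot and ends up being loaded from a constant pool.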
20652 if (VT == MVT::f16 && Subtarget->hasFullFP16())
20653 return ARM_AM::getFP16Imm(Imm) != -1;
20654 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
20655 ARM_AM::getFP32FP16Imm(Imm) != -1)
20656 return true;
20657 if (VT == MVT::f32)
20658 return ARM_AM::getFP32Imm(Imm) != -1;
20659 if (VT == MVT::f64 && Subtarget->hasFP64())
20660 return ARM_AM::getFP64Imm(Imm) != -1;
20661 return false;
20662}
20663
20664/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
20665/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
20666/// specified in the intrinsic calls.
20667 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
20668 const CallBase &I,
20669 MachineFunction &MF,
20670 unsigned Intrinsic) const {
20671 switch (Intrinsic) {
20672 case Intrinsic::arm_neon_vld1:
20673 case Intrinsic::arm_neon_vld2:
20674 case Intrinsic::arm_neon_vld3:
20675 case Intrinsic::arm_neon_vld4:
20676 case Intrinsic::arm_neon_vld2lane:
20677 case Intrinsic::arm_neon_vld3lane:
20678 case Intrinsic::arm_neon_vld4lane:
20679 case Intrinsic::arm_neon_vld2dup:
20680 case Intrinsic::arm_neon_vld3dup:
20681 case Intrinsic::arm_neon_vld4dup: {
20682 Info.opc = ISD::INTRINSIC_W_CHAIN;
20683 // Conservatively set memVT to the entire set of vectors loaded.
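// e.g. a vld4 returning four <4 x i32> vectors (512 bits total) is modelled
// here as a v8i64 memory access.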
20684 auto &DL = I.getDataLayout();
20685 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20686 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20687 Info.ptrVal = I.getArgOperand(0);
20688 Info.offset = 0;
20689 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20690 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20691 // volatile loads with NEON intrinsics not supported
20692 Info.flags = MachineMemOperand::MOLoad;
20693 return true;
20694 }
20695 case Intrinsic::arm_neon_vld1x2:
20696 case Intrinsic::arm_neon_vld1x3:
20697 case Intrinsic::arm_neon_vld1x4: {
20698 Info.opc = ISD::INTRINSIC_W_CHAIN;
20699 // Conservatively set memVT to the entire set of vectors loaded.
20700 auto &DL = I.getDataLayout();
20701 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20702 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20703 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
20704 Info.offset = 0;
20705 Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne();
20706 // volatile loads with NEON intrinsics not supported
20707 Info.flags = MachineMemOperand::MOLoad;
20708 return true;
20709 }
20710 case Intrinsic::arm_neon_vst1:
20711 case Intrinsic::arm_neon_vst2:
20712 case Intrinsic::arm_neon_vst3:
20713 case Intrinsic::arm_neon_vst4:
20714 case Intrinsic::arm_neon_vst2lane:
20715 case Intrinsic::arm_neon_vst3lane:
20716 case Intrinsic::arm_neon_vst4lane: {
20717 Info.opc = ISD::INTRINSIC_VOID;
20718 // Conservatively set memVT to the entire set of vectors stored.
20719 auto &DL = I.getDataLayout();
20720 unsigned NumElts = 0;
20721 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20722 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20723 if (!ArgTy->isVectorTy())
20724 break;
20725 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20726 }
20727 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20728 Info.ptrVal = I.getArgOperand(0);
20729 Info.offset = 0;
20730 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20731 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20732 // volatile stores with NEON intrinsics not supported
20733 Info.flags = MachineMemOperand::MOStore;
20734 return true;
20735 }
20736 case Intrinsic::arm_neon_vst1x2:
20737 case Intrinsic::arm_neon_vst1x3:
20738 case Intrinsic::arm_neon_vst1x4: {
20739 Info.opc = ISD::INTRINSIC_VOID;
20740 // Conservatively set memVT to the entire set of vectors stored.
20741 auto &DL = I.getDataLayout();
20742 unsigned NumElts = 0;
20743 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20744 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20745 if (!ArgTy->isVectorTy())
20746 break;
20747 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20748 }
20749 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20750 Info.ptrVal = I.getArgOperand(0);
20751 Info.offset = 0;
20752 Info.align = I.getParamAlign(0).valueOrOne();
20753 // volatile stores with NEON intrinsics not supported
20754 Info.flags = MachineMemOperand::MOStore;
20755 return true;
20756 }
20757 case Intrinsic::arm_mve_vld2q:
20758 case Intrinsic::arm_mve_vld4q: {
20759 Info.opc = ISD::INTRINSIC_W_CHAIN;
20760 // Conservatively set memVT to the entire set of vectors loaded.
20761 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
20762 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
20763 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20764 Info.ptrVal = I.getArgOperand(0);
20765 Info.offset = 0;
20766 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20767 // volatile loads with MVE intrinsics not supported
20768 Info.flags = MachineMemOperand::MOLoad;
20769 return true;
20770 }
20771 case Intrinsic::arm_mve_vst2q:
20772 case Intrinsic::arm_mve_vst4q: {
20773 Info.opc = ISD::INTRINSIC_VOID;
20774 // Conservatively set memVT to the entire set of vectors stored.
20775 Type *VecTy = I.getArgOperand(1)->getType();
20776 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
20777 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20778 Info.ptrVal = I.getArgOperand(0);
20779 Info.offset = 0;
20780 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20781 // volatile stores with MVE intrinsics not supported
20782 Info.flags = MachineMemOperand::MOStore;
20783 return true;
20784 }
20785 case Intrinsic::arm_mve_vldr_gather_base:
20786 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
20787 Info.opc = ISD::INTRINSIC_W_CHAIN;
20788 Info.ptrVal = nullptr;
20789 Info.memVT = MVT::getVT(I.getType());
20790 Info.align = Align(1);
20791 Info.flags |= MachineMemOperand::MOLoad;
20792 return true;
20793 }
20794 case Intrinsic::arm_mve_vldr_gather_base_wb:
20795 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
20796 Info.opc = ISD::INTRINSIC_W_CHAIN;
20797 Info.ptrVal = nullptr;
20798 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
20799 Info.align = Align(1);
20800 Info.flags |= MachineMemOperand::MOLoad;
20801 return true;
20802 }
20803 case Intrinsic::arm_mve_vldr_gather_offset:
20804 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
20805 Info.opc = ISD::INTRINSIC_W_CHAIN;
20806 Info.ptrVal = nullptr;
20807 MVT DataVT = MVT::getVT(I.getType());
20808 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
20809 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
20810 DataVT.getVectorNumElements());
20811 Info.align = Align(1);
20812 Info.flags |= MachineMemOperand::MOLoad;
20813 return true;
20814 }
20815 case Intrinsic::arm_mve_vstr_scatter_base:
20816 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
20817 Info.opc = ISD::INTRINSIC_VOID;
20818 Info.ptrVal = nullptr;
20819 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
20820 Info.align = Align(1);
20821 Info.flags |= MachineMemOperand::MOStore;
20822 return true;
20823 }
20824 case Intrinsic::arm_mve_vstr_scatter_base_wb:
20825 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
20826 Info.opc = ISD::INTRINSIC_W_CHAIN;
20827 Info.ptrVal = nullptr;
20828 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
20829 Info.align = Align(1);
20830 Info.flags |= MachineMemOperand::MOStore;
20831 return true;
20832 }
20833 case Intrinsic::arm_mve_vstr_scatter_offset:
20834 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
20835 Info.opc = ISD::INTRINSIC_VOID;
20836 Info.ptrVal = nullptr;
20837 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
20838 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
20839 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
20840 DataVT.getVectorNumElements());
20841 Info.align = Align(1);
20842 Info.flags |= MachineMemOperand::MOStore;
20843 return true;
20844 }
20845 case Intrinsic::arm_ldaex:
20846 case Intrinsic::arm_ldrex: {
20847 auto &DL = I.getDataLayout();
20848 Type *ValTy = I.getParamElementType(0);
20849 Info.opc = ISD::INTRINSIC_W_CHAIN;
20850 Info.memVT = MVT::getVT(ValTy);
20851 Info.ptrVal = I.getArgOperand(0);
20852 Info.offset = 0;
20853 Info.align = DL.getABITypeAlign(ValTy);
20854 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
20855 return true;
20856 }
20857 case Intrinsic::arm_stlex:
20858 case Intrinsic::arm_strex: {
20859 auto &DL = I.getDataLayout();
20860 Type *ValTy = I.getParamElementType(1);
20861 Info.opc = ISD::INTRINSIC_W_CHAIN;
20862 Info.memVT = MVT::getVT(ValTy);
20863 Info.ptrVal = I.getArgOperand(1);
20864 Info.offset = 0;
20865 Info.align = DL.getABITypeAlign(ValTy);
20866 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
20867 return true;
20868 }
20869 case Intrinsic::arm_stlexd:
20870 case Intrinsic::arm_strexd:
20871 Info.opc = ISD::INTRINSIC_W_CHAIN;
20872 Info.memVT = MVT::i64;
20873 Info.ptrVal = I.getArgOperand(2);
20874 Info.offset = 0;
20875 Info.align = Align(8);
20876 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
20877 return true;
20878
20879 case Intrinsic::arm_ldaexd:
20880 case Intrinsic::arm_ldrexd:
20881 Info.opc = ISD::INTRINSIC_W_CHAIN;
20882 Info.memVT = MVT::i64;
20883 Info.ptrVal = I.getArgOperand(0);
20884 Info.offset = 0;
20885 Info.align = Align(8);
20886 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
20887 return true;
20888
20889 default:
20890 break;
20891 }
20892
20893 return false;
20894}
20895
20896/// Returns true if it is beneficial to convert a load of a constant
20897/// to just the constant itself.
20898 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
20899 Type *Ty) const {
20900 assert(Ty->isIntegerTy());
20901
20902 unsigned Bits = Ty->getPrimitiveSizeInBits();
20903 if (Bits == 0 || Bits > 32)
20904 return false;
20905 return true;
20906}
20907
20908 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
20909 unsigned Index) const {
20910 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
20911 return false;
20912
20913 return (Index == 0 || Index == ResVT.getVectorNumElements());
20914}
20915
20916 Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
20917 ARM_MB::MemBOpt Domain) const {
20918 // First, if the target has no DMB, see what fallback we can use.
20919 if (!Subtarget->hasDataBarrier()) {
20920 // Some ARMv6 cpus can support data barriers with an mcr instruction.
20921 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
20922 // here.
20923 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
20924 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
20925 Builder.getInt32(0), Builder.getInt32(7),
20926 Builder.getInt32(10), Builder.getInt32(5)};
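// These operands encode "mcr p15, 0, <Rd>, c7, c10, 5", the CP15 data
// memory barrier operation available on ARMv6.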
20927 return Builder.CreateIntrinsic(Intrinsic::arm_mcr, args);
20928 } else {
20929 // Instead of using barriers, atomic accesses on these subtargets use
20930 // libcalls.
20931 llvm_unreachable("makeDMB on a target so old that it has no barriers");
20932 }
20933 } else {
20934 // Only a full system barrier exists in the M-class architectures.
20935 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
20936 Constant *CDomain = Builder.getInt32(Domain);
20937 return Builder.CreateIntrinsic(Intrinsic::arm_dmb, CDomain);
20938 }
20939}
20940
20941// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
20942 Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
20943 Instruction *Inst,
20944 AtomicOrdering Ord) const {
20945 switch (Ord) {
20946 case AtomicOrdering::NotAtomic:
20947 case AtomicOrdering::Unordered:
20948 llvm_unreachable("Invalid fence: unordered/non-atomic");
20949 case AtomicOrdering::Monotonic:
20950 case AtomicOrdering::Acquire:
20951 return nullptr; // Nothing to do
20952 case AtomicOrdering::SequentiallyConsistent:
20953 if (!Inst->hasAtomicStore())
20954 return nullptr; // Nothing to do
20955 [[fallthrough]];
20956 case AtomicOrdering::Release:
20957 case AtomicOrdering::AcquireRelease:
20958 if (Subtarget->preferISHSTBarriers())
20959 return makeDMB(Builder, ARM_MB::ISHST);
20960 // FIXME: add a comment with a link to documentation justifying this.
20961 else
20962 return makeDMB(Builder, ARM_MB::ISH);
20963 }
20964 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
20965}
20966
20967 Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
20968 Instruction *Inst,
20969 AtomicOrdering Ord) const {
20970 switch (Ord) {
20971 case AtomicOrdering::NotAtomic:
20972 case AtomicOrdering::Unordered:
20973 llvm_unreachable("Invalid fence: unordered/not-atomic");
20974 case AtomicOrdering::Monotonic:
20975 case AtomicOrdering::Release:
20976 return nullptr; // Nothing to do
20977 case AtomicOrdering::Acquire:
20978 case AtomicOrdering::AcquireRelease:
20979 case AtomicOrdering::SequentiallyConsistent:
20980 return makeDMB(Builder, ARM_MB::ISH);
20981 }
20982 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
20983}
20984
20985// Loads and stores less than 64-bits are already atomic; ones above that
20986// are doomed anyway, so defer to the default libcall and blame the OS when
20987// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
20988// anything for those.
20989 TargetLowering::AtomicExpansionKind
20990 ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
20991 bool has64BitAtomicStore;
20992 if (Subtarget->isMClass())
20993 has64BitAtomicStore = false;
20994 else if (Subtarget->isThumb())
20995 has64BitAtomicStore = Subtarget->hasV7Ops();
20996 else
20997 has64BitAtomicStore = Subtarget->hasV6Ops();
20998
20999 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21000 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21001 : AtomicExpansionKind::None;
21002 }
21003
21004// Loads and stores less than 64-bits are already atomic; ones above that
21005// are doomed anyway, so defer to the default libcall and blame the OS when
21006// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21007// anything for those.
21008// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21009// guarantee, see DDI0406C ARM architecture reference manual,
21010// sections A8.8.72-74 LDRD)
21011 TargetLowering::AtomicExpansionKind
21012 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
21013 bool has64BitAtomicLoad;
21014 if (Subtarget->isMClass())
21015 has64BitAtomicLoad = false;
21016 else if (Subtarget->isThumb())
21017 has64BitAtomicLoad = Subtarget->hasV7Ops();
21018 else
21019 has64BitAtomicLoad = Subtarget->hasV6Ops();
21020
21021 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21022 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21023 : AtomicExpansionKind::None;
21024 }
21025
21026// For the real atomic operations, we have ldrex/strex up to 32 bits,
21027// and up to 64 bits on the non-M profiles
21028 TargetLowering::AtomicExpansionKind
21029 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
21030 if (AI->isFloatingPointOperation())
21031 return AtomicExpansionKind::CmpXChg;
21032
21033 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21034 bool hasAtomicRMW;
21035 if (Subtarget->isMClass())
21036 hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21037 else if (Subtarget->isThumb())
21038 hasAtomicRMW = Subtarget->hasV7Ops();
21039 else
21040 hasAtomicRMW = Subtarget->hasV6Ops();
21041 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21042 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21043 // implement atomicrmw without spilling. If the target address is also on
21044 // the stack and close enough to the spill slot, this can lead to a
21045 // situation where the monitor always gets cleared and the atomic operation
21046 // can never succeed. So at -O0 lower this operation to a CAS loop.
21047 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21048 return AtomicExpansionKind::CmpXChg;
21049 return AtomicExpansionKind::LLSC;
21050 }
21051 return AtomicExpansionKind::None;
21052 }
21053
21054// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21055// bits, and up to 64 bits on the non-M profiles.
21056 TargetLowering::AtomicExpansionKind
21057 ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
21058 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21059 // implement cmpxchg without spilling. If the address being exchanged is also
21060 // on the stack and close enough to the spill slot, this can lead to a
21061 // situation where the monitor always gets cleared and the atomic operation
21062 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21063 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
21064 bool HasAtomicCmpXchg;
21065 if (Subtarget->isMClass())
21066 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21067 else if (Subtarget->isThumb())
21068 HasAtomicCmpXchg = Subtarget->hasV7Ops();
21069 else
21070 HasAtomicCmpXchg = Subtarget->hasV6Ops();
21071 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21072 HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21073 return AtomicExpansionKind::LLSC;
21074 return AtomicExpansionKind::None;
21075 }
21076
21077 bool ARMTargetLowering::shouldInsertFencesForAtomic(
21078 const Instruction *I) const {
21079 return InsertFencesForAtomic;
21080}
21081
21082 bool ARMTargetLowering::useLoadStackGuardNode(const Module &M) const {
21083 // ROPI/RWPI are not supported currently.
21084 return !Subtarget->isROPI() && !Subtarget->isRWPI();
21085}
21086
21087 void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
21088 // MSVC CRT provides functionalities for stack protection.
21089 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
21090 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21091
21092 RTLIB::LibcallImpl SecurityCookieVar =
21093 getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
21094 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
21095 SecurityCookieVar != RTLIB::Unsupported) {
21096 // MSVC CRT has a global variable holding security cookie.
21097 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
21098 PointerType::getUnqual(M.getContext()));
21099
21100 // MSVC CRT has a function to validate security cookie.
21101 FunctionCallee SecurityCheckCookie =
21102 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
21103 Type::getVoidTy(M.getContext()),
21104 PointerType::getUnqual(M.getContext()));
21105 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21106 F->addParamAttr(0, Attribute::AttrKind::InReg);
21107 }
21108
21109 TargetLowering::insertSSPDeclarations(M);
21110 }
21111
21112 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
21113 unsigned &Cost) const {
21114 // If we do not have NEON, vector types are not natively supported.
21115 if (!Subtarget->hasNEON())
21116 return false;
21117
21118 // Floating point values and vector values map to the same register file.
21119 // Therefore, although we could do a store extract of a vector type, it is
21120 // better to leave it as a float, since we have more freedom in the
21121 // addressing mode for those.
21122 if (VectorTy->isFPOrFPVectorTy())
21123 return false;
21124
21125 // If the index is unknown at compile time, this is very expensive to lower
21126 // and it is not possible to combine the store with the extract.
21127 if (!isa<ConstantInt>(Idx))
21128 return false;
21129
21130 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21131 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21132 // We can do a store + vector extract on any vector that fits perfectly in a D
21133 // or Q register.
21134 if (BitWidth == 64 || BitWidth == 128) {
21135 Cost = 0;
21136 return true;
21137 }
21138 return false;
21139}
21140
21141 bool ARMTargetLowering::canCreateUndefOrPoisonForTargetNode(
21142 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
21143 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
21144 unsigned Opcode = Op.getOpcode();
21145 switch (Opcode) {
21146 case ARMISD::VORRIMM:
21147 case ARMISD::VBICIMM:
21148 return false;
21149 }
21150 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
21151 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
21152}
21153
21155 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21156}
21157
21159 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21160}
21161
21162 bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial(
21163 const Instruction &AndI) const {
21164 if (!Subtarget->hasV7Ops())
21165 return false;
21166
21167 // Sink the `and` instruction only if the mask would fit into a modified
21168 // immediate operand.
21169 auto *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
21170 if (!Mask || Mask->getValue().getBitWidth() > 32u)
21171 return false;
21172 auto MaskVal = unsigned(Mask->getValue().getZExtValue());
21173 return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21174 : ARM_AM::getSOImmVal(MaskVal)) != -1;
21175}
21176
21177 TargetLowering::ShiftLegalizationStrategy
21178 ARMTargetLowering::preferredShiftLegalizationStrategy(
21179 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21180 if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
21181 return ShiftLegalizationStrategy::LowerToLibcall;
21182 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
21183 ExpansionFactor);
21184}
21185
21186 Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
21187 Value *Addr,
21188 AtomicOrdering Ord) const {
21189 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21190 bool IsAcquire = isAcquireOrStronger(Ord);
21191
21192 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21193 // intrinsic must return {i32, i32} and we have to recombine them into a
21194 // single i64 here.
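// The generated IR looks roughly like:
//   %lohi = call { i32, i32 } @llvm.arm.ldrexd(ptr %addr)
//   %lo = extractvalue { i32, i32 } %lohi, 0
//   %hi = extractvalue { i32, i32 } %lohi, 1
//   %val64 = or i64 (zext %lo), (shl (zext %hi), 32)   ; big-endian swaps lo/hi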
21195 if (ValueTy->getPrimitiveSizeInBits() == 64) {
21196 Intrinsic::ID Int =
21197 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21198
21199 Value *LoHi =
21200 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
21201
21202 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21203 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
21204 if (!Subtarget->isLittle())
21205 std::swap (Lo, Hi);
21206 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21207 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21208 return Builder.CreateOr(
21209 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21210 }
21211
21212 Type *Tys[] = { Addr->getType() };
21213 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21214 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
21215
21216 CI->addParamAttr(
21217 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21218 return Builder.CreateTruncOrBitCast(CI, ValueTy);
21219}
21220
21221 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
21222 IRBuilderBase &Builder) const {
21223 if (!Subtarget->hasV7Ops())
21224 return;
21225 Builder.CreateIntrinsic(Intrinsic::arm_clrex, {});
21226}
21227
21228 Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
21229 Value *Val, Value *Addr,
21230 AtomicOrdering Ord) const {
21231 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21232 bool IsRelease = isReleaseOrStronger(Ord);
21233
21234 // Since the intrinsics must have legal type, the i64 intrinsics take two
21235 // parameters: "i32, i32". We must marshal Val into the appropriate form
21236 // before the call.
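// Roughly: %lo = trunc i64 %val to i32, %hi = trunc (lshr %val, 32) to i32,
// then call @llvm.arm.strexd(%lo, %hi, ptr %addr) (operands swapped for BE).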
21237 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
21238 Intrinsic::ID Int =
21239 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21240 Type *Int32Ty = Type::getInt32Ty(M->getContext());
21241
21242 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21243 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
21244 if (!Subtarget->isLittle())
21245 std::swap(Lo, Hi);
21246 return Builder.CreateIntrinsic(Int, {Lo, Hi, Addr});
21247 }
21248
21249 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21250 Type *Tys[] = { Addr->getType() };
21251 Function *Strex = Intrinsic::getOrInsertDeclaration(M, Int, Tys);
21252
21253 CallInst *CI = Builder.CreateCall(
21254 Strex, {Builder.CreateZExtOrBitCast(
21255 Val, Strex->getFunctionType()->getParamType(0)),
21256 Addr});
21257 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21258 Val->getType()));
21259 return CI;
21260}
21261
21262
21263 bool ARMTargetLowering::alignLoopsWithOptSize() const {
21264 return Subtarget->isMClass();
21265}
21266
21267/// A helper function for determining the number of interleaved accesses we
21268/// will generate when lowering accesses of the given type.
21269unsigned
21270 ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
21271 const DataLayout &DL) const {
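// e.g. a 512-bit vector such as <16 x i32> is counted as
// (512 + 127) / 128 = 4 interleaved accesses.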
21272 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21273}
21274
21275 bool ARMTargetLowering::isLegalInterleavedAccessType(
21276 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21277 const DataLayout &DL) const {
21278
21279 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21280 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21281
21282 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21283 return false;
21284
21285 // Ensure the vector doesn't have f16 elements. Even though we could do an
21286 // i16 vldN, we can't hold the f16 vectors and will end up converting via
21287 // f32.
21288 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21289 return false;
21290 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21291 return false;
21292
21293 // Ensure the number of vector elements is greater than 1.
21294 if (VecTy->getNumElements() < 2)
21295 return false;
21296
21297 // Ensure the element type is legal.
21298 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21299 return false;
21300 // And that the alignment is high enough under MVE.
21301 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21302 return false;
21303
21304 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21305 // 128 will be split into multiple interleaved accesses.
21306 if (Subtarget->hasNEON() && VecSize == 64)
21307 return true;
21308 return VecSize % 128 == 0;
21309}
21310
21311 unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
21312 if (Subtarget->hasNEON())
21313 return 4;
21314 if (Subtarget->hasMVEIntegerOps())
21315 return MVEMaxSupportedInterleaveFactor;
21316 return TargetLoweringBase::getMaxSupportedInterleaveFactor();
21317 }
21318
21319/// Lower an interleaved load into a vldN intrinsic.
21320///
21321/// E.g. Lower an interleaved load (Factor = 2):
21322/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21323/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21324/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21325///
21326/// Into:
21327/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21328/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
21329/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
21330 bool ARMTargetLowering::lowerInterleavedLoad(
21331 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
21332 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
21333 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21334 "Invalid interleave factor");
21335 assert(!Shuffles.empty() && "Empty shufflevector input");
21336 assert(Shuffles.size() == Indices.size() &&
21337 "Unmatched number of shufflevectors and indices");
21338
21339 auto *LI = dyn_cast<LoadInst>(Load);
21340 if (!LI)
21341 return false;
21342 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
21343
21344 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21345 Type *EltTy = VecTy->getElementType();
21346
21347 const DataLayout &DL = LI->getDataLayout();
21348 Align Alignment = LI->getAlign();
21349
21350 // Skip if we do not have NEON and skip illegal vector types. We can
21351 // "legalize" wide vector types into multiple interleaved accesses as long as
21352 // the vector types are divisible by 128.
21353 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21354 return false;
21355
21356 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21357
21358 // A pointer vector can not be the return type of the ldN intrinsics. Need to
21359 // load integer vectors first and then convert to pointer vectors.
21360 if (EltTy->isPointerTy())
21361 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21362
21363 IRBuilder<> Builder(LI);
21364
21365 // The base address of the load.
21366 Value *BaseAddr = LI->getPointerOperand();
21367
21368 if (NumLoads > 1) {
21369 // If we're going to generate more than one load, reset the sub-vector type
21370 // to something legal.
21371 VecTy = FixedVectorType::get(VecTy->getElementType(),
21372 VecTy->getNumElements() / NumLoads);
21373 }
21374
21375 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21376
21377 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21378 if (Subtarget->hasNEON()) {
21379 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21380 Type *Tys[] = {VecTy, PtrTy};
21381 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21382 Intrinsic::arm_neon_vld3,
21383 Intrinsic::arm_neon_vld4};
21384
21385 SmallVector<Value *, 2> Ops;
21386 Ops.push_back(BaseAddr);
21387 Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21388
21389 return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops,
21390 /*FMFSource=*/nullptr, "vldN");
21391 } else {
21392 assert((Factor == 2 || Factor == 4) &&
21393 "expected interleave factor of 2 or 4 for MVE");
21394 Intrinsic::ID LoadInts =
21395 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21396 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21397 Type *Tys[] = {VecTy, PtrTy};
21398
21399 SmallVector<Value *, 2> Ops;
21400 Ops.push_back(BaseAddr);
21401 return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr,
21402 "vldN");
21403 }
21404 };
21405
21406 // Holds sub-vectors extracted from the load intrinsic return values. The
21407 // sub-vectors are associated with the shufflevector instructions they will
21408 // replace.
21409 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
21410
21411 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21412 // If we're generating more than one load, compute the base address of
21413 // subsequent loads as an offset from the previous.
21414 if (LoadCount > 0)
21415 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21416 VecTy->getNumElements() * Factor);
21417
21418 CallInst *VldN = createLoadIntrinsic(BaseAddr);
21419
21420 // Replace uses of each shufflevector with the corresponding vector loaded
21421 // by ldN.
21422 for (unsigned i = 0; i < Shuffles.size(); i++) {
21423 ShuffleVectorInst *SV = Shuffles[i];
21424 unsigned Index = Indices[i];
21425
21426 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21427
21428 // Convert the integer vector to pointer vector if the element is pointer.
21429 if (EltTy->isPointerTy())
21430 SubVec = Builder.CreateIntToPtr(
21431 SubVec,
21432 FixedVectorType::get(SV->getType()->getElementType(), VecTy));
21433
21434 SubVecs[SV].push_back(SubVec);
21435 }
21436 }
21437
21438 // Replace uses of the shufflevector instructions with the sub-vectors
21439 // returned by the load intrinsic. If a shufflevector instruction is
21440 // associated with more than one sub-vector, those sub-vectors will be
21441 // concatenated into a single wide vector.
21442 for (ShuffleVectorInst *SVI : Shuffles) {
21443 auto &SubVec = SubVecs[SVI];
21444 auto *WideVec =
21445 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21446 SVI->replaceAllUsesWith(WideVec);
21447 }
21448
21449 return true;
21450}
21451
21452/// Lower an interleaved store into a vstN intrinsic.
21453///
21454/// E.g. Lower an interleaved store (Factor = 3):
21455/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21456/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21457/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21458///
21459/// Into:
21460/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21461/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21462/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21463/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21464///
21465/// Note that the new shufflevectors will be removed and we'll only generate one
21466/// vst3 instruction in CodeGen.
21467///
21468/// Example for a more general valid mask (Factor 3). Lower:
21469/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21470/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21471/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21472///
21473/// Into:
21474/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21475/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21476/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21477/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21478 bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store,
21479 Value *LaneMask,
21480 ShuffleVectorInst *SVI,
21481 unsigned Factor,
21482 const APInt &GapMask) const {
21483 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21484 "Invalid interleave factor");
21485 auto *SI = dyn_cast<StoreInst>(Store);
21486 if (!SI)
21487 return false;
21488 assert(!LaneMask && GapMask.popcount() == Factor &&
21489 "Unexpected mask on store");
21490
21491 auto *VecTy = cast<FixedVectorType>(SVI->getType());
21492 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21493
21494 unsigned LaneLen = VecTy->getNumElements() / Factor;
21495 Type *EltTy = VecTy->getElementType();
21496 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
21497
21498 const DataLayout &DL = SI->getDataLayout();
21499 Align Alignment = SI->getAlign();
21500
21501 // Skip if we do not have NEON and skip illegal vector types. We can
21502 // "legalize" wide vector types into multiple interleaved accesses as long as
21503 // the vector types are divisible by 128.
21504 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
21505 return false;
21506
21507 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
21508
21509 Value *Op0 = SVI->getOperand(0);
21510 Value *Op1 = SVI->getOperand(1);
21511 IRBuilder<> Builder(SI);
21512
21513 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21514 // vectors to integer vectors.
21515 if (EltTy->isPointerTy()) {
21516 Type *IntTy = DL.getIntPtrType(EltTy);
21517
21518 // Convert to the corresponding integer vector.
21519 auto *IntVecTy =
21520 FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
21521 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
21522 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
21523
21524 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
21525 }
21526
21527 // The base address of the store.
21528 Value *BaseAddr = SI->getPointerOperand();
21529
21530 if (NumStores > 1) {
21531 // If we're going to generate more than one store, reset the lane length
21532 // and sub-vector type to something legal.
21533 LaneLen /= NumStores;
21534 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
21535 }
21536
21537 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
21538
21539 auto Mask = SVI->getShuffleMask();
21540
21541 auto createStoreIntrinsic = [&](Value *BaseAddr,
21542 SmallVectorImpl<Value *> &Shuffles) {
21543 if (Subtarget->hasNEON()) {
21544 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
21545 Intrinsic::arm_neon_vst3,
21546 Intrinsic::arm_neon_vst4};
21547 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21548 Type *Tys[] = {PtrTy, SubVecTy};
21549
21550 SmallVector<Value *, 6> Ops;
21551 Ops.push_back(BaseAddr);
21552 append_range(Ops, Shuffles);
21553 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
21554 Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops);
21555 } else {
21556 assert((Factor == 2 || Factor == 4) &&
21557 "expected interleave factor of 2 or 4 for MVE");
21558 Intrinsic::ID StoreInts =
21559 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
21560 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21561 Type *Tys[] = {PtrTy, SubVecTy};
21562
21563 SmallVector<Value *, 6> Ops;
21564 Ops.push_back(BaseAddr);
21565 append_range(Ops, Shuffles);
21566 for (unsigned F = 0; F < Factor; F++) {
21567 Ops.push_back(Builder.getInt32(F));
21568 Builder.CreateIntrinsic(StoreInts, Tys, Ops);
21569 Ops.pop_back();
21570 }
21571 }
21572 };
21573
21574 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
21575 // If we're generating more than one store, compute the base address of
21576 // subsequent stores as an offset from the previous one.
21577 if (StoreCount > 0)
21578 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
21579 BaseAddr, LaneLen * Factor);
21580
21581 SmallVector<Value *, 4> Shuffles;
21582
21583 // Split the shufflevector operands into sub vectors for the new vstN call.
21584 for (unsigned i = 0; i < Factor; i++) {
21585 unsigned IdxI = StoreCount * LaneLen * Factor + i;
21586 if (Mask[IdxI] >= 0) {
21587 Shuffles.push_back(Builder.CreateShuffleVector(
21588 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
21589 } else {
21590 unsigned StartMask = 0;
21591 for (unsigned j = 1; j < LaneLen; j++) {
21592 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
21593 if (Mask[IdxJ * Factor + IdxI] >= 0) {
21594 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
21595 break;
21596 }
21597 }
21598 // Note: If all elements in a chunk are undefs, StartMask=0!
21599 // Note: Filling undef gaps with random elements is ok, since
21600 // those elements were being written anyway (with undefs).
21601 // In the case of all undefs we're defaulting to using elems from 0
21602 // Note: StartMask cannot be negative, it's checked in
21603 // isReInterleaveMask
21604 Shuffles.push_back(Builder.CreateShuffleVector(
21605 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
21606 }
21607 }
21608
21609 createStoreIntrinsic(BaseAddr, Shuffles);
21610 }
21611 return true;
21612}
21613
21614 enum HABaseType {
21615 HA_UNKNOWN = 0,
21616 HA_FLOAT,
21617 HA_DOUBLE,
21618 HA_VECT64,
21619 HA_VECT128
21620 };
21621
21622 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
21623 uint64_t &Members) {
21624 if (auto *ST = dyn_cast<StructType>(Ty)) {
21625 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
21626 uint64_t SubMembers = 0;
21627 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
21628 return false;
21629 Members += SubMembers;
21630 }
21631 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
21632 uint64_t SubMembers = 0;
21633 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
21634 return false;
21635 Members += SubMembers * AT->getNumElements();
21636 } else if (Ty->isFloatTy()) {
21637 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
21638 return false;
21639 Members = 1;
21640 Base = HA_FLOAT;
21641 } else if (Ty->isDoubleTy()) {
21642 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
21643 return false;
21644 Members = 1;
21645 Base = HA_DOUBLE;
21646 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
21647 Members = 1;
21648 switch (Base) {
21649 case HA_FLOAT:
21650 case HA_DOUBLE:
21651 return false;
21652 case HA_VECT64:
21653 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
21654 case HA_VECT128:
21655 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
21656 case HA_UNKNOWN:
21657 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
21658 case 64:
21659 Base = HA_VECT64;
21660 return true;
21661 case 128:
21662 Base = HA_VECT128;
21663 return true;
21664 default:
21665 return false;
21666 }
21667 }
21668 }
21669
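// For example, struct { float x, y, z; } is a homogeneous aggregate with
// Base == HA_FLOAT and Members == 3, while struct { float; double; } is
// rejected because the base types differ.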
21670 return (Members > 0 && Members <= 4);
21671}
21672
21673/// Return the correct alignment for the current calling convention.
21674 Align ARMTargetLowering::getABIAlignmentForCallingConv(
21675 Type *ArgTy, const DataLayout &DL) const {
21676 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
21677 if (!ArgTy->isVectorTy())
21678 return ABITypeAlign;
21679
21680 // Avoid over-aligning vector parameters. It would require realigning the
21681 // stack and waste space for no real benefit.
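// e.g. under the usual AAPCS data layout (8-byte stack alignment), a
// 16-byte-aligned <4 x i32> argument is passed with only 8-byte alignment.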
21682 MaybeAlign StackAlign = DL.getStackAlignment();
21683 assert(StackAlign && "data layout string is missing stack alignment");
21684 return std::min(ABITypeAlign, *StackAlign);
21685}
21686
21687/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
21688/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
21689/// passing according to AAPCS rules.
21690 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
21691 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
21692 const DataLayout &DL) const {
21693 if (getEffectiveCallingConv(CallConv, isVarArg) !=
21694 CallingConv::ARM_AAPCS_VFP)
21695 return false;
21696
21697 HABaseType Base = HA_UNKNOWN;
21698 uint64_t Members = 0;
21699 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
21700 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
21701
21702 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
21703 return IsHA || IsIntArray;
21704}
21705
21706 Register ARMTargetLowering::getExceptionPointerRegister(
21707 const Constant *PersonalityFn) const {
21708 // Platforms which do not use SjLj EH may return values in these registers
21709 // via the personality function.
21711 return EM == ExceptionHandling::SjLj ? Register() : ARM::R0;
21712}
21713
21714 Register ARMTargetLowering::getExceptionSelectorRegister(
21715 const Constant *PersonalityFn) const {
21716 // Platforms which do not use SjLj EH may return values in these registers
21717 // via the personality function.
21719 return EM == ExceptionHandling::SjLj ? Register() : ARM::R1;
21720}
21721
21722void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
21723 // Update IsSplitCSR in ARMFunctionInfo.
21724 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
21725 AFI->setIsSplitCSR(true);
21726}
21727
21728void ARMTargetLowering::insertCopiesSplitCSR(
21729 MachineBasicBlock *Entry,
21730 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
21731 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
21732 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
21733 if (!IStart)
21734 return;
21735
21736 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21737 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
21738 MachineBasicBlock::iterator MBBI = Entry->begin();
21739 for (const MCPhysReg *I = IStart; *I; ++I) {
21740 const TargetRegisterClass *RC = nullptr;
21741 if (ARM::GPRRegClass.contains(*I))
21742 RC = &ARM::GPRRegClass;
21743 else if (ARM::DPRRegClass.contains(*I))
21744 RC = &ARM::DPRRegClass;
21745 else
21746 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
21747
21748 Register NewVR = MRI->createVirtualRegister(RC);
21749 // Create copy from CSR to a virtual register.
21750 // FIXME: this currently does not emit CFI pseudo-instructions, it works
21751 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
21752 // nounwind. If we want to generalize this later, we may need to emit
21753 // CFI pseudo-instructions.
21754 assert(Entry->getParent()->getFunction().hasFnAttribute(
21755 Attribute::NoUnwind) &&
21756 "Function should be nounwind in insertCopiesSplitCSR!");
21757 Entry->addLiveIn(*I);
21758 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
21759 .addReg(*I);
21760
21761 // Insert the copy-back instructions right before the terminator.
21762 for (auto *Exit : Exits)
21763 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
21764 TII->get(TargetOpcode::COPY), *I)
21765 .addReg(NewVR);
21766 }
21767}
21768
21773
21774 bool ARMTargetLowering::isComplexDeinterleavingSupported() const {
21775 return Subtarget->hasMVEIntegerOps();
21776}
21777
21778 bool ARMTargetLowering::isComplexDeinterleavingOperationSupported(
21779 ComplexDeinterleavingOperation Operation, Type *Ty) const {
21780 auto *VTy = dyn_cast<FixedVectorType>(Ty);
21781 if (!VTy)
21782 return false;
21783
21784 auto *ScalarTy = VTy->getScalarType();
21785 unsigned NumElements = VTy->getNumElements();
21786
21787 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
21788 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
21789 return false;
21790
21791 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
21792 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
21793 return Subtarget->hasMVEFloatOps();
21794
21795 if (Operation != ComplexDeinterleavingOperation::CAdd)
21796 return false;
21797
21798 return Subtarget->hasMVEIntegerOps() &&
21799 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
21800 ScalarTy->isIntegerTy(32));
21801}
21802
21803 ArrayRef<MCPhysReg> ARMTargetLowering::getRoundingControlRegisters() const {
21804 static const MCPhysReg RCRegs[] = {ARM::FPSCR_RM};
21805 return RCRegs;
21806}
21807
21808 Value *ARMTargetLowering::createComplexDeinterleavingIR(
21809 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
21810 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
21811 Value *Accumulator) const {
21812
21814
21813 auto *Ty = cast<FixedVectorType>(InputA->getType());
21815 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
21816
21817 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
21818
21819 if (TyWidth > 128) {
21820 int Stride = Ty->getNumElements() / 2;
21821 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
21822 auto SplitSeqVec = llvm::to_vector(SplitSeq);
21823 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
21824 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
21825
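// Vectors wider than 128 bits are split in half, each half is lowered
// recursively, and the results are re-interleaved by the final shuffle.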
21826 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
21827 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
21828 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
21829 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
21830 Value *LowerSplitAcc = nullptr;
21831 Value *UpperSplitAcc = nullptr;
21832
21833 if (Accumulator) {
21834 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
21835 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
21836 }
21837
21838 auto *LowerSplitInt = createComplexDeinterleavingIR(
21839 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
21840 auto *UpperSplitInt = createComplexDeinterleavingIR(
21841 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
21842
21843 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
21844 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
21845 }
21846
21847 auto *IntTy = Type::getInt32Ty(B.getContext());
21848
21849 ConstantInt *ConstRotation = nullptr;
21850 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
21851 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
21852
21853 if (Accumulator)
21854 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
21855 {ConstRotation, Accumulator, InputB, InputA});
21856 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
21857 {ConstRotation, InputB, InputA});
21858 }
21859
21860 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
21861 // 1 means the value is not halved.
21862 auto *ConstHalving = ConstantInt::get(IntTy, 1);
21863
21864 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
21865 ConstRotation = ConstantInt::get(IntTy, 0);
21866 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
21867 ConstRotation = ConstantInt::get(IntTy, 1);
21868
21869 if (!ConstRotation)
21870 return nullptr; // Invalid rotation for arm_mve_vcaddq
21871
21872 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
21873 {ConstHalving, ConstRotation, InputA, InputB});
21874 }
21875
21876 return nullptr;
21877}
PerformVDUPLANECombine - Target-specific dag combine xforms for ARMISD::VDUPLANE.
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static cl::opt< unsigned > ConstpoolPromotionMaxTotal("arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128))
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static RTLIB::Libcall getDivRemLibcall(const SDNode *N, MVT::SimpleValueType SVT)
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG)
SkipLoadExtensionForVMULL - return a load of the original vector size that does not do any sign/zero ...
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, SelectionDAG &DAG)
static bool isVZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformORCombineToSMULWBT(SDNode *OR, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isVTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of "vector_shuffle v,...
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue FindBFIToCombineWith(SDNode *N)
static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, SelectionDAG &DAG)
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps)
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isS16(const SDValue &Op, SelectionDAG &DAG)
static bool isSRA16(const SDValue &Op)
static SDValue AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue LowerInterruptReturn(SmallVectorImpl< SDValue > &RetOps, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG)
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2)
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isSHL16(const SDValue &Op)
static bool isVEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseVEXT, unsigned &Imm)
static SDValue PerformMVEVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
cl::opt< unsigned > ArmMaxBaseUpdatesToCheck("arm-max-base-updates-to-check", cl::Hidden, cl::desc("Maximum number of base-updates to check generating postindex."), cl::init(64))
static bool isTruncMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2)
Return the load opcode for a given load size.
static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
static bool isLegalMVEShuffleOp(unsigned PFEntry)
static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, SelectionDAG &DAG)
static bool isVUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG)
PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for ISD::VECTOR_SHUFFLE.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG)
SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, ANY_EXTEND,...
static bool isVMOVNTruncMask(ArrayRef< int > M, EVT ToVT, bool rev)
static SDValue PerformVQMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformAddcSubcCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static TargetLowering::ArgListTy getDivRemArgList(const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget)
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static ARMCC::CondCodes getVCMPCondCode(SDValue N)
static cl::opt< bool > ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true))
static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformORCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVSetCCToVCTPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isZeroVector(SDValue N)
static SDValue PerformAddeSubeCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K)
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG)
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment store operation with given size.
static bool isVMOVNMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, NEON load/store intrinsics,...
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMOVRRDCombine - Target-specific dag combine xforms for ARMISD::VMOVRRD.
static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain)
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMULCombine Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the special multi...
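The point of distributing the multiply is to expose multiply-accumulate patterns to instruction selection. A hypothetical sketch of the node rewrite described above (the helper name and surrounding combine plumbing are assumptions; only the DAG construction mirrors the idea):
static SDValue distributeMulOverAdd(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                                    SDValue A, SDValue B, SDValue C) {
  // (A + B) * C  ->  (A * C) + (B * C), which selection can later fold
  // into multiply-accumulate instructions.
  SDValue AC = DAG.getNode(ISD::MUL, DL, VT, A, C);
  SDValue BC = DAG.getNode(ISD::MUL, DL, VT, B, C);
  return DAG.getNode(ISD::ADD, DL, VT, AC, BC);
}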
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformORCombine - Target-specific dag combine xforms for ISD::OR.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG)
static unsigned SelectPairHalf(unsigned Elements, ArrayRef< int > Mask, unsigned Index)
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment load operation with given size.
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG)
static bool isValidBaseUpdate(SDNode *N, SDNode *User)
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl)
static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op)
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
std::pair< unsigned, const TargetRegisterClass * > RCPair
static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes=false)
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,...
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type)
isVMOVModifiedImm - Check if the specified splat value corresponds to a valid vector constant for a N...
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG)
BC is a bitcast that is about to be turned into a VMOVDRR.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl)
static unsigned isNEONTwoResultShuffleMask(ArrayRef< int > ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF)
Check if ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), and return the corresponding AR...
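For concreteness, on a 4-element vector the two results of each NEON two-result operation correspond to the shuffle masks below; these are illustrative constants based on the NEON semantics, not code taken from this file. WhichResult selects between the pair.
static const int VTRNMasks[2][4] = {{0, 4, 2, 6}, {1, 5, 3, 7}}; // vtrn results
static const int VZIPMasks[2][4] = {{0, 4, 1, 5}, {2, 6, 3, 7}}; // vzip results
static const int VUZPMasks[2][4] = {{0, 2, 4, 6}, {1, 3, 5, 7}}; // vuzp results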
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B)
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, struct BaseUpdateUser &User, bool SimpleConstIncOnly, TargetLowering::DAGCombinerInfo &DCI)
static bool allUsersAreInFunction(const Value *V, const Function *F)
Return true if all users of V are within function F, looking through ConstantExprs.
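A minimal sketch of this check, assuming the usual Value/Instruction APIs; the real helper additionally walks through ConstantExpr users rather than rejecting them outright, and the name here is hypothetical:
static bool usersAllInFunction(const Value *V, const Function *F) {
  for (const User *U : V->users()) {
    if (const auto *I = dyn_cast<Instruction>(U)) {
      if (I->getFunction() != F)
        return false;
    } else {
      // Conservatively reject non-instruction users in this sketch.
      return false;
    }
  }
  return true;
}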
static bool isSingletonVEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG)
PerformVMOVDRRCombine - Target-specific dag combine xforms for ARMISD::VMOVDRR.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK)
static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
isLegalAddressImmediate - Return true if the integer value can be used as the offset of the target ad...
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isLegalT1AddressImmediate(int64_t V, EVT VT)
static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDECombine - Target-specific dag combine transform from ARMISD::ADDC, ARMISD::ADDE,...
static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
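A sketch of the +0.0 test, assuming only the plain ConstantFP case (the real helper also looks through loads of constant-pool entries); the name is hypothetical:
static bool isPlusZero(SDValue Op) {
  // Match an FP constant node and require positive zero specifically,
  // since -0.0 is not interchangeable under IEEE semantics.
  if (const ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
    return CFP->getValueAPF().isPosZero();
  return false;
}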
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
This file defines a TargetTransformInfoImplBase conforming object specific to the ARM target machine.
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseMap class.
Module.h This file contains the declarations for the Module class.
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:114
This file describes how to lower LLVM code to machine code.
static constexpr roundingMode rmTowardZero
Definition APFloat.h:348
LLVM_ABI bool getExactInverse(APFloat *Inv) const
If this value is normal and has an exact, normal, multiplicative inverse, store it in inv and return ...
Definition APFloat.cpp:5995
APInt bitcastToAPInt() const
Definition APFloat.h:1335
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition APFloat.h:1314
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:424
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1671
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1331
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition APInt.h:1202
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1112
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1640
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1599
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
unsigned logBase2() const
Definition APInt.h:1762
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:476
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1258
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
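Usage example (illustrative, not taken from this file): building common bit masks with the APInt constructors documented above.
APInt LowByte   = APInt::getLowBitsSet(32, 8);   // 0x000000FF
APInt TopNibble = APInt::getHighBitsSet(32, 4);  // 0xF0000000
APInt Bit5      = APInt::getOneBitSet(32, 5);    // 0x00000020
APInt Ones16    = APInt::getAllOnes(16);         // 0xFFFF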
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:859
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:852
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1657
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
const ARMBaseRegisterInfo & getRegisterInfo() const
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
const Triple & getTargetTriple() const
const ARMBaseInstrInfo * getInstrInfo() const override
bool isThumb1Only() const
bool useFPVFMx() const
bool isThumb2() const
bool isTargetWindows() const
bool hasBaseDSP() const
const ARMTargetLowering * getTargetLowering() const override
const ARMBaseRegisterInfo * getRegisterInfo() const override
bool hasVFP2Base() const
bool useFPVFMx64() const
bool isLittle() const
bool useFPVFMx16() const
bool isMClass() const
bool useMulOps() const
Align getDualLoadStoreAlignment() const
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isReadOnly(const GlobalValue *GV) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode representing by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
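For the ARM (non-Thumb) case, the question is essentially whether the immediate fits the "shifter operand" encoding: an 8-bit value rotated right by an even amount. A hypothetical check along those lines, assuming the ARM_AM::getSOImmVal helper from ARMAddressingModes.h (the Thumb variants and negated-immediate handling are omitted):
static bool fitsSOImm(int64_t Imm) {
  // getSOImmVal returns -1 when the value cannot be encoded as an
  // 8-bit immediate rotated right by an even amount.
  return ARM_AM::getSOImmVal((uint32_t)Imm) != -1;
}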
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: 'sub y, (xor x, -1)' and 'add (add x, 1), y'. The variant with two add's is IR...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a vstN intrinsic.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a vldN intrinsic.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool supportKCFIBundles() const override
Return true if the target supports kcfi operand bundles.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy,Idx).
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
const ARMBaseTargetMachine & getTM() const
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
bool isFloatingPointOperation() const
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
static LLVM_ABI BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
The address of a basic block.
Definition Constants.h:904
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void addInRegsParamInfo(unsigned RegBegin, unsigned RegEnd)
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
unsigned getValNo() const
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to callin...
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:720
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:282
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:207
bool isBigEndian() const
Definition DataLayout.h:208
MaybeAlign getStackAlignment() const
Returns the natural stack alignment, or MaybeAlign() if one wasn't specified.
Definition DataLayout.h:237
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
StringRef getPrivateGlobalPrefix() const
Definition DataLayout.h:295
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:123
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
arg_iterator arg_begin()
Definition Function.h:866
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition Function.h:687
const Argument * const_arg_iterator
Definition Function.h:73
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition Function.h:227
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:730
const GlobalValue * getGlobal() const
bool isDSOLocal() const
bool hasExternalWeakLinkage() const
bool hasDLLImportStorageClass() const
Module * getParent()
Get the module that this global value is contained inside of...
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
LLVM_ABI bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
bool is64BitVector() const
Return true if this is a 64-bit vector type.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI void moveAfter(MachineBasicBlock *NewBefore)
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
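These MachineBasicBlock hooks are what custom-inserter code paths lean on. A hedged sketch of the usual block-splitting shape (MI is the instruction being expanded; names are illustrative):
  MachineBasicBlock *BB = MI.getParent();
  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(BB->getBasicBlock());
  MF->insert(std::next(BB->getIterator()), ContBB);
  // Move everything after MI into the continuation block and fix up the CFG.
  ContBB->splice(ContBB->begin(), BB,
                 std::next(MachineBasicBlock::iterator(MI)), BB->end());
  ContBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(ContBB);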
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
Properties which a MachineFunction may have at a given point in time.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
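A brief sketch of the per-function state these MachineFunction accessors expose during lowering (the ARMFunctionInfo use and the fixed-object parameters are illustrative, assuming a DAG in scope):
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();  // ARM-specific bookkeeping
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setReturnAddressIsTaken(true);
  // Pin an incoming value to a fixed slot relative to the incoming SP.
  int FI = MFI.CreateFixedObject(/*Size=*/4, /*SPOffset=*/0, /*IsImmutable=*/false);
  (void)AFI; (void)FI;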
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
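The MachineInstrBuilder methods above are almost always used through the chained BuildMI(...) pattern; a minimal sketch for an ARM move-immediate (the opcode, insertion point and operand values are illustrative; condCodeOp comes from ARMBaseInstrInfo.h):
  // MOVi DestReg, #Imm, with the usual ARM predicate and optional-def operands.
  BuildMI(MBB, InsertPt, DL, TII->get(ARM::MOVi), DestReg)
      .addImm(Imm)
      .add(predOps(ARMCC::AL))   // predicate: always
      .add(condCodeOp());        // no CPSR def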
Representation of each machine instruction.
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
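Sketch of how a MachineMemOperand is typically built with these flags for a fixed stack slot, using the overload documented above (the frame index FI, memory type and alignment are assumptions for illustration):
  MachineFunction &MF = DAG.getMachineFunction();
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo::getFixedStack(MF, FI),
      MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
      LLT::scalar(32), Align(4));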
MachineOperand class - Representation of each machine instruction operand.
LLVM_ABI void setIsRenamable(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
LLVM_ABI void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by this operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags, bool AllowCommute=false)
Get the specified node if it's already available, or else return NULL.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
DenormalMode getDenormalMode(EVT VT) const
Return the current function's default denormal handling kind for the given floating point type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
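Most of the SelectionDAG entry points above reduce to the same building pattern: materialize constants with getConstant and combine values with getNode. A small hedged sketch (the opcode and helper name are illustrative):
  static SDValue buildMaskedValue(SelectionDAG &DAG, const SDLoc &DL, SDValue X,
                                  uint64_t MaskVal) {
    EVT VT = X.getValueType();
    SDValue Mask = DAG.getConstant(MaskVal, DL, VT);
    // getNode CSEs into the existing graph if an identical node already exists.
    return DAG.getNode(ISD::AND, DL, VT, X, Mask);
  }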
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
bool empty() const
Definition SmallSet.h:168
bool erase(const T &V)
Definition SmallSet.h:199
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
const unsigned char * bytes_end() const
Definition StringRef.h:127
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
const unsigned char * bytes_begin() const
Definition StringRef.h:124
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:413
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
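These TargetLoweringBase hooks are what a target's TargetLowering constructor calls to describe its register classes and per-opcode legality. A condensed sketch in that style (the specific choices are illustrative, not a transcription of the ARM constructor; Subtarget is assumed):
  // Inside a hypothetical target lowering constructor:
  addRegisterClass(MVT::i32, &ARM::GPRRegClass);
  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);   // expanded to a sequence/libcall
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); // handled in LowerOperation
  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i16, Legal);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setSchedulingPreference(Sched::Hybrid);
  computeRegisterProperties(Subtarget->getRegisterInfo());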
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
bool isConstTrueVal(SDValue N) const
Return true if N is a constant or constant vector equal to the true value from getBooleanContents().
virtual ArrayRef< MCPhysReg > getRoundingControlRegisters() const
Returns a 0-terminated array of rounding control registers that can be attached to a strict FP call.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::LibcallImpl LibcallImpl, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
ExceptionHandling getExceptionModel() const
Return the ExceptionHandling to use, considering TargetOptions and the Triple's default.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition Triple.h:439
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:295
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
Base class of all SIMD vector types.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition ARMBaseInfo.h:48
@ SECREL
Section Relative (Windows TLS).
@ SBREL
Static Base Relative.
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into a shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting an 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is an FP16Imm encoded as an fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
const unsigned FPStatusBits
const unsigned FPReservedBits
const unsigned RoundingBitsPos
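A tiny sketch of how the immediate-encoding helpers above (getSOImmVal / getT2SOImmVal) are consulted; the -1 sentinel means the value is not directly encodable, and the helper name here is illustrative:
  static bool isCheapARMImmediate(unsigned Imm, bool IsThumb2) {
    int SOImm = IsThumb2 ? ARM_AM::getT2SOImmVal(Imm) : ARM_AM::getSOImmVal(Imm);
    return SOImm != -1; // encodable in a single data-processing instruction
  }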
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
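The calling-convention enumerators above are mapped onto the CC_ARM_* / RetCC_ARM_* assignment functions declared further down. An abridged sketch in the style of that dispatch (the function name and fallback are simplified, not the file's exact logic):
  static CCAssignFn *pickCallCC(CallingConv::ID CC) {
    switch (CC) {
    case CallingConv::GHC:
      return CC_ARM_APCS_GHC;
    case CallingConv::ARM_AAPCS:
      return CC_ARM_AAPCS;
    default:
      // The real selection also distinguishes APCS, AAPCS_VFP, varargs, etc.
      return CC_ARM_AAPCS;
    }
  }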
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:531
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:744
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:712
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:662
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:815
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition ISDOpcodes.h:958
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition ISDOpcodes.h:732
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:707
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:933
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:821
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:719
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
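The ISD::is*Load/Store predicates above are the workhorses of the DAG combines in this file; a small sketch of the matching style (the specific pattern and helper name are illustrative):
  static bool isZExtOfSimpleLoad(SDValue V) {
    if (V.getOpcode() != ISD::ZERO_EXTEND)
      return false;
    SDValue Src = V.getOperand(0);
    // Only plain, unindexed, non-extending loads with a single user qualify.
    return ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse();
  }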
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
@ Length
Definition DWP.cpp:532
void stable_sort(R &&Range)
Definition STLExtras.h:2070
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1763
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition bit.h:293
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
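A plumbing sketch, illustrative only, of how a CCAssignFn such as CC_ARM_AAPCS is normally driven through CCState when analyzing outgoing call operands; the helper name and surrounding setup are assumptions.
  static void analyzeOutgoingArgs(CallingConv::ID CC, MachineFunction &MF,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  SmallVectorImpl<CCValAssign> &ArgLocs,
                                  LLVMContext &Ctx) {
    CCState CCInfo(CC, /*IsVarArg=*/false, MF, ArgLocs, Ctx);
    CCInfo.AnalyzeCallOperands(Outs, CC_ARM_AAPCS); // assigns a location to each argument
  }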
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit with the remainder zero (32 bit version).
Definition MathExtras.h:255
ExceptionHandling
Definition CodeGen.h:53
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2148
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1528
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
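A hedged sketch of the usual BuildMI + predOps pattern in the ARM backend; the MOVi opcode, the insertion-point form of BuildMI, and the helper name are assumptions for illustration (condCodeOp is listed further below).
  // Emit an always-executed (AL-predicated) ARM MOVi with no CPSR definition.
  static void emitMovImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         const DebugLoc &DL, const ARMBaseInstrInfo &TII,
                         Register Dest, unsigned Imm) {
    BuildMI(MBB, I, DL, TII.get(ARM::MOVi), Dest)
        .addImm(Imm)
        .add(predOps(ARMCC::AL)) // appends the two predicate operands
        .add(condCodeOp());      // optional 'S' bit left unset
  }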
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit version).
Definition MathExtras.h:267
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:202
bool isReleaseOrStronger(AtomicOrdering AO)
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition bit.h:236
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
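A small sketch, not from this file, of the kind of immediate classification these MathExtras/bit helpers support; the constants in the comments are illustrative.
  static void classifyImm(uint32_t Imm) {
    bool IsMask        = isMask_32(Imm);        // e.g. 0x00ff -> true
    bool IsShiftedMask = isShiftedMask_32(Imm); // e.g. 0x0ff0 -> true
    bool IsPow2        = isPowerOf2_32(Imm);    // exactly one bit set
    int TrailingZeros  = countr_zero(Imm);      // position of the lowest set bit
    unsigned FloorLog2 = Log2_32(Imm);          // index of the highest set bit
    (void)IsMask; (void)IsShiftedMask; (void)IsPow2;
    (void)TrailingZeros; (void)FloorLog2;
  }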
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
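An illustrative sketch of how these AtomicOrdering predicates are typically consulted when deciding where barriers are required; the policy shown is an assumption, not this file's.
  static bool needsPreBarrier(AtomicOrdering AO)  { return isReleaseOrStronger(AO); }
  static bool needsPostBarrier(AtomicOrdering AO) { return isAcquireOrStronger(AO); }
  static bool isOrderedAccess(AtomicOrdering AO)  { return isStrongerThanMonotonic(AO); }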
CombineLevel
Definition DAGCombine.h:15
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register,...
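A hedged sketch of comparing materialization costs; preferMVN is a hypothetical helper and the decision rule is illustrative only.
  // Prefer materializing ~Imm (e.g. via MVN) when that is strictly cheaper.
  static bool preferMVN(unsigned Imm, const ARMSubtarget *ST) {
    unsigned CostImm = ConstantMaterializationCost(Imm,  ST, /*ForCodesize=*/false);
    unsigned CostNot = ConstantMaterializationCost(~Imm, ST, /*ForCodesize=*/false);
    return CostNot < CostImm; // roughly HasLowerConstantMaterializationCost(~Imm, Imm, ST)
  }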
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
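A DAG-combine-style sketch using the constant predicates above; the fold is illustrative and assumes N is an ISD::AND node.
  static SDValue trySimplifyAnd(SDNode *N) {
    SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
    if (isNullConstant(RHS))                          // (and x, 0) -> 0
      return RHS;
    if (ConstantSDNode *C = isConstOrConstSplat(RHS)) // scalar or splatted constant
      if (C->getAPIntValue().isAllOnes())             // (and x, ~0) -> x
        return LHS;
    return SDValue();                                 // nothing folded
  }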
constexpr U AbsoluteValue(T X)
Return the absolute value of a signed integer, converted to the corresponding unsigned integer type.
Definition MathExtras.h:592
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1973
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
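A tiny sketch of alignTo and commonAlignment on illustrative values.
  static void alignmentExample() {
    uint64_t PaddedSize = alignTo(13, Align(8));    // rounds 13 up to 16
    Align AtOffset = commonAlignment(Align(16), 8); // alignment known at base+8 is 8
    (void)PaddedSize; (void)AtOffset;
  }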
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
unsigned gettBLXrOpcode(const MachineFunction &MF)
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
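An illustrative sketch combining createSequentialMask with the ARM-specific isVREVMask check; the v8i16 type, the 32-bit block size, and the helper names are assumptions for the example.
  // [0, 1, ..., NumElts-1]: the identity shuffle mask.
  static SmallVector<int, 16> identityMask(unsigned NumElts) {
    return createSequentialMask(/*Start=*/0, NumElts, /*NumUndefs=*/0);
  }
  // True if Mask reverses the 16-bit elements within each 32-bit block (VREV32.16).
  static bool isRev32MaskForV8I16(ArrayRef<int> Mask) {
    return isVREVMask(Mask, MVT::v8i16, /*BlockSize=*/32);
  }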
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N bit number shifted left by S.
Definition MathExtras.h:198
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analysis infrastructure.
Definition Metadata.h:761
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition ValueTypes.h:94
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isFixedLengthVector() const
Definition ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
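A brief sketch exercising a few of the EVT queries above on a v4i32 type; it assumes an LLVMContext is available, as in any lowering hook, and is not code from this file.
  static void evtExample(LLVMContext &Ctx) {
    EVT VT = EVT::getVectorVT(Ctx, MVT::i32, 4);        // v4i32
    assert(VT.isVector() && VT.is128BitVector());
    unsigned NumElts = VT.getVectorNumElements();       // 4
    EVT EltVT  = VT.getVectorElementType();             // i32
    EVT HalfVT = VT.getHalfNumVectorElementsVT(Ctx);    // v2i32
    EVT AsInt  = VT.changeVectorElementTypeToInteger(); // already integer: v4i32
    (void)NumElts; (void)EltVT; (void)HalfVT; (void)AsInt;
  }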
InputArg - This struct carries flags and type information about a single incoming (formal) argument or incoming (from the perspective of the caller) return value virtual register.
EVT ArgVT
Usually the non-legalized type of the argument, which is the EVT corresponding to the OrigTy IR type.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:135
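A small sketch, not from this file, of combining KnownBits facts in the style of a computeKnownBitsForTargetNode hook; the constants are illustrative.
  static void knownBitsExample() {
    KnownBits A = KnownBits::makeConstant(APInt(8, 0x0f));
    KnownBits B = KnownBits::makeConstant(APInt(8, 0x01));
    KnownBits Sum  = KnownBits::add(A, B); // bits known for A + B (0x10 here)
    KnownBits Wide = Sum.zext(16);         // model a zero-extension to i16
    KnownBits Merged = A.intersectWith(B); // facts valid for both A and B
    assert(!Sum.isUnknown() && Wide.getBitWidth() == 16);
    (void)Merged;
  }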
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
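An illustrative sketch of the common MachinePointerInfo factory methods; FI and the offset are placeholders.
  static void pointerInfoExample(MachineFunction &MF, int FI) {
    MachinePointerInfo CP    = MachinePointerInfo::getConstantPool(MF);
    MachinePointerInfo Slot  = MachinePointerInfo::getFixedStack(MF, FI);
    MachinePointerInfo Slot4 = Slot.getWithOffset(4); // same slot, +4 bytes
    MachinePointerInfo GOT   = MachinePointerInfo::getGOT(MF);
    (void)CP; (void)Slot4; (void)GOT;
  }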
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
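A hedged sketch of wiring up a library call through CallLoweringInfo's chained setters; the calling convention, callee, and argument plumbing are placeholders, not this file's code.
  static std::pair<SDValue, SDValue>
  emitLibCall(const TargetLowering &TLI, SelectionDAG &DAG, const SDLoc &dl,
              SDValue Chain, SDValue Callee, Type *RetTy,
              TargetLowering::ArgListTy &&Args) {
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(dl)
        .setChain(Chain)
        .setLibCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args));
    return TLI.LowerCallTo(CLI); // {result value, output chain}
  }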
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to the caller.