RISCVISelLowering.cpp
1//===-- RISCVISelLowering.cpp - RISC-V DAG Lowering Implementation -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that RISC-V uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "RISCVISelLowering.h"
16#include "RISCV.h"
18#include "RISCVRegisterInfo.h"
19#include "RISCVSubtarget.h"
20#include "RISCVTargetMachine.h"
21#include "llvm/ADT/SmallSet.h"
22#include "llvm/ADT/Statistic.h"
36#include "llvm/IR/IRBuilder.h"
38#include "llvm/IR/IntrinsicsRISCV.h"
41#include "llvm/Support/Debug.h"
47#include <optional>
48
49using namespace llvm;
50
51#define DEBUG_TYPE "riscv-lower"
52
53STATISTIC(NumTailCalls, "Number of tail calls");
54
56 DEBUG_TYPE "-ext-max-web-size", cl::Hidden,
57 cl::desc("Give the maximum size (in number of nodes) of the web of "
58 "instructions that we will consider for VW expansion"),
59 cl::init(18));
60
61static cl::opt<bool>
62 AllowSplatInVW_W(DEBUG_TYPE "-form-vw-w-with-splat", cl::Hidden,
63 cl::desc("Allow the formation of VW_W operations (e.g., "
64 "VWADD_W) with splat constants"),
65 cl::init(false));
66
68 DEBUG_TYPE "-fp-repeated-divisors", cl::Hidden,
69 cl::desc("Set the minimum number of repetitions of a divisor to allow "
70 "transformation to multiplications by the reciprocal"),
71 cl::init(2));
72
73static cl::opt<int>
75 cl::desc("Give the maximum number of instructions that we will "
76 "use for creating a floating-point immediate value"),
77 cl::init(2));
78
79RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
80 const RISCVSubtarget &STI)
81 : TargetLowering(TM), Subtarget(STI) {
82
83 RISCVABI::ABI ABI = Subtarget.getTargetABI();
84 assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialised target ABI");
85
86 if ((ABI == RISCVABI::ABI_ILP32F || ABI == RISCVABI::ABI_LP64F) &&
87 !Subtarget.hasStdExtF()) {
88 errs() << "Hard-float 'f' ABI can't be used for a target that "
89 "doesn't support the F instruction set extension (ignoring "
90 "target-abi)\n";
92 } else if ((ABI == RISCVABI::ABI_ILP32D || ABI == RISCVABI::ABI_LP64D) &&
93 !Subtarget.hasStdExtD()) {
94 errs() << "Hard-float 'd' ABI can't be used for a target that "
95 "doesn't support the D instruction set extension (ignoring "
96 "target-abi)\n";
98 }
99
100 switch (ABI) {
101 default:
102 report_fatal_error("Don't know how to lower this ABI");
111 break;
112 }
113
114 MVT XLenVT = Subtarget.getXLenVT();
115
116 // Set up the register classes.
117 addRegisterClass(XLenVT, &RISCV::GPRRegClass);
118
119 if (Subtarget.hasStdExtZfhmin())
120 addRegisterClass(MVT::f16, &RISCV::FPR16RegClass);
121 if (Subtarget.hasStdExtZfbfmin())
122 addRegisterClass(MVT::bf16, &RISCV::FPR16RegClass);
123 if (Subtarget.hasStdExtF())
124 addRegisterClass(MVT::f32, &RISCV::FPR32RegClass);
125 if (Subtarget.hasStdExtD())
126 addRegisterClass(MVT::f64, &RISCV::FPR64RegClass);
127 if (Subtarget.hasStdExtZhinxmin())
128 addRegisterClass(MVT::f16, &RISCV::GPRF16RegClass);
129 if (Subtarget.hasStdExtZfinx())
130 addRegisterClass(MVT::f32, &RISCV::GPRF32RegClass);
131 if (Subtarget.hasStdExtZdinx()) {
132 if (Subtarget.is64Bit())
133 addRegisterClass(MVT::f64, &RISCV::GPRRegClass);
134 else
135 addRegisterClass(MVT::f64, &RISCV::GPRPairRegClass);
136 }
137
138 static const MVT::SimpleValueType BoolVecVTs[] = {
139 MVT::nxv1i1, MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1,
140 MVT::nxv16i1, MVT::nxv32i1, MVT::nxv64i1};
141 static const MVT::SimpleValueType IntVecVTs[] = {
142 MVT::nxv1i8, MVT::nxv2i8, MVT::nxv4i8, MVT::nxv8i8, MVT::nxv16i8,
143 MVT::nxv32i8, MVT::nxv64i8, MVT::nxv1i16, MVT::nxv2i16, MVT::nxv4i16,
144 MVT::nxv8i16, MVT::nxv16i16, MVT::nxv32i16, MVT::nxv1i32, MVT::nxv2i32,
145 MVT::nxv4i32, MVT::nxv8i32, MVT::nxv16i32, MVT::nxv1i64, MVT::nxv2i64,
146 MVT::nxv4i64, MVT::nxv8i64};
147 static const MVT::SimpleValueType F16VecVTs[] = {
148 MVT::nxv1f16, MVT::nxv2f16, MVT::nxv4f16,
149 MVT::nxv8f16, MVT::nxv16f16, MVT::nxv32f16};
150 static const MVT::SimpleValueType BF16VecVTs[] = {
151 MVT::nxv1bf16, MVT::nxv2bf16, MVT::nxv4bf16,
152 MVT::nxv8bf16, MVT::nxv16bf16, MVT::nxv32bf16};
153 static const MVT::SimpleValueType F32VecVTs[] = {
154 MVT::nxv1f32, MVT::nxv2f32, MVT::nxv4f32, MVT::nxv8f32, MVT::nxv16f32};
155 static const MVT::SimpleValueType F64VecVTs[] = {
156 MVT::nxv1f64, MVT::nxv2f64, MVT::nxv4f64, MVT::nxv8f64};
157
158 if (Subtarget.hasVInstructions()) {
159 auto addRegClassForRVV = [this](MVT VT) {
160 // Disable the smallest fractional LMUL types if ELEN is less than
161 // RVVBitsPerBlock.
162 unsigned MinElts = RISCV::RVVBitsPerBlock / Subtarget.getELen();
163 if (VT.getVectorMinNumElements() < MinElts)
164 return;
165
166 unsigned Size = VT.getSizeInBits().getKnownMinValue();
167 const TargetRegisterClass *RC;
169 RC = &RISCV::VRRegClass;
170 else if (Size == 2 * RISCV::RVVBitsPerBlock)
171 RC = &RISCV::VRM2RegClass;
172 else if (Size == 4 * RISCV::RVVBitsPerBlock)
173 RC = &RISCV::VRM4RegClass;
174 else if (Size == 8 * RISCV::RVVBitsPerBlock)
175 RC = &RISCV::VRM8RegClass;
176 else
177 llvm_unreachable("Unexpected size");
178
179 addRegisterClass(VT, RC);
180 };
181
182 for (MVT VT : BoolVecVTs)
183 addRegClassForRVV(VT);
184 for (MVT VT : IntVecVTs) {
185 if (VT.getVectorElementType() == MVT::i64 &&
186 !Subtarget.hasVInstructionsI64())
187 continue;
188 addRegClassForRVV(VT);
189 }
190
191 if (Subtarget.hasVInstructionsF16Minimal())
192 for (MVT VT : F16VecVTs)
193 addRegClassForRVV(VT);
194
195 if (Subtarget.hasVInstructionsBF16Minimal())
196 for (MVT VT : BF16VecVTs)
197 addRegClassForRVV(VT);
198
199 if (Subtarget.hasVInstructionsF32())
200 for (MVT VT : F32VecVTs)
201 addRegClassForRVV(VT);
202
203 if (Subtarget.hasVInstructionsF64())
204 for (MVT VT : F64VecVTs)
205 addRegClassForRVV(VT);
206
207 if (Subtarget.useRVVForFixedLengthVectors()) {
208 auto addRegClassForFixedVectors = [this](MVT VT) {
209 MVT ContainerVT = getContainerForFixedLengthVector(VT);
210 unsigned RCID = getRegClassIDForVecVT(ContainerVT);
211 const RISCVRegisterInfo &TRI = *Subtarget.getRegisterInfo();
212 addRegisterClass(VT, TRI.getRegClass(RCID));
213 };
215 if (useRVVForFixedLengthVectorVT(VT))
216 addRegClassForFixedVectors(VT);
217
219 if (useRVVForFixedLengthVectorVT(VT))
220 addRegClassForFixedVectors(VT);
221 }
222 }
223
224 // Compute derived properties from the register classes.
226
228
230 MVT::i1, Promote);
231 // DAGCombiner can call isLoadExtLegal for types that aren't legal.
233 MVT::i1, Promote);
234
235 // TODO: add all necessary setOperationAction calls.
237
242
247 if (!(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {
250 }
251
253
256
258
260
261 if (!Subtarget.hasStdExtZbb() && !Subtarget.hasVendorXTHeadBb() &&
262 !(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()))
263 setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::i8, MVT::i16}, Expand);
264
265 if (Subtarget.is64Bit()) {
267
270 MVT::i32, Custom);
272 Custom);
273 if (!Subtarget.hasStdExtZbb())
275 Custom);
277 }
278 if (!Subtarget.hasStdExtZmmul()) {
280 } else if (Subtarget.is64Bit()) {
283 } else {
285 }
286
287 if (!Subtarget.hasStdExtM()) {
289 Expand);
290 } else if (Subtarget.is64Bit()) {
292 {MVT::i8, MVT::i16, MVT::i32}, Custom);
293 }
294
297 Expand);
298
300 Custom);
301
302 if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbkb()) {
303 if (Subtarget.is64Bit())
305 } else if (Subtarget.hasVendorXTHeadBb()) {
306 if (Subtarget.is64Bit())
309 } else if (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit()) {
311 } else {
313 }
314
315 // With Zbb we have an XLen rev8 instruction, but not GREVI. So we'll
316 // pattern match it directly in isel.
318 (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbkb() ||
319 Subtarget.hasVendorXTHeadBb())
320 ? Legal
321 : Expand);
322
323 if (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit()) {
325 } else {
326 // Zbkb can use rev8+brev8 to implement bitreverse.
328 Subtarget.hasStdExtZbkb() ? Custom : Expand);
329 }
330
331 if (Subtarget.hasStdExtZbb() ||
332 (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {
334 Legal);
335 }
336
337 if (Subtarget.hasStdExtZbb() ||
338 (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit())) {
339 if (Subtarget.is64Bit())
341 } else {
343 }
344
345 if (Subtarget.hasStdExtZbb() || Subtarget.hasVendorXTHeadBb() ||
346 (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit())) {
347 // We need the custom lowering to make sure that the resulting sequence
348 // for the 32-bit case is efficient on 64-bit targets.
349 if (Subtarget.is64Bit())
351 } else {
353 }
354
355 if (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()) {
357 } else if (Subtarget.hasShortForwardBranchOpt()) {
358 // We can use PseudoCCSUB to implement ABS.
360 } else if (Subtarget.is64Bit()) {
362 }
363
364 if (!Subtarget.hasVendorXTHeadCondMov())
366
367 static const unsigned FPLegalNodeTypes[] = {
374
375 static const ISD::CondCode FPCCToExpand[] = {
379
380 static const unsigned FPOpToExpand[] = {
382 ISD::FREM};
383
384 static const unsigned FPRndMode[] = {
387
388 if (Subtarget.hasStdExtZfhminOrZhinxmin())
390
391 static const unsigned ZfhminZfbfminPromoteOps[] = {
401
402 if (Subtarget.hasStdExtZfbfmin()) {
411 setOperationAction(ZfhminZfbfminPromoteOps, MVT::bf16, Promote);
413 // FIXME: Need to promote bf16 FCOPYSIGN to f32, but the
414 // DAGCombiner::visitFP_ROUND probably needs improvements first.
416 }
417
418 if (Subtarget.hasStdExtZfhminOrZhinxmin()) {
419 if (Subtarget.hasStdExtZfhOrZhinx()) {
420 setOperationAction(FPLegalNodeTypes, MVT::f16, Legal);
421 setOperationAction(FPRndMode, MVT::f16,
422 Subtarget.hasStdExtZfa() ? Legal : Custom);
425 } else {
426 setOperationAction(ZfhminZfbfminPromoteOps, MVT::f16, Promote);
429 MVT::f16, Legal);
430 // FIXME: Need to promote f16 FCOPYSIGN to f32, but the
431 // DAGCombiner::visitFP_ROUND probably needs improvements first.
433 }
434
437 setCondCodeAction(FPCCToExpand, MVT::f16, Expand);
440
442 Subtarget.hasStdExtZfa() ? Legal : Promote);
447 MVT::f16, Promote);
448
449 // FIXME: Need to promote f16 STRICT_* to f32 libcalls, but we don't have
450 // complete support for all operations in LegalizeDAG.
455 MVT::f16, Promote);
456
457 // We need to custom promote this.
458 if (Subtarget.is64Bit())
460
462 Subtarget.hasStdExtZfa() ? Legal : Custom);
463 }
464
465 if (Subtarget.hasStdExtFOrZfinx()) {
466 setOperationAction(FPLegalNodeTypes, MVT::f32, Legal);
467 setOperationAction(FPRndMode, MVT::f32,
468 Subtarget.hasStdExtZfa() ? Legal : Custom);
469 setCondCodeAction(FPCCToExpand, MVT::f32, Expand);
473 setOperationAction(FPOpToExpand, MVT::f32, Expand);
474 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
475 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
476 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
477 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
481 Subtarget.isSoftFPABI() ? LibCall : Custom);
484
485 if (Subtarget.hasStdExtZfa()) {
488 } else {
490 }
491 }
492
493 if (Subtarget.hasStdExtFOrZfinx() && Subtarget.is64Bit())
495
496 if (Subtarget.hasStdExtDOrZdinx()) {
497 setOperationAction(FPLegalNodeTypes, MVT::f64, Legal);
498
499 if (!Subtarget.is64Bit())
501
502 if (Subtarget.hasStdExtZfa()) {
503 setOperationAction(FPRndMode, MVT::f64, Legal);
506 } else {
507 if (Subtarget.is64Bit())
508 setOperationAction(FPRndMode, MVT::f64, Custom);
509
511 }
512
515 setCondCodeAction(FPCCToExpand, MVT::f64, Expand);
519 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
520 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
521 setOperationAction(FPOpToExpand, MVT::f64, Expand);
522 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
523 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
524 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
525 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
529 Subtarget.isSoftFPABI() ? LibCall : Custom);
532 }
533
534 if (Subtarget.is64Bit()) {
537 MVT::i32, Custom);
539 }
540
541 if (Subtarget.hasStdExtFOrZfinx()) {
543 Custom);
544
547 XLenVT, Legal);
548
551 }
552
555 XLenVT, Custom);
556
558
559 if (Subtarget.is64Bit())
561
562 // TODO: On M-mode only targets, the cycle[h]/time[h] CSR may not be present.
563 // Unfortunately this can't be determined just from the ISA naming string.
565 Subtarget.is64Bit() ? Legal : Custom);
567 Subtarget.is64Bit() ? Legal : Custom);
568
571 if (Subtarget.is64Bit())
573
574 if (Subtarget.hasStdExtZicbop()) {
576 }
577
578 if (Subtarget.hasStdExtA()) {
580 if (Subtarget.hasStdExtZabha() && Subtarget.hasStdExtZacas())
582 else
584 } else if (Subtarget.hasForcedAtomics()) {
586 } else {
588 }
589
591
593
594 if (getTargetMachine().getTargetTriple().isOSLinux()) {
595 // Custom lowering of llvm.clear_cache.
597 }
598
599 if (Subtarget.hasVInstructions()) {
601
603
604 // RVV intrinsics may have illegal operands.
605 // We also need to custom legalize vmv.x.s.
608 {MVT::i8, MVT::i16}, Custom);
609 if (Subtarget.is64Bit())
611 MVT::i32, Custom);
612 else
614 MVT::i64, Custom);
615
617 MVT::Other, Custom);
618
619 static const unsigned IntegerVPOps[] = {
620 ISD::VP_ADD, ISD::VP_SUB, ISD::VP_MUL,
621 ISD::VP_SDIV, ISD::VP_UDIV, ISD::VP_SREM,
622 ISD::VP_UREM, ISD::VP_AND, ISD::VP_OR,
623 ISD::VP_XOR, ISD::VP_SRA, ISD::VP_SRL,
624 ISD::VP_SHL, ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND,
625 ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR, ISD::VP_REDUCE_SMAX,
626 ISD::VP_REDUCE_SMIN, ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN,
627 ISD::VP_MERGE, ISD::VP_SELECT, ISD::VP_FP_TO_SINT,
628 ISD::VP_FP_TO_UINT, ISD::VP_SETCC, ISD::VP_SIGN_EXTEND,
629 ISD::VP_ZERO_EXTEND, ISD::VP_TRUNCATE, ISD::VP_SMIN,
630 ISD::VP_SMAX, ISD::VP_UMIN, ISD::VP_UMAX,
631 ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE,
632 ISD::VP_SADDSAT, ISD::VP_UADDSAT, ISD::VP_SSUBSAT,
633 ISD::VP_USUBSAT, ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_UNDEF,
634 ISD::EXPERIMENTAL_VP_SPLAT};
635
636 static const unsigned FloatingPointVPOps[] = {
637 ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL,
638 ISD::VP_FDIV, ISD::VP_FNEG, ISD::VP_FABS,
639 ISD::VP_FMA, ISD::VP_REDUCE_FADD, ISD::VP_REDUCE_SEQ_FADD,
640 ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX, ISD::VP_MERGE,
641 ISD::VP_SELECT, ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP,
642 ISD::VP_SETCC, ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND,
643 ISD::VP_SQRT, ISD::VP_FMINNUM, ISD::VP_FMAXNUM,
644 ISD::VP_FCEIL, ISD::VP_FFLOOR, ISD::VP_FROUND,
645 ISD::VP_FROUNDEVEN, ISD::VP_FCOPYSIGN, ISD::VP_FROUNDTOZERO,
646 ISD::VP_FRINT, ISD::VP_FNEARBYINT, ISD::VP_IS_FPCLASS,
647 ISD::VP_FMINIMUM, ISD::VP_FMAXIMUM, ISD::VP_LRINT,
648 ISD::VP_LLRINT, ISD::EXPERIMENTAL_VP_REVERSE,
649 ISD::EXPERIMENTAL_VP_SPLICE, ISD::VP_REDUCE_FMINIMUM,
650 ISD::VP_REDUCE_FMAXIMUM, ISD::EXPERIMENTAL_VP_SPLAT};
651
652 static const unsigned IntegerVecReduceOps[] = {
656
657 static const unsigned FloatingPointVecReduceOps[] = {
660
661 if (!Subtarget.is64Bit()) {
662 // We must custom-lower certain vXi64 operations on RV32 due to the vector
663 // element type being illegal.
665 MVT::i64, Custom);
666
667 setOperationAction(IntegerVecReduceOps, MVT::i64, Custom);
668
669 setOperationAction({ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND,
670 ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR,
671 ISD::VP_REDUCE_SMAX, ISD::VP_REDUCE_SMIN,
672 ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN},
673 MVT::i64, Custom);
674 }
675
676 for (MVT VT : BoolVecVTs) {
677 if (!isTypeLegal(VT))
678 continue;
679
681
682 // Mask VTs are custom-expanded into a series of standard nodes
686 VT, Custom);
687
689 Custom);
690
693 {ISD::SELECT_CC, ISD::VSELECT, ISD::VP_MERGE, ISD::VP_SELECT}, VT,
694 Expand);
695
696 setOperationAction({ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_UNDEF}, VT,
697 Custom);
698
699 setOperationAction({ISD::VP_AND, ISD::VP_OR, ISD::VP_XOR}, VT, Custom);
700
703 Custom);
704
706 {ISD::VP_REDUCE_AND, ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR}, VT,
707 Custom);
708
709 // RVV has native int->float & float->int conversions where the
710 // element type sizes are within one power-of-two of each other. Any
711 // wider distances between type sizes have to be lowered as sequences
712 // which progressively narrow the gap in stages.
717 VT, Custom);
719 Custom);
720
721 // Expand all extending loads to types larger than this, and truncating
722 // stores from types larger than this.
724 setTruncStoreAction(VT, OtherVT, Expand);
726 OtherVT, Expand);
727 }
728
729 setOperationAction({ISD::VP_FP_TO_SINT, ISD::VP_FP_TO_UINT,
730 ISD::VP_TRUNCATE, ISD::VP_SETCC},
731 VT, Custom);
732
735
737
738 setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom);
739 setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom);
740
743 MVT::getVectorVT(MVT::i8, VT.getVectorElementCount()));
744 }
745
746 for (MVT VT : IntVecVTs) {
747 if (!isTypeLegal(VT))
748 continue;
749
752
753 // Vectors implement MULHS/MULHU.
755
756 // nxvXi64 MULHS/MULHU requires the V extension instead of Zve64*.
757 if (VT.getVectorElementType() == MVT::i64 && !Subtarget.hasStdExtV())
759
761 Legal);
762
764
765 // Custom-lower extensions and truncations from/to mask types.
767 VT, Custom);
768
769 // RVV has native int->float & float->int conversions where the
770 // element type sizes are within one power-of-two of each other. Any
771 // wider distances between type sizes have to be lowered as sequences
772 // which progressively narrow the gap in stages.
777 VT, Custom);
779 Custom);
783 VT, Legal);
784
785 // Integer VTs are lowered as a series of "RISCVISD::TRUNCATE_VECTOR_VL"
786 // nodes which truncate by one power of two at a time.
788
789 // Custom-lower insert/extract operations to simplify patterns.
791 Custom);
792
793 // Custom-lower reduction operations to set up the corresponding custom
794 // nodes' operands.
795 setOperationAction(IntegerVecReduceOps, VT, Custom);
796
797 setOperationAction(IntegerVPOps, VT, Custom);
798
800
802 VT, Custom);
803
805 {ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
806 ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER},
807 VT, Custom);
808
811 VT, Custom);
812
815
817
819 setTruncStoreAction(VT, OtherVT, Expand);
821 OtherVT, Expand);
822 }
823
826
827 // Splice
829
830 if (Subtarget.hasStdExtZvkb()) {
832 setOperationAction(ISD::VP_BSWAP, VT, Custom);
833 } else {
834 setOperationAction({ISD::BSWAP, ISD::VP_BSWAP}, VT, Expand);
836 }
837
838 if (Subtarget.hasStdExtZvbb()) {
840 setOperationAction(ISD::VP_BITREVERSE, VT, Custom);
841 setOperationAction({ISD::VP_CTLZ, ISD::VP_CTLZ_ZERO_UNDEF, ISD::VP_CTTZ,
842 ISD::VP_CTTZ_ZERO_UNDEF, ISD::VP_CTPOP},
843 VT, Custom);
844 } else {
845 setOperationAction({ISD::BITREVERSE, ISD::VP_BITREVERSE}, VT, Expand);
847 setOperationAction({ISD::VP_CTLZ, ISD::VP_CTLZ_ZERO_UNDEF, ISD::VP_CTTZ,
848 ISD::VP_CTTZ_ZERO_UNDEF, ISD::VP_CTPOP},
849 VT, Expand);
850
851 // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the
852 // range of f32.
853 EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
854 if (isTypeLegal(FloatVT)) {
856 ISD::CTTZ_ZERO_UNDEF, ISD::VP_CTLZ,
857 ISD::VP_CTLZ_ZERO_UNDEF, ISD::VP_CTTZ_ZERO_UNDEF},
858 VT, Custom);
859 }
860 }
861 }
862
863 // Expand various CCs to best match the RVV ISA, which natively supports UNE
864 // but no other unordered comparisons, and supports all ordered comparisons
865 // except ONE. Additionally, we expand GT,OGT,GE,OGE for optimization
866 // purposes; they are expanded to their swapped-operand CCs (LT,OLT,LE,OLE),
867 // and we pattern-match those back to the "original", swapping operands once
868 // more. This way we catch both operations and both "vf" and "fv" forms with
869 // fewer patterns.
870 static const ISD::CondCode VFPCCToExpand[] = {
874 };
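// Rough example of the expansion above (actual selection may differ): RVV only
// provides vmfeq/vmfne/vmflt/vmfle in vector-vector form, so a compare such as
//   %m = fcmp ogt <vscale x 2 x float> %a, %b
// is expanded to SETOLT with the operands swapped and then matched back to
// vmflt.vv with %b and %a exchanged; the scalar-splat ("vf"/"fv") variants are
// caught by the same swapped patterns.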
875
876 // TODO: support more ops.
877 static const unsigned ZvfhminPromoteOps[] = {
885
886 // TODO: support more vp ops.
887 static const unsigned ZvfhminPromoteVPOps[] = {
888 ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL,
889 ISD::VP_FDIV, ISD::VP_FNEG, ISD::VP_FABS,
890 ISD::VP_FMA, ISD::VP_REDUCE_FADD, ISD::VP_REDUCE_SEQ_FADD,
891 ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX, ISD::VP_SQRT,
892 ISD::VP_FMINNUM, ISD::VP_FMAXNUM, ISD::VP_FCEIL,
893 ISD::VP_FFLOOR, ISD::VP_FROUND, ISD::VP_FROUNDEVEN,
894 ISD::VP_FCOPYSIGN, ISD::VP_FROUNDTOZERO, ISD::VP_FRINT,
895 ISD::VP_FNEARBYINT, ISD::VP_SETCC, ISD::VP_FMINIMUM,
896 ISD::VP_FMAXIMUM, ISD::VP_REDUCE_FMINIMUM, ISD::VP_REDUCE_FMAXIMUM};
897
898 // Sets common operation actions on RVV floating-point vector types.
899 const auto SetCommonVFPActions = [&](MVT VT) {
901 // RVV has native FP_ROUND & FP_EXTEND conversions where the element type
902 // sizes are within one power-of-two of each other. Therefore conversions
903 // between vXf16 and vXf64 must be lowered as sequences which convert via
904 // vXf32.
907 // Custom-lower insert/extract operations to simplify patterns.
909 Custom);
910 // Expand various condition codes (explained above).
911 setCondCodeAction(VFPCCToExpand, VT, Expand);
912
915
919 VT, Custom);
920
921 setOperationAction(FloatingPointVecReduceOps, VT, Custom);
922
923 // Expand FP operations that need libcalls.
935
937
939
941 VT, Custom);
942
944 {ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
945 ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER},
946 VT, Custom);
947
950
953 VT, Custom);
954
957
959
960 setOperationAction(FloatingPointVPOps, VT, Custom);
961
963 Custom);
966 VT, Legal);
971 VT, Custom);
972 };
973
974 // Sets common extload/truncstore actions on RVV floating-point vector
975 // types.
976 const auto SetCommonVFPExtLoadTruncStoreActions =
977 [&](MVT VT, ArrayRef<MVT::SimpleValueType> SmallerVTs) {
978 for (auto SmallVT : SmallerVTs) {
979 setTruncStoreAction(VT, SmallVT, Expand);
980 setLoadExtAction(ISD::EXTLOAD, VT, SmallVT, Expand);
981 }
982 };
983
984 if (Subtarget.hasVInstructionsF16()) {
985 for (MVT VT : F16VecVTs) {
986 if (!isTypeLegal(VT))
987 continue;
988 SetCommonVFPActions(VT);
989 }
990 } else if (Subtarget.hasVInstructionsF16Minimal()) {
991 for (MVT VT : F16VecVTs) {
992 if (!isTypeLegal(VT))
993 continue;
996 Custom);
997 setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
998 setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT,
999 Custom);
1002 ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP},
1003 VT, Custom);
1006 VT, Custom);
1007 if (Subtarget.hasStdExtZfhmin())
1009 // load/store
1011
1012 // Custom split nxv32f16 since nxv32f32 is not legal.
1013 if (VT == MVT::nxv32f16) {
1014 setOperationAction(ZvfhminPromoteOps, VT, Custom);
1015 setOperationAction(ZvfhminPromoteVPOps, VT, Custom);
1016 continue;
1017 }
1018 // Add more promote ops.
1019 MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
1020 setOperationPromotedToType(ZvfhminPromoteOps, VT, F32VecVT);
1021 setOperationPromotedToType(ZvfhminPromoteVPOps, VT, F32VecVT);
1022 }
1023 }
1024
1025 // TODO: Could we merge some code with zvfhmin?
1026 if (Subtarget.hasVInstructionsBF16Minimal()) {
1027 for (MVT VT : BF16VecVTs) {
1028 if (!isTypeLegal(VT))
1029 continue;
1031 setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
1033 Custom);
1036 VT, Custom);
1038 if (Subtarget.hasStdExtZfbfmin())
1040 setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT,
1041 Custom);
1043 // TODO: Promote to fp32.
1044 }
1045 }
1046
1047 if (Subtarget.hasVInstructionsF32()) {
1048 for (MVT VT : F32VecVTs) {
1049 if (!isTypeLegal(VT))
1050 continue;
1051 SetCommonVFPActions(VT);
1052 SetCommonVFPExtLoadTruncStoreActions(VT, F16VecVTs);
1053 }
1054 }
1055
1056 if (Subtarget.hasVInstructionsF64()) {
1057 for (MVT VT : F64VecVTs) {
1058 if (!isTypeLegal(VT))
1059 continue;
1060 SetCommonVFPActions(VT);
1061 SetCommonVFPExtLoadTruncStoreActions(VT, F16VecVTs);
1062 SetCommonVFPExtLoadTruncStoreActions(VT, F32VecVTs);
1063 }
1064 }
1065
1066 if (Subtarget.useRVVForFixedLengthVectors()) {
1068 if (!useRVVForFixedLengthVectorVT(VT))
1069 continue;
1070
1071 // By default everything must be expanded.
1072 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1075 setTruncStoreAction(VT, OtherVT, Expand);
1077 OtherVT, Expand);
1078 }
1079
1080 // Custom lower fixed vector undefs to scalable vector undefs to avoid
1081 // expansion to a build_vector of 0s.
1083
1084 // We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed.
1086 Custom);
1087
1089 Custom);
1090
1092 VT, Custom);
1093
1095
1097
1099
1101
1103
1105
1108 Custom);
1109
1111 {ISD::VP_REDUCE_AND, ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR}, VT,
1112 Custom);
1113
1115 {
1124 },
1125 VT, Custom);
1127 Custom);
1128
1130
1131 // Operations below are different between masks and other vectors.
1132 if (VT.getVectorElementType() == MVT::i1) {
1133 setOperationAction({ISD::VP_AND, ISD::VP_OR, ISD::VP_XOR, ISD::AND,
1134 ISD::OR, ISD::XOR},
1135 VT, Custom);
1136
1137 setOperationAction({ISD::VP_FP_TO_SINT, ISD::VP_FP_TO_UINT,
1138 ISD::VP_SETCC, ISD::VP_TRUNCATE},
1139 VT, Custom);
1140
1141 setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom);
1142 setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom);
1143 continue;
1144 }
1145
1146 // Make SPLAT_VECTOR Legal so DAGCombine will convert splat vectors to
1147 // it before type legalization for i64 vectors on RV32. It will then be
1148 // type legalized to SPLAT_VECTOR_PARTS which we need to Custom handle.
1149 // FIXME: Use SPLAT_VECTOR for all types? DAGCombine probably needs
1150 // improvements first.
1151 if (!Subtarget.is64Bit() && VT.getVectorElementType() == MVT::i64) {
1154 }
1155
1158
1159 setOperationAction({ISD::VP_LOAD, ISD::VP_STORE,
1160 ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
1161 ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER,
1162 ISD::VP_SCATTER},
1163 VT, Custom);
1164
1168 VT, Custom);
1169
1172
1174
1175 // vXi64 MULHS/MULHU requires the V extension instead of Zve64*.
1176 if (VT.getVectorElementType() != MVT::i64 || Subtarget.hasStdExtV())
1178
1182 VT, Custom);
1183
1185
1188
1189 // Custom-lower reduction operations to set up the corresponding custom
1190 // nodes' operands.
1194 VT, Custom);
1195
1196 setOperationAction(IntegerVPOps, VT, Custom);
1197
1198 if (Subtarget.hasStdExtZvkb())
1200
1201 if (Subtarget.hasStdExtZvbb()) {
1204 VT, Custom);
1205 } else {
1206 // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the
1207 // range of f32.
1208 EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
1209 if (isTypeLegal(FloatVT))
1212 Custom);
1213 }
1214 }
1215
1217 // There are no extending loads or truncating stores.
1218 for (MVT InnerVT : MVT::fp_fixedlen_vector_valuetypes()) {
1219 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1220 setTruncStoreAction(VT, InnerVT, Expand);
1221 }
1222
1223 if (!useRVVForFixedLengthVectorVT(VT))
1224 continue;
1225
1226 // By default everything must be expanded.
1227 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1229
1230 // Custom lower fixed vector undefs to scalable vector undefs to avoid
1231 // expansion to a build_vector of 0s.
1233
1236 VT, Custom);
1237
1238 // FIXME: mload, mstore, mgather, mscatter, vp_load/store,
1239 // vp_stride_load/store, vp_gather/scatter can be hoisted to here.
1241
1244 Custom);
1245
1246 if (VT.getVectorElementType() == MVT::f16 &&
1247 !Subtarget.hasVInstructionsF16()) {
1248 setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
1250 {ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT,
1251 Custom);
1253 ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP},
1254 VT, Custom);
1256 if (Subtarget.hasStdExtZfhmin()) {
1257 // FIXME: We should prefer BUILD_VECTOR over SPLAT_VECTOR.
1259 } else {
1260 // We need to custom legalize f16 build vectors if Zfhmin isn't
1261 // available.
1263 }
1264 MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
1265 // Don't promote f16 vector operations to f32 if f32 vector type is
1266 // not legal.
1267 // TODO: could split the f16 vector into two vectors and do promotion.
1268 if (!isTypeLegal(F32VecVT))
1269 continue;
1270 setOperationPromotedToType(ZvfhminPromoteOps, VT, F32VecVT);
1271 setOperationPromotedToType(ZvfhminPromoteVPOps, VT, F32VecVT);
1272 continue;
1273 }
1274
1275 if (VT.getVectorElementType() == MVT::bf16) {
1276 setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
1277 // FIXME: We should prefer BUILD_VECTOR over SPLAT_VECTOR.
1280 {ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT,
1281 Custom);
1282 // TODO: Promote to fp32.
1283 continue;
1284 }
1285
1288 VT, Custom);
1289
1292
1293 setOperationAction({ISD::VP_LOAD, ISD::VP_STORE,
1294 ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
1295 ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER,
1296 ISD::VP_SCATTER},
1297 VT, Custom);
1298
1303 VT, Custom);
1304
1307 VT, Custom);
1308
1309 setCondCodeAction(VFPCCToExpand, VT, Expand);
1310
1313
1315
1316 setOperationAction(FloatingPointVecReduceOps, VT, Custom);
1317
1318 setOperationAction(FloatingPointVPOps, VT, Custom);
1319
1326 VT, Custom);
1327 }
1328
1329 // Custom-legalize bitcasts from fixed-length vectors to scalar types.
1330 setOperationAction(ISD::BITCAST, {MVT::i8, MVT::i16, MVT::i32, MVT::i64},
1331 Custom);
1332 if (Subtarget.hasStdExtZfhminOrZhinxmin())
1334 if (Subtarget.hasStdExtFOrZfinx())
1336 if (Subtarget.hasStdExtDOrZdinx())
1338 }
1339 }
1340
1341 if (Subtarget.hasStdExtA())
1343
1344 if (Subtarget.hasForcedAtomics()) {
1345 // Force __sync libcalls to be emitted for atomic rmw/cas operations.
1351 XLenVT, LibCall);
1352 }
1353
1354 if (Subtarget.hasVendorXTHeadMemIdx()) {
1355 for (unsigned im : {ISD::PRE_INC, ISD::POST_INC}) {
1356 setIndexedLoadAction(im, MVT::i8, Legal);
1357 setIndexedStoreAction(im, MVT::i8, Legal);
1358 setIndexedLoadAction(im, MVT::i16, Legal);
1359 setIndexedStoreAction(im, MVT::i16, Legal);
1360 setIndexedLoadAction(im, MVT::i32, Legal);
1361 setIndexedStoreAction(im, MVT::i32, Legal);
1362
1363 if (Subtarget.is64Bit()) {
1364 setIndexedLoadAction(im, MVT::i64, Legal);
1365 setIndexedStoreAction(im, MVT::i64, Legal);
1366 }
1367 }
1368 }
1369
1370 if (Subtarget.hasVendorXCVmem() && !Subtarget.is64Bit()) {
1374
1378 }
1379
1380 // Function alignments.
1381 const Align FunctionAlignment(Subtarget.hasStdExtCOrZca() ? 2 : 4);
1382 setMinFunctionAlignment(FunctionAlignment);
1383 // Set preferred alignments.
1386
1391
1392 if (Subtarget.hasStdExtFOrZfinx())
1394
1395 if (Subtarget.hasStdExtZbb())
1397
1398 if ((Subtarget.hasStdExtZbs() && Subtarget.is64Bit()) ||
1399 Subtarget.hasVInstructions())
1401
1402 if (Subtarget.hasStdExtZbkb())
1404 if (Subtarget.hasStdExtZfhminOrZhinxmin())
1406 if (Subtarget.hasStdExtFOrZfinx())
1409 if (Subtarget.hasVInstructions())
1411 ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA, ISD::SRL,
1414 ISD::EXPERIMENTAL_VP_REVERSE, ISD::MUL,
1417 if (Subtarget.hasVendorXTHeadMemPair())
1419 if (Subtarget.useRVVForFixedLengthVectors())
1421
1422 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
1423 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
1424
1425 // Disable strict node mutation.
1426 IsStrictFPEnabled = true;
1427
1428 // Let the subtarget decide if a predictable select is more expensive than the
1429 // corresponding branch. This information is used in CGP/SelectOpt to decide
1430 // when to convert selects into branches.
1431 PredictableSelectIsExpensive = Subtarget.predictableSelectIsExpensive();
1432}
1433
1434EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL,
1435 LLVMContext &Context,
1436 EVT VT) const {
1437 if (!VT.isVector())
1438 return getPointerTy(DL);
1439 if (Subtarget.hasVInstructions() &&
1440 (VT.isScalableVector() || Subtarget.useRVVForFixedLengthVectors()))
1441 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
1443}
1444
1445MVT RISCVTargetLowering::getVPExplicitVectorLengthTy() const {
1446 return Subtarget.getXLenVT();
1447}
1448
1449// Return false if we can lower get_vector_length to a vsetvli intrinsic.
1450bool RISCVTargetLowering::shouldExpandGetVectorLength(EVT TripCountVT,
1451 unsigned VF,
1452 bool IsScalable) const {
1453 if (!Subtarget.hasVInstructions())
1454 return true;
1455
1456 if (!IsScalable)
1457 return true;
1458
1459 if (TripCountVT != MVT::i32 && TripCountVT != Subtarget.getXLenVT())
1460 return true;
1461
1462 // Don't allow VF=1 if those types aren't legal.
1463 if (VF < RISCV::RVVBitsPerBlock / Subtarget.getELen())
1464 return true;
1465
1466 // VLEN=32 support is incomplete.
1467 if (Subtarget.getRealMinVLen() < RISCV::RVVBitsPerBlock)
1468 return true;
1469
1470 // The maximum VF is for the smallest element width with LMUL=8.
1471 // VF must be a power of 2.
1472 unsigned MaxVF = (RISCV::RVVBitsPerBlock / 8) * 8;
1473 return VF > MaxVF || !isPowerOf2_32(VF);
1474}
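// Worked example for the MaxVF bound above, assuming the usual RVVBitsPerBlock
// of 64: the widest scalable integer type uses 8-bit elements at LMUL=8, so
// MaxVF = (64 / 8) * 8 = 64. A scalable request such as
//   %vl = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 32, i1 true)
// (VF=32) can therefore be lowered to a vsetvli, while VF=128 or any
// non-power-of-2 VF is expanded generically.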
1475
1477 return !Subtarget.hasVInstructions() ||
1478 VT.getVectorElementType() != MVT::i1 || !isTypeLegal(VT);
1479}
1480
1481bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1482 const CallInst &I,
1483 MachineFunction &MF,
1484 unsigned Intrinsic) const {
1485 auto &DL = I.getDataLayout();
1486
1487 auto SetRVVLoadStoreInfo = [&](unsigned PtrOp, bool IsStore,
1488 bool IsUnitStrided, bool UsePtrVal = false) {
1490 // We can't use ptrVal if the intrinsic can access memory before the
1491 // pointer. This means we can't use it for strided or indexed intrinsics.
1492 if (UsePtrVal)
1493 Info.ptrVal = I.getArgOperand(PtrOp);
1494 else
1495 Info.fallbackAddressSpace =
1496 I.getArgOperand(PtrOp)->getType()->getPointerAddressSpace();
1497 Type *MemTy;
1498 if (IsStore) {
1499 // Store value is the first operand.
1500 MemTy = I.getArgOperand(0)->getType();
1501 } else {
1502 // Use return type. If it's segment load, return type is a struct.
1503 MemTy = I.getType();
1504 if (MemTy->isStructTy())
1505 MemTy = MemTy->getStructElementType(0);
1506 }
1507 if (!IsUnitStrided)
1508 MemTy = MemTy->getScalarType();
1509
1510 Info.memVT = getValueType(DL, MemTy);
1511 Info.align = Align(DL.getTypeSizeInBits(MemTy->getScalarType()) / 8);
1513 Info.flags |=
1515 return true;
1516 };
1517
1518 if (I.hasMetadata(LLVMContext::MD_nontemporal))
1520
1522 switch (Intrinsic) {
1523 default:
1524 return false;
1525 case Intrinsic::riscv_masked_atomicrmw_xchg_i32:
1526 case Intrinsic::riscv_masked_atomicrmw_add_i32:
1527 case Intrinsic::riscv_masked_atomicrmw_sub_i32:
1528 case Intrinsic::riscv_masked_atomicrmw_nand_i32:
1529 case Intrinsic::riscv_masked_atomicrmw_max_i32:
1530 case Intrinsic::riscv_masked_atomicrmw_min_i32:
1531 case Intrinsic::riscv_masked_atomicrmw_umax_i32:
1532 case Intrinsic::riscv_masked_atomicrmw_umin_i32:
1533 case Intrinsic::riscv_masked_cmpxchg_i32:
1535 Info.memVT = MVT::i32;
1536 Info.ptrVal = I.getArgOperand(0);
1537 Info.offset = 0;
1538 Info.align = Align(4);
1541 return true;
1542 case Intrinsic::riscv_seg2_load:
1543 case Intrinsic::riscv_seg3_load:
1544 case Intrinsic::riscv_seg4_load:
1545 case Intrinsic::riscv_seg5_load:
1546 case Intrinsic::riscv_seg6_load:
1547 case Intrinsic::riscv_seg7_load:
1548 case Intrinsic::riscv_seg8_load:
1549 return SetRVVLoadStoreInfo(/*PtrOp*/ 0, /*IsStore*/ false,
1550 /*IsUnitStrided*/ false, /*UsePtrVal*/ true);
1551 case Intrinsic::riscv_seg2_store:
1552 case Intrinsic::riscv_seg3_store:
1553 case Intrinsic::riscv_seg4_store:
1554 case Intrinsic::riscv_seg5_store:
1555 case Intrinsic::riscv_seg6_store:
1556 case Intrinsic::riscv_seg7_store:
1557 case Intrinsic::riscv_seg8_store:
1558 // Operands are (vec, ..., vec, ptr, vl)
1559 return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 2,
1560 /*IsStore*/ true,
1561 /*IsUnitStrided*/ false, /*UsePtrVal*/ true);
1562 case Intrinsic::riscv_vle:
1563 case Intrinsic::riscv_vle_mask:
1564 case Intrinsic::riscv_vleff:
1565 case Intrinsic::riscv_vleff_mask:
1566 return SetRVVLoadStoreInfo(/*PtrOp*/ 1,
1567 /*IsStore*/ false,
1568 /*IsUnitStrided*/ true,
1569 /*UsePtrVal*/ true);
1570 case Intrinsic::riscv_vse:
1571 case Intrinsic::riscv_vse_mask:
1572 return SetRVVLoadStoreInfo(/*PtrOp*/ 1,
1573 /*IsStore*/ true,
1574 /*IsUnitStrided*/ true,
1575 /*UsePtrVal*/ true);
1576 case Intrinsic::riscv_vlse:
1577 case Intrinsic::riscv_vlse_mask:
1578 case Intrinsic::riscv_vloxei:
1579 case Intrinsic::riscv_vloxei_mask:
1580 case Intrinsic::riscv_vluxei:
1581 case Intrinsic::riscv_vluxei_mask:
1582 return SetRVVLoadStoreInfo(/*PtrOp*/ 1,
1583 /*IsStore*/ false,
1584 /*IsUnitStrided*/ false);
1585 case Intrinsic::riscv_vsse:
1586 case Intrinsic::riscv_vsse_mask:
1587 case Intrinsic::riscv_vsoxei:
1588 case Intrinsic::riscv_vsoxei_mask:
1589 case Intrinsic::riscv_vsuxei:
1590 case Intrinsic::riscv_vsuxei_mask:
1591 return SetRVVLoadStoreInfo(/*PtrOp*/ 1,
1592 /*IsStore*/ true,
1593 /*IsUnitStrided*/ false);
1594 case Intrinsic::riscv_vlseg2:
1595 case Intrinsic::riscv_vlseg3:
1596 case Intrinsic::riscv_vlseg4:
1597 case Intrinsic::riscv_vlseg5:
1598 case Intrinsic::riscv_vlseg6:
1599 case Intrinsic::riscv_vlseg7:
1600 case Intrinsic::riscv_vlseg8:
1601 case Intrinsic::riscv_vlseg2ff:
1602 case Intrinsic::riscv_vlseg3ff:
1603 case Intrinsic::riscv_vlseg4ff:
1604 case Intrinsic::riscv_vlseg5ff:
1605 case Intrinsic::riscv_vlseg6ff:
1606 case Intrinsic::riscv_vlseg7ff:
1607 case Intrinsic::riscv_vlseg8ff:
1608 return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 2,
1609 /*IsStore*/ false,
1610 /*IsUnitStrided*/ false, /*UsePtrVal*/ true);
1611 case Intrinsic::riscv_vlseg2_mask:
1612 case Intrinsic::riscv_vlseg3_mask:
1613 case Intrinsic::riscv_vlseg4_mask:
1614 case Intrinsic::riscv_vlseg5_mask:
1615 case Intrinsic::riscv_vlseg6_mask:
1616 case Intrinsic::riscv_vlseg7_mask:
1617 case Intrinsic::riscv_vlseg8_mask:
1618 case Intrinsic::riscv_vlseg2ff_mask:
1619 case Intrinsic::riscv_vlseg3ff_mask:
1620 case Intrinsic::riscv_vlseg4ff_mask:
1621 case Intrinsic::riscv_vlseg5ff_mask:
1622 case Intrinsic::riscv_vlseg6ff_mask:
1623 case Intrinsic::riscv_vlseg7ff_mask:
1624 case Intrinsic::riscv_vlseg8ff_mask:
1625 return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 4,
1626 /*IsStore*/ false,
1627 /*IsUnitStrided*/ false, /*UsePtrVal*/ true);
1628 case Intrinsic::riscv_vlsseg2:
1629 case Intrinsic::riscv_vlsseg3:
1630 case Intrinsic::riscv_vlsseg4:
1631 case Intrinsic::riscv_vlsseg5:
1632 case Intrinsic::riscv_vlsseg6:
1633 case Intrinsic::riscv_vlsseg7:
1634 case Intrinsic::riscv_vlsseg8:
1635 case Intrinsic::riscv_vloxseg2:
1636 case Intrinsic::riscv_vloxseg3:
1637 case Intrinsic::riscv_vloxseg4:
1638 case Intrinsic::riscv_vloxseg5:
1639 case Intrinsic::riscv_vloxseg6:
1640 case Intrinsic::riscv_vloxseg7:
1641 case Intrinsic::riscv_vloxseg8:
1642 case Intrinsic::riscv_vluxseg2:
1643 case Intrinsic::riscv_vluxseg3:
1644 case Intrinsic::riscv_vluxseg4:
1645 case Intrinsic::riscv_vluxseg5:
1646 case Intrinsic::riscv_vluxseg6:
1647 case Intrinsic::riscv_vluxseg7:
1648 case Intrinsic::riscv_vluxseg8:
1649 return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3,
1650 /*IsStore*/ false,
1651 /*IsUnitStrided*/ false);
1652 case Intrinsic::riscv_vlsseg2_mask:
1653 case Intrinsic::riscv_vlsseg3_mask:
1654 case Intrinsic::riscv_vlsseg4_mask:
1655 case Intrinsic::riscv_vlsseg5_mask:
1656 case Intrinsic::riscv_vlsseg6_mask:
1657 case Intrinsic::riscv_vlsseg7_mask:
1658 case Intrinsic::riscv_vlsseg8_mask:
1659 case Intrinsic::riscv_vloxseg2_mask:
1660 case Intrinsic::riscv_vloxseg3_mask:
1661 case Intrinsic::riscv_vloxseg4_mask:
1662 case Intrinsic::riscv_vloxseg5_mask:
1663 case Intrinsic::riscv_vloxseg6_mask:
1664 case Intrinsic::riscv_vloxseg7_mask:
1665 case Intrinsic::riscv_vloxseg8_mask:
1666 case Intrinsic::riscv_vluxseg2_mask:
1667 case Intrinsic::riscv_vluxseg3_mask:
1668 case Intrinsic::riscv_vluxseg4_mask:
1669 case Intrinsic::riscv_vluxseg5_mask:
1670 case Intrinsic::riscv_vluxseg6_mask:
1671 case Intrinsic::riscv_vluxseg7_mask:
1672 case Intrinsic::riscv_vluxseg8_mask:
1673 return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 5,
1674 /*IsStore*/ false,
1675 /*IsUnitStrided*/ false);
1676 case Intrinsic::riscv_vsseg2:
1677 case Intrinsic::riscv_vsseg3:
1678 case Intrinsic::riscv_vsseg4:
1679 case Intrinsic::riscv_vsseg5:
1680 case Intrinsic::riscv_vsseg6:
1681 case Intrinsic::riscv_vsseg7:
1682 case Intrinsic::riscv_vsseg8:
1683 return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 2,
1684 /*IsStore*/ true,
1685 /*IsUnitStrided*/ false);
1686 case Intrinsic::riscv_vsseg2_mask:
1687 case Intrinsic::riscv_vsseg3_mask:
1688 case Intrinsic::riscv_vsseg4_mask:
1689 case Intrinsic::riscv_vsseg5_mask:
1690 case Intrinsic::riscv_vsseg6_mask:
1691 case Intrinsic::riscv_vsseg7_mask:
1692 case Intrinsic::riscv_vsseg8_mask:
1693 return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3,
1694 /*IsStore*/ true,
1695 /*IsUnitStrided*/ false);
1696 case Intrinsic::riscv_vssseg2:
1697 case Intrinsic::riscv_vssseg3:
1698 case Intrinsic::riscv_vssseg4:
1699 case Intrinsic::riscv_vssseg5:
1700 case Intrinsic::riscv_vssseg6:
1701 case Intrinsic::riscv_vssseg7:
1702 case Intrinsic::riscv_vssseg8:
1703 case Intrinsic::riscv_vsoxseg2:
1704 case Intrinsic::riscv_vsoxseg3:
1705 case Intrinsic::riscv_vsoxseg4:
1706 case Intrinsic::riscv_vsoxseg5:
1707 case Intrinsic::riscv_vsoxseg6:
1708 case Intrinsic::riscv_vsoxseg7:
1709 case Intrinsic::riscv_vsoxseg8:
1710 case Intrinsic::riscv_vsuxseg2:
1711 case Intrinsic::riscv_vsuxseg3:
1712 case Intrinsic::riscv_vsuxseg4:
1713 case Intrinsic::riscv_vsuxseg5:
1714 case Intrinsic::riscv_vsuxseg6:
1715 case Intrinsic::riscv_vsuxseg7:
1716 case Intrinsic::riscv_vsuxseg8:
1717 return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3,
1718 /*IsStore*/ true,
1719 /*IsUnitStrided*/ false);
1720 case Intrinsic::riscv_vssseg2_mask:
1721 case Intrinsic::riscv_vssseg3_mask:
1722 case Intrinsic::riscv_vssseg4_mask:
1723 case Intrinsic::riscv_vssseg5_mask:
1724 case Intrinsic::riscv_vssseg6_mask:
1725 case Intrinsic::riscv_vssseg7_mask:
1726 case Intrinsic::riscv_vssseg8_mask:
1727 case Intrinsic::riscv_vsoxseg2_mask:
1728 case Intrinsic::riscv_vsoxseg3_mask:
1729 case Intrinsic::riscv_vsoxseg4_mask:
1730 case Intrinsic::riscv_vsoxseg5_mask:
1731 case Intrinsic::riscv_vsoxseg6_mask:
1732 case Intrinsic::riscv_vsoxseg7_mask:
1733 case Intrinsic::riscv_vsoxseg8_mask:
1734 case Intrinsic::riscv_vsuxseg2_mask:
1735 case Intrinsic::riscv_vsuxseg3_mask:
1736 case Intrinsic::riscv_vsuxseg4_mask:
1737 case Intrinsic::riscv_vsuxseg5_mask:
1738 case Intrinsic::riscv_vsuxseg6_mask:
1739 case Intrinsic::riscv_vsuxseg7_mask:
1740 case Intrinsic::riscv_vsuxseg8_mask:
1741 return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 4,
1742 /*IsStore*/ true,
1743 /*IsUnitStrided*/ false);
1744 }
1745}
1746
1747bool RISCVTargetLowering::isLegalAddressingMode(const DataLayout &DL,
1748 const AddrMode &AM, Type *Ty,
1749 unsigned AS,
1750 Instruction *I) const {
1751 // No global is ever allowed as a base.
1752 if (AM.BaseGV)
1753 return false;
1754
1755 // None of our addressing modes allows a scalable offset
1756 if (AM.ScalableOffset)
1757 return false;
1758
1759 // RVV instructions only support register addressing.
1760 if (Subtarget.hasVInstructions() && isa<VectorType>(Ty))
1761 return AM.HasBaseReg && AM.Scale == 0 && !AM.BaseOffs;
1762
1763 // Require a 12-bit signed offset.
1764 if (!isInt<12>(AM.BaseOffs))
1765 return false;
1766
1767 switch (AM.Scale) {
1768 case 0: // "r+i" or just "i", depending on HasBaseReg.
1769 break;
1770 case 1:
1771 if (!AM.HasBaseReg) // allow "r+i".
1772 break;
1773 return false; // disallow "r+r" or "r+r+i".
1774 default:
1775 return false;
1776 }
1777
1778 return true;
1779}
1780
1782 return isInt<12>(Imm);
1783}
1784
1786 return isInt<12>(Imm);
1787}
1788
1789// On RV32, 64-bit integers are split into their high and low parts and held
1790// in two different registers, so the trunc is free since the low register can
1791// just be used.
1792// FIXME: Should we consider i64->i32 free on RV64 to match the EVT version of
1793// isTruncateFree?
1795 if (Subtarget.is64Bit() || !SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
1796 return false;
1797 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
1798 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
1799 return (SrcBits == 64 && DestBits == 32);
1800}
1801
1803 // We consider i64->i32 free on RV64 since we have good selection of W
1804 // instructions that make promoting operations back to i64 free in many cases.
1805 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
1806 !DstVT.isInteger())
1807 return false;
1808 unsigned SrcBits = SrcVT.getSizeInBits();
1809 unsigned DestBits = DstVT.getSizeInBits();
1810 return (SrcBits == 64 && DestBits == 32);
1811}
1812
1814 EVT SrcVT = Val.getValueType();
1815 // free truncate from vnsrl and vnsra
1816 if (Subtarget.hasVInstructions() &&
1817 (Val.getOpcode() == ISD::SRL || Val.getOpcode() == ISD::SRA) &&
1818 SrcVT.isVector() && VT2.isVector()) {
1819 unsigned SrcBits = SrcVT.getVectorElementType().getSizeInBits();
1820 unsigned DestBits = VT2.getVectorElementType().getSizeInBits();
1821 if (SrcBits == DestBits * 2) {
1822 return true;
1823 }
1824 }
1825 return TargetLowering::isTruncateFree(Val, VT2);
1826}
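// For example, truncating the result of an i32 element-wise srl by 16 down to
// i16 elements is free here because the shift and the truncate together can be
// selected as a single vnsrl.wi.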
1827
1829 // Zexts are free if they can be combined with a load.
1830 // Don't advertise i32->i64 zextload as being free for RV64. It interacts
1831 // poorly with type legalization of compares preferring sext.
1832 if (auto *LD = dyn_cast<LoadSDNode>(Val)) {
1833 EVT MemVT = LD->getMemoryVT();
1834 if ((MemVT == MVT::i8 || MemVT == MVT::i16) &&
1835 (LD->getExtensionType() == ISD::NON_EXTLOAD ||
1836 LD->getExtensionType() == ISD::ZEXTLOAD))
1837 return true;
1838 }
1839
1840 return TargetLowering::isZExtFree(Val, VT2);
1841}
1842
1844 return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64;
1845}
1846
1848 return Subtarget.is64Bit() && CI->getType()->isIntegerTy(32);
1849}
1850
1852 return Subtarget.hasStdExtZbb() ||
1853 (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit());
1854}
1855
1857 return Subtarget.hasStdExtZbb() || Subtarget.hasVendorXTHeadBb() ||
1858 (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit());
1859}
1860
1862 const Instruction &AndI) const {
1863 // We expect to be able to match a bit extraction instruction if the Zbs
1864 // extension is supported and the mask is a power of two. However, we
1865 // conservatively return false if the mask would fit in an ANDI instruction,
1866 // on the basis that it's possible the sinking+duplication of the AND in
1867 // CodeGenPrepare triggered by this hook wouldn't decrease the instruction
1868 // count and would increase code size (e.g. ANDI+BNEZ => BEXTI+BNEZ).
1869 if (!Subtarget.hasStdExtZbs() && !Subtarget.hasVendorXTHeadBs())
1870 return false;
1871 ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
1872 if (!Mask)
1873 return false;
1874 return !Mask->getValue().isSignedIntN(12) && Mask->getValue().isPowerOf2();
1875}
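// Sketch of the tradeoff described above (exact codegen depends on later
// passes):
//   (x & 2048) != 0  ->  bexti + bnez   (2048 does not fit ANDI's 12-bit
//                                        signed immediate, so return true)
//   (x & 8)    != 0  ->  andi + bnez    (fits ANDI, so return false and keep
//                                        the cheaper form)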
1876
1878 EVT VT = Y.getValueType();
1879
1880 // FIXME: Support vectors once we have tests.
1881 if (VT.isVector())
1882 return false;
1883
1884 return (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbkb()) &&
1885 (!isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque());
1886}
1887
1889 // Zbs provides BEXT[_I], which can be used with SEQZ/SNEZ as a bit test.
1890 if (Subtarget.hasStdExtZbs())
1891 return X.getValueType().isScalarInteger();
1892 auto *C = dyn_cast<ConstantSDNode>(Y);
1893 // XTheadBs provides th.tst (similar to bexti), if Y is a constant
1894 if (Subtarget.hasVendorXTHeadBs())
1895 return C != nullptr;
1896 // We can use ANDI+SEQZ/SNEZ as a bit test. Y contains the bit position.
1897 return C && C->getAPIntValue().ule(10);
1898}
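// The ule(10) bound above lines up with ANDI's 12-bit signed immediate: a bit
// test of bit 10 needs the mask 1024, which fits, while bit 11 would need
// 2048, which does not.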
1899
1901 EVT VT) const {
1902 // Only enable for rvv.
1903 if (!VT.isVector() || !Subtarget.hasVInstructions())
1904 return false;
1905
1906 if (VT.isFixedLengthVector() && !isTypeLegal(VT))
1907 return false;
1908
1909 return true;
1910}
1911
1913 Type *Ty) const {
1914 assert(Ty->isIntegerTy());
1915
1916 unsigned BitSize = Ty->getIntegerBitWidth();
1917 if (BitSize > Subtarget.getXLen())
1918 return false;
1919
1920 // Fast path, assume 32-bit immediates are cheap.
1921 int64_t Val = Imm.getSExtValue();
1922 if (isInt<32>(Val))
1923 return true;
1924
1925 // A constant pool entry may be more aligned than the load we're trying to
1926 // replace. If we don't support unaligned scalar mem, prefer the constant
1927 // pool.
1928 // TODO: Can the caller pass down the alignment?
1929 if (!Subtarget.enableUnalignedScalarMem())
1930 return true;
1931
1932 // Prefer to keep the load if it would require many instructions.
1933 // This uses the same threshold we use for constant pools but doesn't
1934 // check useConstantPoolForLargeInts.
1935 // TODO: Should we keep the load only when we're definitely going to emit a
1936 // constant pool?
1937
1938 RISCVMatInt::InstSeq Seq = RISCVMatInt::generateInstSeq(Val, Subtarget);
1939 return Seq.size() <= Subtarget.getMaxBuildIntsCost();
1940}
1941
1945 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
1946 SelectionDAG &DAG) const {
1947 // One interesting pattern that we'd want to form is 'bit extract':
1948 // ((1 >> Y) & 1) ==/!= 0
1949 // But we also need to be careful not to try to reverse that fold.
1950
1951 // Is this '((1 >> Y) & 1)'?
1952 if (XC && OldShiftOpcode == ISD::SRL && XC->isOne())
1953 return false; // Keep the 'bit extract' pattern.
1954
1955 // Will this be '((1 >> Y) & 1)' after the transform?
1956 if (NewShiftOpcode == ISD::SRL && CC->isOne())
1957 return true; // Do form the 'bit extract' pattern.
1958
1959 // If 'X' is a constant, and we transform, then we will immediately
1960 // try to undo the fold, thus causing endless combine loop.
1961 // So only do the transform if X is not a constant. This matches the default
1962 // implementation of this function.
1963 return !XC;
1964}
1965
1966bool RISCVTargetLowering::canSplatOperand(unsigned Opcode, int Operand) const {
1967 switch (Opcode) {
1968 case Instruction::Add:
1969 case Instruction::Sub:
1970 case Instruction::Mul:
1971 case Instruction::And:
1972 case Instruction::Or:
1973 case Instruction::Xor:
1974 case Instruction::FAdd:
1975 case Instruction::FSub:
1976 case Instruction::FMul:
1977 case Instruction::FDiv:
1978 case Instruction::ICmp:
1979 case Instruction::FCmp:
1980 return true;
1981 case Instruction::Shl:
1982 case Instruction::LShr:
1983 case Instruction::AShr:
1984 case Instruction::UDiv:
1985 case Instruction::SDiv:
1986 case Instruction::URem:
1987 case Instruction::SRem:
1988 case Instruction::Select:
1989 return Operand == 1;
1990 default:
1991 return false;
1992 }
1993}
1994
1995
1997 if (!I->getType()->isVectorTy() || !Subtarget.hasVInstructions())
1998 return false;
1999
2000 if (canSplatOperand(I->getOpcode(), Operand))
2001 return true;
2002
2003 auto *II = dyn_cast<IntrinsicInst>(I);
2004 if (!II)
2005 return false;
2006
2007 switch (II->getIntrinsicID()) {
2008 case Intrinsic::fma:
2009 case Intrinsic::vp_fma:
2010 return Operand == 0 || Operand == 1;
2011 case Intrinsic::vp_shl:
2012 case Intrinsic::vp_lshr:
2013 case Intrinsic::vp_ashr:
2014 case Intrinsic::vp_udiv:
2015 case Intrinsic::vp_sdiv:
2016 case Intrinsic::vp_urem:
2017 case Intrinsic::vp_srem:
2018 case Intrinsic::ssub_sat:
2019 case Intrinsic::vp_ssub_sat:
2020 case Intrinsic::usub_sat:
2021 case Intrinsic::vp_usub_sat:
2022 return Operand == 1;
2023 // These intrinsics are commutative.
2024 case Intrinsic::vp_add:
2025 case Intrinsic::vp_mul:
2026 case Intrinsic::vp_and:
2027 case Intrinsic::vp_or:
2028 case Intrinsic::vp_xor:
2029 case Intrinsic::vp_fadd:
2030 case Intrinsic::vp_fmul:
2031 case Intrinsic::vp_icmp:
2032 case Intrinsic::vp_fcmp:
2033 case Intrinsic::smin:
2034 case Intrinsic::vp_smin:
2035 case Intrinsic::umin:
2036 case Intrinsic::vp_umin:
2037 case Intrinsic::smax:
2038 case Intrinsic::vp_smax:
2039 case Intrinsic::umax:
2040 case Intrinsic::vp_umax:
2041 case Intrinsic::sadd_sat:
2042 case Intrinsic::vp_sadd_sat:
2043 case Intrinsic::uadd_sat:
2044 case Intrinsic::vp_uadd_sat:
2045 // These intrinsics have 'vr' versions.
2046 case Intrinsic::vp_sub:
2047 case Intrinsic::vp_fsub:
2048 case Intrinsic::vp_fdiv:
2049 return Operand == 0 || Operand == 1;
2050 default:
2051 return false;
2052 }
2053}
2054
2055/// Check if sinking \p I's operands to I's basic block is profitable, because
2056/// the operands can be folded into a target instruction, e.g.
2057/// splats of scalars can fold into vector instructions.
2059 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
2060 using namespace llvm::PatternMatch;
2061
2062 if (!I->getType()->isVectorTy() || !Subtarget.hasVInstructions())
2063 return false;
2064
2065 // Don't sink splat operands if the target prefers not to. Some targets require
2066 // S2V transfer buffers and we can run out of them copying the same value
2067 // repeatedly.
2068 // FIXME: It could still be worth doing if it would improve vector register
2069 // pressure and prevent a vector spill.
2070 if (!Subtarget.sinkSplatOperands())
2071 return false;
2072
2073 for (auto OpIdx : enumerate(I->operands())) {
2074 if (!canSplatOperand(I, OpIdx.index()))
2075 continue;
2076
2077 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2078 // Make sure we are not already sinking this operand
2079 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2080 continue;
2081
2082 // We are looking for a splat that can be sunk.
2084 m_Undef(), m_ZeroMask())))
2085 continue;
2086
2087 // Don't sink i1 splats.
2088 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
2089 continue;
2090
2091 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
2092 // and vector registers
2093 for (Use &U : Op->uses()) {
2094 Instruction *Insn = cast<Instruction>(U.getUser());
2095 if (!canSplatOperand(Insn, U.getOperandNo()))
2096 return false;
2097 }
2098
2099 Ops.push_back(&Op->getOperandUse(0));
2100 Ops.push_back(&OpIdx.value());
2101 }
2102 return true;
2103}
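// Sketch of the splat shape being sunk above (assuming a subtarget where
// sinkSplatOperands() returns true):
//   %head  = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
//   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> poison,
//                          <vscale x 4 x i32> zeroinitializer
//   %r     = add <vscale x 4 x i32> %v, %splat
// Sinking the insertelement/shufflevector next to the add lets it be selected
// as a single vadd.vx instead of a vmv.v.x followed by vadd.vv.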
2104
2106 unsigned Opc = VecOp.getOpcode();
2107
2108 // Assume target opcodes can't be scalarized.
2109 // TODO - do we have any exceptions?
2110 if (Opc >= ISD::BUILTIN_OP_END)
2111 return false;
2112
2113 // If the vector op is not supported, try to convert to scalar.
2114 EVT VecVT = VecOp.getValueType();
2115 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
2116 return true;
2117
2118 // If the vector op is supported, but the scalar op is not, the transform may
2119 // not be worthwhile.
2120 // Permit a vector binary operation to be converted to a scalar binary
2121 // operation which is custom lowered with an illegal type.
2122 EVT ScalarVT = VecVT.getScalarType();
2123 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT) ||
2124 isOperationCustom(Opc, ScalarVT);
2125}
2126
2128 const GlobalAddressSDNode *GA) const {
2129 // In order to maximise the opportunity for common subexpression elimination,
2130 // keep a separate ADD node for the global address offset instead of folding
2131 // it in the global address node. Later peephole optimisations may choose to
2132 // fold it back in when profitable.
2133 return false;
2134}
2135
2136// Return one of the followings:
2137// (1) `{0-31 value, false}` if FLI is available for Imm's type and FP value.
2138// (2) `{0-31 value, true}` if Imm is negative and FLI is available for its
2139// positive counterpart, which will be materialized from the first returned
2140// element. The second returned element indicated that there should be a FNEG
2141// followed.
2142// (3) `{-1, _}` if there is no way FLI can be used to materialize Imm.
2143std::pair<int, bool> RISCVTargetLowering::getLegalZfaFPImm(const APFloat &Imm,
2144 EVT VT) const {
2145 if (!Subtarget.hasStdExtZfa())
2146 return std::make_pair(-1, false);
2147
2148 bool IsSupportedVT = false;
2149 if (VT == MVT::f16) {
2150 IsSupportedVT = Subtarget.hasStdExtZfh() || Subtarget.hasStdExtZvfh();
2151 } else if (VT == MVT::f32) {
2152 IsSupportedVT = true;
2153 } else if (VT == MVT::f64) {
2154 assert(Subtarget.hasStdExtD() && "Expect D extension");
2155 IsSupportedVT = true;
2156 }
2157
2158 if (!IsSupportedVT)
2159 return std::make_pair(-1, false);
2160
2161 int Index = RISCVLoadFPImm::getLoadFPImm(Imm);
2162 if (Index < 0 && Imm.isNegative())
2163 // Try the combination of its positive counterpart + FNEG.
2164 return std::make_pair(RISCVLoadFPImm::getLoadFPImm(-Imm), true);
2165 else
2166 return std::make_pair(Index, false);
2167}
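// Example of the contract above (a sketch): with Zfa and F, +0.5 is one of the
// 32 fli-loadable values, so getLegalZfaFPImm(+0.5, f32) returns a
// non-negative index with the bool clear (a single fli.s), while
// getLegalZfaFPImm(-0.5, f32) returns the index of +0.5 with the bool set
// (fli.s followed by fneg.s).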
2168
2170 bool ForCodeSize) const {
2171 bool IsLegalVT = false;
2172 if (VT == MVT::f16)
2173 IsLegalVT = Subtarget.hasStdExtZfhminOrZhinxmin();
2174 else if (VT == MVT::f32)
2175 IsLegalVT = Subtarget.hasStdExtFOrZfinx();
2176 else if (VT == MVT::f64)
2177 IsLegalVT = Subtarget.hasStdExtDOrZdinx();
2178 else if (VT == MVT::bf16)
2179 IsLegalVT = Subtarget.hasStdExtZfbfmin();
2180
2181 if (!IsLegalVT)
2182 return false;
2183
2184 if (getLegalZfaFPImm(Imm, VT).first >= 0)
2185 return true;
2186
2187  // Cannot create a 64-bit floating-point immediate value for RV32.
2188 if (Subtarget.getXLen() < VT.getScalarSizeInBits()) {
2189 // td can handle +0.0 or -0.0 already.
2190 // -0.0 can be created by fmv + fneg.
2191 return Imm.isZero();
2192 }
2193
2194 // Special case: fmv + fneg
2195 if (Imm.isNegZero())
2196 return true;
2197
2198 // Building an integer and then converting requires a fmv at the end of
2199 // the integer sequence.
2200 const int Cost =
2201 1 + RISCVMatInt::getIntMatCost(Imm.bitcastToAPInt(), Subtarget.getXLen(),
2202 Subtarget);
2203 return Cost <= FPImmCost;
2204}
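// --- Editorial sketch (not part of RISCVISelLowering.cpp) ---------------------
// The cost rule above in isolation: an FP immediate is accepted when the
// integer materialization of its bit pattern (IntMatCost instructions) plus
// the final fmv stays within the FPImmCost threshold. The helper name is
// hypothetical.
static bool fpImmWithinBudget(int IntMatCost, int FPImmCostThreshold) {
  const int Cost = 1 /* trailing fmv */ + IntMatCost;
  return Cost <= FPImmCostThreshold;
}
// e.g. an f32 constant whose bit pattern needs only a single LUI gives
// Cost = 2, while a pattern needing a longer LUI/ADDI/SLLI chain is rejected
// once the chain exceeds the threshold.
// ------------------------------------------------------------------------------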
2205
2206// TODO: This is very conservative.
2207bool RISCVTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2208                                                  unsigned Index) const {
2209  if (!Subtarget.hasVInstructions())
2210    return false;
2211
2212  // Only support extracting a fixed subvector from a fixed vector for now.
2213 if (ResVT.isScalableVector() || SrcVT.isScalableVector())
2214 return false;
2215
2216 EVT EltVT = ResVT.getVectorElementType();
2217 assert(EltVT == SrcVT.getVectorElementType() && "Should hold for node");
2218
2219 // The smallest type we can slide is i8.
2220 // TODO: We can extract index 0 from a mask vector without a slide.
2221 if (EltVT == MVT::i1)
2222 return false;
2223
2224 unsigned ResElts = ResVT.getVectorNumElements();
2225 unsigned SrcElts = SrcVT.getVectorNumElements();
2226
2227 unsigned MinVLen = Subtarget.getRealMinVLen();
2228 unsigned MinVLMAX = MinVLen / EltVT.getSizeInBits();
2229
2230 // If we're extracting only data from the first VLEN bits of the source
2231 // then we can always do this with an m1 vslidedown.vx. Restricting the
2232 // Index ensures we can use a vslidedown.vi.
2233 // TODO: We can generalize this when the exact VLEN is known.
2234 if (Index + ResElts <= MinVLMAX && Index < 31)
2235 return true;
2236
2237  // Conservatively only handle extracting half of a vector.
2238  // TODO: For sizes which aren't multiples of VLEN sizes, this may not be
2239  // a cheap extract. However, this case is important in practice for
2240  // shuffled extracts of longer vectors. How do we resolve this?
2241 if ((ResElts * 2) != SrcElts)
2242 return false;
2243
2244 // Slide can support arbitrary index, but we only treat vslidedown.vi as
2245 // cheap.
2246 if (Index >= 32)
2247 return false;
2248
2249 // TODO: We can do arbitrary slidedowns, but for now only support extracting
2250 // the upper half of a vector until we have more test coverage.
2251 return Index == 0 || Index == ResElts;
2252}
2253
2254MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2255                                                       CallingConv::ID CC,
2256                                                       EVT VT) const {
2257 // Use f32 to pass f16 if it is legal and Zfh/Zfhmin is not enabled.
2258 // We might still end up using a GPR but that will be decided based on ABI.
2259 if (VT == MVT::f16 && Subtarget.hasStdExtFOrZfinx() &&
2260 !Subtarget.hasStdExtZfhminOrZhinxmin())
2261 return MVT::f32;
2262
2263  MVT PartVT = TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2264
2265 return PartVT;
2266}
2267
2268unsigned RISCVTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2269                                                            CallingConv::ID CC,
2270                                                            EVT VT) const {
2271 // Use f32 to pass f16 if it is legal and Zfh/Zfhmin is not enabled.
2272 // We might still end up using a GPR but that will be decided based on ABI.
2273 if (VT == MVT::f16 && Subtarget.hasStdExtFOrZfinx() &&
2274 !Subtarget.hasStdExtZfhminOrZhinxmin())
2275 return 1;
2276
2277  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2278}
2279
2280unsigned RISCVTargetLowering::getVectorTypeBreakdownForCallingConv(
2281    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2282    unsigned &NumIntermediates, MVT &RegisterVT) const {
2283  unsigned NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
2284      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
2285
2286 return NumRegs;
2287}
2288
2289// Changes the condition code and swaps operands if necessary, so the SetCC
2290// operation matches one of the comparisons supported directly by branches
2291// in the RISC-V ISA. May adjust compares to favor compare with 0 over compare
2292// with 1/-1.
2293static void translateSetCCForBranch(const SDLoc &DL, SDValue &LHS, SDValue &RHS,
2294 ISD::CondCode &CC, SelectionDAG &DAG) {
2295 // If this is a single bit test that can't be handled by ANDI, shift the
2296 // bit to be tested to the MSB and perform a signed compare with 0.
2297 if (isIntEqualitySetCC(CC) && isNullConstant(RHS) &&
2298 LHS.getOpcode() == ISD::AND && LHS.hasOneUse() &&
2299 isa<ConstantSDNode>(LHS.getOperand(1))) {
2300 uint64_t Mask = LHS.getConstantOperandVal(1);
2301 if ((isPowerOf2_64(Mask) || isMask_64(Mask)) && !isInt<12>(Mask)) {
2302 unsigned ShAmt = 0;
2303 if (isPowerOf2_64(Mask)) {
2304        CC = CC == ISD::SETEQ ? ISD::SETGE : ISD::SETLT;
2305        ShAmt = LHS.getValueSizeInBits() - 1 - Log2_64(Mask);
2306 } else {
2307 ShAmt = LHS.getValueSizeInBits() - llvm::bit_width(Mask);
2308 }
2309
2310 LHS = LHS.getOperand(0);
2311 if (ShAmt != 0)
2312 LHS = DAG.getNode(ISD::SHL, DL, LHS.getValueType(), LHS,
2313 DAG.getConstant(ShAmt, DL, LHS.getValueType()));
2314 return;
2315 }
2316 }
2317
2318 if (auto *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2319 int64_t C = RHSC->getSExtValue();
2320 switch (CC) {
2321 default: break;
2322 case ISD::SETGT:
2323 // Convert X > -1 to X >= 0.
2324 if (C == -1) {
2325 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2326 CC = ISD::SETGE;
2327 return;
2328 }
2329 break;
2330 case ISD::SETLT:
2331 // Convert X < 1 to 0 >= X.
2332 if (C == 1) {
2333 RHS = LHS;
2334 LHS = DAG.getConstant(0, DL, RHS.getValueType());
2335 CC = ISD::SETGE;
2336 return;
2337 }
2338 break;
2339 }
2340 }
2341
2342 switch (CC) {
2343 default:
2344 break;
2345 case ISD::SETGT:
2346 case ISD::SETLE:
2347 case ISD::SETUGT:
2348 case ISD::SETULE:
2349    CC = ISD::getSetCCSwappedOperands(CC);
2350    std::swap(LHS, RHS);
2351 break;
2352 }
2353}
2354
2355RISCVII::VLMUL RISCVTargetLowering::getLMUL(MVT VT) {
2356  assert(VT.isScalableVector() && "Expecting a scalable vector type");
2357 unsigned KnownSize = VT.getSizeInBits().getKnownMinValue();
2358 if (VT.getVectorElementType() == MVT::i1)
2359 KnownSize *= 8;
2360
2361 switch (KnownSize) {
2362 default:
2363 llvm_unreachable("Invalid LMUL.");
2364  case 8:
2365    return RISCVII::VLMUL::LMUL_F8;
2366  case 16:
2367    return RISCVII::VLMUL::LMUL_F4;
2368  case 32:
2369    return RISCVII::VLMUL::LMUL_F2;
2370  case 64:
2371    return RISCVII::VLMUL::LMUL_1;
2372  case 128:
2373    return RISCVII::VLMUL::LMUL_2;
2374  case 256:
2375    return RISCVII::VLMUL::LMUL_4;
2376  case 512:
2377    return RISCVII::VLMUL::LMUL_8;
2378  }
2379}
2380
2381unsigned RISCVTargetLowering::getRegClassIDForLMUL(RISCVII::VLMUL LMul) {
2382  switch (LMul) {
2383  default:
2384    llvm_unreachable("Invalid LMUL.");
2385  case RISCVII::VLMUL::LMUL_F8:
2386  case RISCVII::VLMUL::LMUL_F4:
2387  case RISCVII::VLMUL::LMUL_F2:
2388  case RISCVII::VLMUL::LMUL_1:
2389    return RISCV::VRRegClassID;
2390  case RISCVII::VLMUL::LMUL_2:
2391    return RISCV::VRM2RegClassID;
2392  case RISCVII::VLMUL::LMUL_4:
2393    return RISCV::VRM4RegClassID;
2394  case RISCVII::VLMUL::LMUL_8:
2395    return RISCV::VRM8RegClassID;
2396 }
2397}
2398
2399unsigned RISCVTargetLowering::getSubregIndexByMVT(MVT VT, unsigned Index) {
2400  RISCVII::VLMUL LMUL = getLMUL(VT);
2401 if (LMUL == RISCVII::VLMUL::LMUL_F8 ||
2402 LMUL == RISCVII::VLMUL::LMUL_F4 ||
2403 LMUL == RISCVII::VLMUL::LMUL_F2 ||
2404 LMUL == RISCVII::VLMUL::LMUL_1) {
2405 static_assert(RISCV::sub_vrm1_7 == RISCV::sub_vrm1_0 + 7,
2406 "Unexpected subreg numbering");
2407 return RISCV::sub_vrm1_0 + Index;
2408 }
2409 if (LMUL == RISCVII::VLMUL::LMUL_2) {
2410 static_assert(RISCV::sub_vrm2_3 == RISCV::sub_vrm2_0 + 3,
2411 "Unexpected subreg numbering");
2412 return RISCV::sub_vrm2_0 + Index;
2413 }
2414 if (LMUL == RISCVII::VLMUL::LMUL_4) {
2415 static_assert(RISCV::sub_vrm4_1 == RISCV::sub_vrm4_0 + 1,
2416 "Unexpected subreg numbering");
2417 return RISCV::sub_vrm4_0 + Index;
2418 }
2419 llvm_unreachable("Invalid vector type.");
2420}
2421
2422unsigned RISCVTargetLowering::getRegClassIDForVecVT(MVT VT) {
2423  if (VT.getVectorElementType() == MVT::i1)
2424 return RISCV::VRRegClassID;
2425 return getRegClassIDForLMUL(getLMUL(VT));
2426}
2427
2428// Attempt to decompose a subvector insert/extract between VecVT and
2429// SubVecVT via subregister indices. Returns the subregister index that
2430// can perform the subvector insert/extract with the given element index, as
2431// well as the index corresponding to any leftover subvectors that must be
2432// further inserted/extracted within the register class for SubVecVT.
2433std::pair<unsigned, unsigned>
2434RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
2435    MVT VecVT, MVT SubVecVT, unsigned InsertExtractIdx,
2436 const RISCVRegisterInfo *TRI) {
2437 static_assert((RISCV::VRM8RegClassID > RISCV::VRM4RegClassID &&
2438 RISCV::VRM4RegClassID > RISCV::VRM2RegClassID &&
2439 RISCV::VRM2RegClassID > RISCV::VRRegClassID),
2440 "Register classes not ordered");
2441 unsigned VecRegClassID = getRegClassIDForVecVT(VecVT);
2442 unsigned SubRegClassID = getRegClassIDForVecVT(SubVecVT);
2443 // Try to compose a subregister index that takes us from the incoming
2444  // LMUL>1 register class down to the outgoing one. At each step we halve
2445  // the LMUL:
2446 // nxv16i32@12 -> nxv2i32: sub_vrm4_1_then_sub_vrm2_1_then_sub_vrm1_0
2447 // Note that this is not guaranteed to find a subregister index, such as
2448 // when we are extracting from one VR type to another.
2449 unsigned SubRegIdx = RISCV::NoSubRegister;
2450 for (const unsigned RCID :
2451 {RISCV::VRM4RegClassID, RISCV::VRM2RegClassID, RISCV::VRRegClassID})
2452 if (VecRegClassID > RCID && SubRegClassID <= RCID) {
2453 VecVT = VecVT.getHalfNumVectorElementsVT();
2454 bool IsHi =
2455 InsertExtractIdx >= VecVT.getVectorElementCount().getKnownMinValue();
2456 SubRegIdx = TRI->composeSubRegIndices(SubRegIdx,
2457 getSubregIndexByMVT(VecVT, IsHi));
2458 if (IsHi)
2459 InsertExtractIdx -= VecVT.getVectorElementCount().getKnownMinValue();
2460 }
2461 return {SubRegIdx, InsertExtractIdx};
2462}
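// --- Editorial sketch (not part of RISCVISelLowering.cpp) ---------------------
// The decomposition loop above, traced with plain integers for the
// nxv16i32@12 -> nxv2i32 example in the comment. Element counts halve at each
// step (16 -> 8 -> 4 -> 2), mirroring the halving of LMUL.
#include <cstdio>

static void traceDecomposition() {
  unsigned Idx = 12;
  unsigned Elts = 16; // known-min element count of the LMUL=8 source
  const char *Prefix[] = {"sub_vrm4_", "sub_vrm2_", "sub_vrm1_"};
  for (int Step = 0; Step < 3; ++Step) {
    Elts /= 2;               // halve the LMUL
    bool IsHi = Idx >= Elts; // does the subvector live in the upper half?
    std::printf("%s%d ", Prefix[Step], IsHi ? 1 : 0);
    if (IsHi)
      Idx -= Elts;
  }
  // Prints: sub_vrm4_1 sub_vrm2_1 sub_vrm1_0 -> leftover index 0
  std::printf("-> leftover index %u\n", Idx);
}
// ------------------------------------------------------------------------------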
2463
2464// Permit combining of mask vectors as BUILD_VECTOR never expands to scalar
2465// stores for those types.
2466bool RISCVTargetLowering::mergeStoresAfterLegalization(EVT VT) const {
2467 return !Subtarget.useRVVForFixedLengthVectors() ||
2468 (VT.isFixedLengthVector() && VT.getVectorElementType() == MVT::i1);
2469}
2470
2471bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const {
2472  if (!ScalarTy.isSimple())
2473 return false;
2474 switch (ScalarTy.getSimpleVT().SimpleTy) {
2475 case MVT::iPTR:
2476 return Subtarget.is64Bit() ? Subtarget.hasVInstructionsI64() : true;
2477 case MVT::i8:
2478 case MVT::i16:
2479 case MVT::i32:
2480 return true;
2481 case MVT::i64:
2482 return Subtarget.hasVInstructionsI64();
2483 case MVT::f16:
2484 return Subtarget.hasVInstructionsF16();
2485 case MVT::f32:
2486 return Subtarget.hasVInstructionsF32();
2487 case MVT::f64:
2488 return Subtarget.hasVInstructionsF64();
2489 default:
2490 return false;
2491 }
2492}
2493
2494
2495unsigned RISCVTargetLowering::combineRepeatedFPDivisors() const {
2496 return NumRepeatedDivisors;
2497}
2498
2499static SDValue getVLOperand(SDValue Op) {
2500  assert((Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
2501 Op.getOpcode() == ISD::INTRINSIC_W_CHAIN) &&
2502 "Unexpected opcode");
2503 bool HasChain = Op.getOpcode() == ISD::INTRINSIC_W_CHAIN;
2504 unsigned IntNo = Op.getConstantOperandVal(HasChain ? 1 : 0);
2505  const RISCVVIntrinsicsTable::RISCVVIntrinsicInfo *II =
2506      RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IntNo);
2507 if (!II)
2508 return SDValue();
2509 return Op.getOperand(II->VLOperand + 1 + HasChain);
2510}
2511
2512static bool useRVVForFixedLengthVectorVT(MVT VT,
2513                                         const RISCVSubtarget &Subtarget) {
2514 assert(VT.isFixedLengthVector() && "Expected a fixed length vector type!");
2515 if (!Subtarget.useRVVForFixedLengthVectors())
2516 return false;
2517
2518 // We only support a set of vector types with a consistent maximum fixed size
2519 // across all supported vector element types to avoid legalization issues.
2520 // Therefore -- since the largest is v1024i8/v512i16/etc -- the largest
2521 // fixed-length vector type we support is 1024 bytes.
2522 if (VT.getFixedSizeInBits() > 1024 * 8)
2523 return false;
2524
2525 unsigned MinVLen = Subtarget.getRealMinVLen();
2526
2527 MVT EltVT = VT.getVectorElementType();
2528
2529 // Don't use RVV for vectors we cannot scalarize if required.
2530 switch (EltVT.SimpleTy) {
2531 // i1 is supported but has different rules.
2532 default:
2533 return false;
2534 case MVT::i1:
2535 // Masks can only use a single register.
2536 if (VT.getVectorNumElements() > MinVLen)
2537 return false;
2538 MinVLen /= 8;
2539 break;
2540 case MVT::i8:
2541 case MVT::i16:
2542 case MVT::i32:
2543 break;
2544 case MVT::i64:
2545 if (!Subtarget.hasVInstructionsI64())
2546 return false;
2547 break;
2548 case MVT::f16:
2549 if (!Subtarget.hasVInstructionsF16Minimal())
2550 return false;
2551 break;
2552 case MVT::bf16:
2553 if (!Subtarget.hasVInstructionsBF16Minimal())
2554 return false;
2555 break;
2556 case MVT::f32:
2557 if (!Subtarget.hasVInstructionsF32())
2558 return false;
2559 break;
2560 case MVT::f64:
2561 if (!Subtarget.hasVInstructionsF64())
2562 return false;
2563 break;
2564 }
2565
2566 // Reject elements larger than ELEN.
2567 if (EltVT.getSizeInBits() > Subtarget.getELen())
2568 return false;
2569
2570 unsigned LMul = divideCeil(VT.getSizeInBits(), MinVLen);
2571 // Don't use RVV for types that don't fit.
2572 if (LMul > Subtarget.getMaxLMULForFixedLengthVectors())
2573 return false;
2574
2575 // TODO: Perhaps an artificial restriction, but worth having whilst getting
2576 // the base fixed length RVV support in place.
2577 if (!VT.isPow2VectorType())
2578 return false;
2579
2580 return true;
2581}
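// --- Editorial sketch (not part of RISCVISelLowering.cpp) ---------------------
// The LMUL bound checked above, distilled: a fixed-length vector is only
// mapped onto RVV when it fits within the maximum LMUL allowed for the
// configured minimum VLEN. Names here are local to this sketch.
static bool fitsWithinMaxLMUL(unsigned VecBits, unsigned MinVLenBits,
                              unsigned MaxLMUL) {
  unsigned LMul = (VecBits + MinVLenBits - 1) / MinVLenBits; // divideCeil
  return LMul <= MaxLMUL;
}
// e.g. v16i32 (512 bits) on a Zvl128b subtarget needs LMUL 4, so it is only
// used for RVV when the subtarget's fixed-length LMUL cap is at least 4; on a
// Zvl512b subtarget it fits in a single register (LMUL 1).
// ------------------------------------------------------------------------------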
2582
2583bool RISCVTargetLowering::useRVVForFixedLengthVectorVT(MVT VT) const {
2584 return ::useRVVForFixedLengthVectorVT(VT, Subtarget);
2585}
2586
2587// Return the largest legal scalable vector type that matches VT's element type.
2588static MVT getContainerForFixedLengthVector(const TargetLowering &TLI, MVT VT,
2589                                            const RISCVSubtarget &Subtarget) {
2590 // This may be called before legal types are setup.
2591 assert(((VT.isFixedLengthVector() && TLI.isTypeLegal(VT)) ||
2592 useRVVForFixedLengthVectorVT(VT, Subtarget)) &&
2593 "Expected legal fixed length vector!");
2594
2595 unsigned MinVLen = Subtarget.getRealMinVLen();
2596 unsigned MaxELen = Subtarget.getELen();
2597
2598 MVT EltVT = VT.getVectorElementType();
2599 switch (EltVT.SimpleTy) {
2600 default:
2601 llvm_unreachable("unexpected element type for RVV container");
2602 case MVT::i1:
2603 case MVT::i8:
2604 case MVT::i16:
2605 case MVT::i32:
2606 case MVT::i64:
2607 case MVT::bf16:
2608 case MVT::f16:
2609 case MVT::f32:
2610 case MVT::f64: {
2611 // We prefer to use LMUL=1 for VLEN sized types. Use fractional lmuls for
2612 // narrower types. The smallest fractional LMUL we support is 8/ELEN. Within
2613 // each fractional LMUL we support SEW between 8 and LMUL*ELEN.
2614 unsigned NumElts =
2615        (VT.getVectorNumElements() * RISCV::RVVBitsPerBlock) / MinVLen;
2616    NumElts = std::max(NumElts, RISCV::RVVBitsPerBlock / MaxELen);
2617 assert(isPowerOf2_32(NumElts) && "Expected power of 2 NumElts");
2618 return MVT::getScalableVectorVT(EltVT, NumElts);
2619 }
2620 }
2621}
2622
2623static MVT getContainerForFixedLengthVector(SelectionDAG &DAG, MVT VT,
2624                                            const RISCVSubtarget &Subtarget) {
2625  return ::getContainerForFixedLengthVector(DAG.getTargetLoweringInfo(), VT,
2626                                            Subtarget);
2627}
2628
2629MVT RISCVTargetLowering::getContainerForFixedLengthVector(MVT VT) const {
2630  return ::getContainerForFixedLengthVector(*this, VT, getSubtarget());
2631}
2632
2633// Grow V to consume an entire RVV register.
2634static SDValue convertToScalableVector(MVT VT, SDValue V, SelectionDAG &DAG,
2635                                       const RISCVSubtarget &Subtarget) {
2636 assert(VT.isScalableVector() &&
2637 "Expected to convert into a scalable vector!");
2638 assert(V.getValueType().isFixedLengthVector() &&
2639 "Expected a fixed length vector operand!");
2640 SDLoc DL(V);
2641 SDValue Zero = DAG.getVectorIdxConstant(0, DL);
2642 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
2643}
2644
2645// Shrink V so it's just big enough to maintain a VT's worth of data.
2646static SDValue convertFromScalableVector(MVT VT, SDValue V, SelectionDAG &DAG,
2647                                         const RISCVSubtarget &Subtarget) {
2648  assert(VT.isFixedLengthVector() &&
2649         "Expected to convert into a fixed length vector!");
2650 assert(V.getValueType().isScalableVector() &&
2651 "Expected a scalable vector operand!");
2652 SDLoc DL(V);
2653 SDValue Zero = DAG.getConstant(0, DL, Subtarget.getXLenVT());
2654 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
2655}
2656
2657/// Return the type of the mask type suitable for masking the provided
2658/// vector type. This is simply an i1 element type vector of the same
2659/// (possibly scalable) length.
2660static MVT getMaskTypeFor(MVT VecVT) {
2661 assert(VecVT.isVector());
2662  ElementCount EC = VecVT.getVectorElementCount();
2663  return MVT::getVectorVT(MVT::i1, EC);
2664}
2665
2666/// Creates an all-ones mask suitable for masking a vector of type VecVT with
2667/// vector length VL.
2668static SDValue getAllOnesMask(MVT VecVT, SDValue VL, const SDLoc &DL,
2669 SelectionDAG &DAG) {
2670 MVT MaskVT = getMaskTypeFor(VecVT);
2671 return DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
2672}
2673
2674static std::pair<SDValue, SDValue>
2675getDefaultScalableVLOps(MVT VecVT, const SDLoc &DL, SelectionDAG &DAG,
2676                        const RISCVSubtarget &Subtarget) {
2677 assert(VecVT.isScalableVector() && "Expecting a scalable vector");
2678 SDValue VL = DAG.getRegister(RISCV::X0, Subtarget.getXLenVT());
2679 SDValue Mask = getAllOnesMask(VecVT, VL, DL, DAG);
2680 return {Mask, VL};
2681}
2682
2683static std::pair<SDValue, SDValue>
2684getDefaultVLOps(uint64_t NumElts, MVT ContainerVT, const SDLoc &DL,
2685 SelectionDAG &DAG, const RISCVSubtarget &Subtarget) {
2686 assert(ContainerVT.isScalableVector() && "Expecting scalable container type");
2687 SDValue VL = DAG.getConstant(NumElts, DL, Subtarget.getXLenVT());
2688 SDValue Mask = getAllOnesMask(ContainerVT, VL, DL, DAG);
2689 return {Mask, VL};
2690}
2691
2692// Gets the two common "VL" operands: an all-ones mask and the vector length.
2693// VecVT is a vector type, either fixed-length or scalable, and ContainerVT is
2694// the vector type that the fixed-length vector is contained in. Otherwise if
2695// VecVT is scalable, then ContainerVT should be the same as VecVT.
2696static std::pair<SDValue, SDValue>
2697getDefaultVLOps(MVT VecVT, MVT ContainerVT, const SDLoc &DL, SelectionDAG &DAG,
2698 const RISCVSubtarget &Subtarget) {
2699 if (VecVT.isFixedLengthVector())
2700 return getDefaultVLOps(VecVT.getVectorNumElements(), ContainerVT, DL, DAG,
2701 Subtarget);
2702 assert(ContainerVT.isScalableVector() && "Expecting scalable container type");
2703 return getDefaultScalableVLOps(ContainerVT, DL, DAG, Subtarget);
2704}
2705
2706SDValue RISCVTargetLowering::computeVLMax(MVT VecVT, const SDLoc &DL,
2707                                          SelectionDAG &DAG) const {
2708 assert(VecVT.isScalableVector() && "Expected scalable vector");
2709 return DAG.getElementCount(DL, Subtarget.getXLenVT(),
2710 VecVT.getVectorElementCount());
2711}
2712
2713std::pair<unsigned, unsigned>
2714RISCVTargetLowering::computeVLMAXBounds(MVT VecVT,
2715                                        const RISCVSubtarget &Subtarget) {
2716 assert(VecVT.isScalableVector() && "Expected scalable vector");
2717
2718 unsigned EltSize = VecVT.getScalarSizeInBits();
2719 unsigned MinSize = VecVT.getSizeInBits().getKnownMinValue();
2720
2721 unsigned VectorBitsMax = Subtarget.getRealMaxVLen();
2722 unsigned MaxVLMAX =
2723 RISCVTargetLowering::computeVLMAX(VectorBitsMax, EltSize, MinSize);
2724
2725 unsigned VectorBitsMin = Subtarget.getRealMinVLen();
2726 unsigned MinVLMAX =
2727 RISCVTargetLowering::computeVLMAX(VectorBitsMin, EltSize, MinSize);
2728
2729 return std::make_pair(MinVLMAX, MaxVLMAX);
2730}
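// --- Editorial sketch (not part of RISCVISelLowering.cpp) ---------------------
// What the {MinVLMAX, MaxVLMAX} pair above means. Assuming the usual RVV
// relation VLMAX = (VLEN / SEW) * LMUL, with LMUL taken as the type's
// known-minimum size in bits divided by RVVBitsPerBlock (64):
static unsigned vlmaxFor(unsigned VLenBits, unsigned SEW, unsigned KnownMinBits) {
  const unsigned RVVBitsPerBlock = 64;
  // Reassociated so fractional LMUL does not lose precision.
  return (VLenBits / SEW) * KnownMinBits / RVVBitsPerBlock;
}
// e.g. for nxv4i32 (KnownMinBits = 128, SEW = 32): a 128-bit minimum VLEN
// gives VLMAX 8 and a 512-bit maximum VLEN gives VLMAX 32, i.e. the bounds
// returned above are {8, 32}.
// ------------------------------------------------------------------------------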
2731
2732// The state of RVV BUILD_VECTOR and VECTOR_SHUFFLE lowering is that very few
2733// of either is (currently) supported. This can get us into an infinite loop
2734// where we try to lower a BUILD_VECTOR as a VECTOR_SHUFFLE as a BUILD_VECTOR
2735// as a ..., etc.
2736// Until either (or both) of these can reliably lower any node, reporting that
2737// we don't want to expand BUILD_VECTORs via VECTOR_SHUFFLEs at least breaks
2738// the infinite loop. Note that this lowers BUILD_VECTOR through the stack,
2739// which is not desirable.
2740bool RISCVTargetLowering::shouldExpandBuildVectorWithShuffles(
2741    EVT VT, unsigned DefinedValues) const {
2742 return false;
2743}
2744
2745InstructionCost RISCVTargetLowering::getLMULCost(MVT VT) const {
2746  // TODO: Here we assume reciprocal throughput is 1 for LMUL_1; it is
2747  // implementation-defined.
2748  if (!VT.isVector())
2749    return InstructionCost::getInvalid();
2750  unsigned DLenFactor = Subtarget.getDLenFactor();
2751 unsigned Cost;
2752 if (VT.isScalableVector()) {
2753 unsigned LMul;
2754 bool Fractional;
2755 std::tie(LMul, Fractional) =
2756        RISCVVType::decodeVLMUL(getLMUL(VT));
2757    if (Fractional)
2758 Cost = LMul <= DLenFactor ? (DLenFactor / LMul) : 1;
2759 else
2760 Cost = (LMul * DLenFactor);
2761 } else {
2762 Cost = divideCeil(VT.getSizeInBits(), Subtarget.getRealMinVLen() / DLenFactor);
2763 }
2764 return Cost;
2765}
2766
2767
2768/// Return the cost of a vrgather.vv instruction for the type VT. vrgather.vv
2769/// is generally quadratic in the number of vregs implied by LMUL. Note that
2770/// the operands (index and possibly mask) are handled separately.
2771InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const {
2772  return getLMULCost(VT) * getLMULCost(VT);
2773}
2774
2775/// Return the cost of a vrgather.vi (or vx) instruction for the type VT.
2776/// vrgather.vi/vx may be linear in the number of vregs implied by LMUL,
2777/// or may track the vrgather.vv cost. It is implementation-dependent.
2778InstructionCost RISCVTargetLowering::getVRGatherVICost(MVT VT) const {
2779  return getLMULCost(VT);
2780}
2781
2782/// Return the cost of a vslidedown.vx or vslideup.vx instruction
2783/// for the type VT. (This does not cover the vslide1up or vslide1down
2784/// variants.) Slides may be linear in the number of vregs implied by LMUL,
2785/// or may track the vrgather.vv cost. It is implementation-dependent.
2786InstructionCost RISCVTargetLowering::getVSlideVXCost(MVT VT) const {
2787  return getLMULCost(VT);
2788}
2789
2790/// Return the cost of a vslidedown.vi or vslideup.vi instruction
2791/// for the type VT. (This does not cover the vslide1up or vslide1down
2792/// variants.) Slides may be linear in the number of vregs implied by LMUL,
2793/// or may track the vrgather.vv cost. It is implementation-dependent.
2794InstructionCost RISCVTargetLowering::getVSlideVICost(MVT VT) const {
2795  return getLMULCost(VT);
2796}
2797
2798static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
2799                                  const RISCVSubtarget &Subtarget) {
2800 // RISC-V FP-to-int conversions saturate to the destination register size, but
2801 // don't produce 0 for nan. We can use a conversion instruction and fix the
2802 // nan case with a compare and a select.
2803 SDValue Src = Op.getOperand(0);
2804
2805 MVT DstVT = Op.getSimpleValueType();
2806 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2807
2808 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
2809
2810 if (!DstVT.isVector()) {
2811    // For bf16 or for f16 in absence of Zfh, promote to f32, then saturate
2812 // the result.
2813 if ((Src.getValueType() == MVT::f16 && !Subtarget.hasStdExtZfhOrZhinx()) ||
2814 Src.getValueType() == MVT::bf16) {
2815 Src = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Src);
2816 }
2817
2818 unsigned Opc;
2819 if (SatVT == DstVT)
2820 Opc = IsSigned ? RISCVISD::FCVT_X : RISCVISD::FCVT_XU;
2821 else if (DstVT == MVT::i64 && SatVT == MVT::i32)
2822      Opc = IsSigned ? RISCVISD::FCVT_W_RV64 : RISCVISD::FCVT_WU_RV64;
2823    else
2824 return SDValue();
2825 // FIXME: Support other SatVTs by clamping before or after the conversion.
2826
2827 SDLoc DL(Op);
2828 SDValue FpToInt = DAG.getNode(
2829 Opc, DL, DstVT, Src,
2830        DAG.getTargetConstant(RISCVFPRndMode::RTZ, DL, Subtarget.getXLenVT()));
2831
2832 if (Opc == RISCVISD::FCVT_WU_RV64)
2833 FpToInt = DAG.getZeroExtendInReg(FpToInt, DL, MVT::i32);
2834
2835 SDValue ZeroInt = DAG.getConstant(0, DL, DstVT);
2836 return DAG.getSelectCC(DL, Src, Src, ZeroInt, FpToInt,
2837                            ISD::CondCode::SETUO);
2838  }
2839
2840 // Vectors.
2841
2842 MVT DstEltVT = DstVT.getVectorElementType();
2843 MVT SrcVT = Src.getSimpleValueType();
2844 MVT SrcEltVT = SrcVT.getVectorElementType();
2845 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
2846 unsigned DstEltSize = DstEltVT.getSizeInBits();
2847
2848 // Only handle saturating to the destination type.
2849 if (SatVT != DstEltVT)
2850 return SDValue();
2851
2852 MVT DstContainerVT = DstVT;
2853 MVT SrcContainerVT = SrcVT;
2854 if (DstVT.isFixedLengthVector()) {
2855 DstContainerVT = getContainerForFixedLengthVector(DAG, DstVT, Subtarget);
2856 SrcContainerVT = getContainerForFixedLengthVector(DAG, SrcVT, Subtarget);
2857 assert(DstContainerVT.getVectorElementCount() ==
2858 SrcContainerVT.getVectorElementCount() &&
2859 "Expected same element count");
2860 Src = convertToScalableVector(SrcContainerVT, Src, DAG, Subtarget);
2861 }
2862
2863 SDLoc DL(Op);
2864
2865 auto [Mask, VL] = getDefaultVLOps(DstVT, DstContainerVT, DL, DAG, Subtarget);
2866
2867 SDValue IsNan = DAG.getNode(RISCVISD::SETCC_VL, DL, Mask.getValueType(),
2868 {Src, Src, DAG.getCondCode(ISD::SETNE),
2869 DAG.getUNDEF(Mask.getValueType()), Mask, VL});
2870
2871 // Need to widen by more than 1 step, promote the FP type, then do a widening
2872 // convert.
2873 if (DstEltSize > (2 * SrcEltSize)) {
2874 assert(SrcContainerVT.getVectorElementType() == MVT::f16 && "Unexpected VT!");
2875 MVT InterVT = SrcContainerVT.changeVectorElementType(MVT::f32);
2876 Src = DAG.getNode(RISCVISD::FP_EXTEND_VL, DL, InterVT, Src, Mask, VL);
2877 }
2878
2879 MVT CvtContainerVT = DstContainerVT;
2880 MVT CvtEltVT = DstEltVT;
2881 if (SrcEltSize > (2 * DstEltSize)) {
2882 CvtEltVT = MVT::getIntegerVT(SrcEltVT.getSizeInBits() / 2);
2883 CvtContainerVT = CvtContainerVT.changeVectorElementType(CvtEltVT);
2884 }
2885
2886 unsigned RVVOpc =
2887      IsSigned ? RISCVISD::VFCVT_RTZ_X_F_VL : RISCVISD::VFCVT_RTZ_XU_F_VL;
2888  SDValue Res = DAG.getNode(RVVOpc, DL, CvtContainerVT, Src, Mask, VL);
2889
2890 while (CvtContainerVT != DstContainerVT) {
2891 CvtEltVT = MVT::getIntegerVT(CvtEltVT.getSizeInBits() / 2);
2892 CvtContainerVT = CvtContainerVT.changeVectorElementType(CvtEltVT);
2893 // Rounding mode here is arbitrary since we aren't shifting out any bits.
2894 unsigned ClipOpc = IsSigned ? RISCVISD::TRUNCATE_VECTOR_VL_SSAT
2895                                  : RISCVISD::TRUNCATE_VECTOR_VL_USAT;
2896    Res = DAG.getNode(ClipOpc, DL, CvtContainerVT, Res, Mask, VL);
2897 }
2898
2899 SDValue SplatZero = DAG.getNode(
2900 RISCVISD::VMV_V_X_VL, DL, DstContainerVT, DAG.getUNDEF(DstContainerVT),
2901 DAG.getConstant(0, DL, Subtarget.getXLenVT()), VL);
2902 Res = DAG.getNode(RISCVISD::VMERGE_VL, DL, DstContainerVT, IsNan, SplatZero,
2903 Res, DAG.getUNDEF(DstContainerVT), VL);
2904
2905 if (DstVT.isFixedLengthVector())
2906 Res = convertFromScalableVector(DstVT, Res, DAG, Subtarget);
2907
2908 return Res;
2909}
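// --- Editorial sketch (not part of RISCVISelLowering.cpp) ---------------------
// The scalar-path semantics implemented above for a signed f32 -> i32
// fptosi.sat, written as plain C++. The hardware fcvt already clamps
// out-of-range inputs; only the NaN -> 0 case needs the extra compare+select.
#include <cmath>
#include <cstdint>
#include <limits>

static int32_t fptosi_sat_f32_i32(float X) {
  if (std::isnan(X))                                    // feq X, X fails -> 0
    return 0;
  if (X <= (float)std::numeric_limits<int32_t>::min())  // saturate low
    return std::numeric_limits<int32_t>::min();
  if (X >= 2147483648.0f)                               // saturate high (2^31)
    return std::numeric_limits<int32_t>::max();
  return (int32_t)X;                                    // fcvt.w.s with RTZ
}
// ------------------------------------------------------------------------------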
2910
2911static RISCVFPRndMode::RoundingMode matchRoundingOp(unsigned Opc) {
2912  switch (Opc) {
2913  case ISD::FROUNDEVEN:
2914  case ISD::STRICT_FROUNDEVEN:
2915 case ISD::VP_FROUNDEVEN:
2916 return RISCVFPRndMode::RNE;
2917 case ISD::FTRUNC:
2918 case ISD::STRICT_FTRUNC:
2919 case ISD::VP_FROUNDTOZERO:
2920 return RISCVFPRndMode::RTZ;
2921 case ISD::FFLOOR:
2922 case ISD::STRICT_FFLOOR:
2923 case ISD::VP_FFLOOR:
2924 return RISCVFPRndMode::RDN;
2925 case ISD::FCEIL:
2926 case ISD::STRICT_FCEIL:
2927 case ISD::VP_FCEIL:
2928 return RISCVFPRndMode::RUP;
2929 case ISD::FROUND:
2930 case ISD::STRICT_FROUND:
2931 case ISD::VP_FROUND:
2932 return RISCVFPRndMode::RMM;
2933 case ISD::FRINT:
2934 return RISCVFPRndMode::DYN;
2935 }
2936
2937  return RISCVFPRndMode::Invalid;
2938}
2939
2940// Expand vector FTRUNC, FCEIL, FFLOOR, FROUND, VP_FCEIL, VP_FFLOOR, VP_FROUND
2941// VP_FROUNDEVEN, VP_FROUNDTOZERO, VP_FRINT and VP_FNEARBYINT by converting to
2942// the integer domain and back. Taking care to avoid converting values that are
2943// nan or already correct.
2944static SDValue
2945lowerVectorFTRUNC_FCEIL_FFLOOR_FROUND(SDValue Op, SelectionDAG &DAG,
2946                                      const RISCVSubtarget &Subtarget) {
2947 MVT VT = Op.getSimpleValueType();
2948 assert(VT.isVector() && "Unexpected type");
2949
2950 SDLoc DL(Op);
2951
2952 SDValue Src = Op.getOperand(0);
2953
2954 MVT ContainerVT = VT;
2955 if (VT.isFixedLengthVector()) {
2956 ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
2957 Src = convertToScalableVector(ContainerVT, Src, DAG, Subtarget);
2958 }
2959
2960 SDValue Mask, VL;
2961 if (Op->isVPOpcode()) {
2962 Mask = Op.getOperand(1);
2963 if (VT.isFixedLengthVector())
2964 Mask = convertToScalableVector(getMaskTypeFor(ContainerVT), Mask, DAG,
2965 Subtarget);
2966 VL = Op.getOperand(2);
2967 } else {
2968 std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
2969 }
2970
2971 // Freeze the source since we are increasing the number of uses.
2972 Src = DAG.getFreeze(Src);
2973
2974 // We do the conversion on the absolute value and fix the sign at the end.
2975 SDValue Abs = DAG.getNode(RISCVISD::FABS_VL, DL, ContainerVT, Src, Mask, VL);
2976
2977 // Determine the largest integer that can be represented exactly. This and
2978 // values larger than it don't have any fractional bits so don't need to
2979 // be converted.
2980 const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(ContainerVT);
2981 unsigned Precision = APFloat::semanticsPrecision(FltSem);
2982 APFloat MaxVal = APFloat(FltSem);
2983 MaxVal.convertFromAPInt(APInt::getOneBitSet(Precision, Precision - 1),
2984 /*IsSigned*/ false, APFloat::rmNearestTiesToEven);
2985 SDValue MaxValNode =
2986 DAG.getConstantFP(MaxVal, DL, ContainerVT.getVectorElementType());
2987 SDValue MaxValSplat = DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, ContainerVT,
2988 DAG.getUNDEF(ContainerVT), MaxValNode, VL);
2989
2990 // If abs(Src) was larger than MaxVal or nan, keep it.
2991 MVT SetccVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
2992 Mask =
2993 DAG.getNode(RISCVISD::SETCC_VL, DL, SetccVT,
2994 {Abs, MaxValSplat, DAG.getCondCode(ISD::SETOLT),
2995 Mask, Mask, VL});
2996
2997 // Truncate to integer and convert back to FP.
2998 MVT IntVT = ContainerVT.changeVectorElementTypeToInteger();
2999 MVT XLenVT = Subtarget.getXLenVT();
3000 SDValue Truncated;
3001
3002 switch (Op.getOpcode()) {
3003 default:
3004 llvm_unreachable("Unexpected opcode");
3005 case ISD::FCEIL:
3006 case ISD::VP_FCEIL:
3007 case ISD::FFLOOR:
3008 case ISD::VP_FFLOOR:
3009 case ISD::FROUND:
3010 case ISD::FROUNDEVEN:
3011 case ISD::VP_FROUND:
3012 case ISD::VP_FROUNDEVEN:
3013 case ISD::VP_FROUNDTOZERO: {
3014    RISCVFPRndMode::RoundingMode FRM = matchRoundingOp(Op.getOpcode());
3015    assert(FRM != RISCVFPRndMode::Invalid);
3016    Truncated = DAG.getNode(RISCVISD::VFCVT_RM_X_F_VL, DL, IntVT, Src, Mask,
3017 DAG.getTargetConstant(FRM, DL, XLenVT), VL);
3018 break;
3019 }
3020 case ISD::FTRUNC:
3021 Truncated = DAG.getNode(RISCVISD::VFCVT_RTZ_X_F_VL, DL, IntVT, Src,
3022 Mask, VL);
3023 break;
3024 case ISD::FRINT:
3025 case ISD::VP_FRINT:
3026 Truncated = DAG.getNode(RISCVISD::VFCVT_X_F_VL, DL, IntVT, Src, Mask, VL);
3027 break;
3028 case ISD::FNEARBYINT:
3029 case ISD::VP_FNEARBYINT:
3030 Truncated = DAG.getNode(RISCVISD::VFROUND_NOEXCEPT_VL, DL, ContainerVT, Src,
3031 Mask, VL);
3032 break;
3033 }
3034
3035 // VFROUND_NOEXCEPT_VL includes SINT_TO_FP_VL.
3036 if (Truncated.getOpcode() != RISCVISD::VFROUND_NOEXCEPT_VL)
3037 Truncated = DAG.getNode(RISCVISD::SINT_TO_FP_VL, DL, ContainerVT, Truncated,
3038 Mask, VL);
3039
3040 // Restore the original sign so that -0.0 is preserved.
3041 Truncated = DAG.getNode(RISCVISD::FCOPYSIGN_VL, DL, ContainerVT, Truncated,
3042 Src, Src, Mask, VL);
3043
3044 if (!VT.isFixedLengthVector())
3045 return Truncated;
3046
3047 return convertFromScalableVector(VT, Truncated, DAG, Subtarget);
3048}
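// --- Editorial sketch (not part of RISCVISelLowering.cpp) ---------------------
// The convert-to-integer-and-back trick used above, shown for one scalar
// double with floor (RDN) rounding. Lanes with |x| >= 2^52 (the precision
// boundary computed above) or NaN are already integral or must be kept,
// exactly like the SETOLT mask keeps them.
#include <cmath>
#include <cstdint>

static double floorViaIntRoundTrip(double X) {
  const double MaxExact = 4503599627370496.0; // 2^52: no fractional bits beyond
  if (std::isnan(X) || std::fabs(X) >= MaxExact)
    return X;                                 // masked-off lane: keep the source
  int64_t I = (int64_t)X;                     // stand-in for vfcvt.x.f (RTZ)
  if (X < 0.0 && (double)I != X)
    --I;                                      // adjust truncation to RDN
  double R = (double)I;                       // vfcvt.f.x back to FP
  return std::copysign(R, X);                 // fcopysign so -0.0 is preserved
}
// ------------------------------------------------------------------------------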
3049
3050// Expand vector STRICT_FTRUNC, STRICT_FCEIL, STRICT_FFLOOR, STRICT_FROUND
3051// STRICT_FROUNDEVEN and STRICT_FNEARBYINT by converting sNaN of the source to
3052// qNaN and converting the new source to integer and back to FP.
3053static SDValue
3054lowerVectorStrictFTRUNC_FCEIL_FFLOOR_FROUND(SDValue Op, SelectionDAG &DAG,
3055                                            const RISCVSubtarget &Subtarget) {
3056 SDLoc DL(Op);
3057 MVT VT = Op.getSimpleValueType();
3058 SDValue Chain = Op.getOperand(0);
3059 SDValue Src = Op.getOperand(1);
3060
3061 MVT ContainerVT = VT;
3062 if (VT.isFixedLengthVector()) {
3063 ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
3064 Src = convertToScalableVector(ContainerVT, Src, DAG, Subtarget);
3065 }
3066
3067 auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
3068
3069 // Freeze the source since we are increasing the number of uses.
3070 Src = DAG.getFreeze(Src);
3071
3072  // Convert sNaN to qNaN by executing x + x for all unordered elements x in Src.
3073 MVT MaskVT = Mask.getSimpleValueType();
3074  SDValue Unorder = DAG.getNode(RISCVISD::STRICT_FSETCC_VL, DL,
3075                                DAG.getVTList(MaskVT, MVT::Other),
3076 {Chain, Src, Src, DAG.getCondCode(ISD::SETUNE),
3077 DAG.getUNDEF(MaskVT), Mask, VL});
3078 Chain = Unorder.getValue(1);
3079  Src = DAG.getNode(RISCVISD::STRICT_FADD_VL, DL,
3080                    DAG.getVTList(ContainerVT, MVT::Other),
3081 {Chain, Src, Src, Src, Unorder, VL});
3082 Chain = Src.getValue(1);
3083
3084 // We do the conversion on the absolute value and fix the sign at the end.
3085 SDValue Abs = DAG.getNode(RISCVISD::FABS_VL, DL, ContainerVT, Src, Mask, VL);
3086
3087 // Determine the largest integer that can be represented exactly. This and
3088 // values larger than it don't have any fractional bits so don't need to
3089 // be converted.
3090 const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(ContainerVT);
3091 unsigned Precision = APFloat::semanticsPrecision(FltSem);
3092 APFloat MaxVal = APFloat(FltSem);
3093 MaxVal.convertFromAPInt(APInt::getOneBitSet(Precision, Precision - 1),
3094 /*IsSigned*/ false, APFloat::rmNearestTiesToEven);
3095 SDValue MaxValNode =
3096 DAG.getConstantFP(MaxVal, DL, ContainerVT.getVectorElementType());
3097 SDValue MaxValSplat = DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, ContainerVT,
3098 DAG.getUNDEF(ContainerVT), MaxValNode, VL);
3099
3100 // If abs(Src) was larger than MaxVal or nan, keep it.
3101 Mask = DAG.getNode(
3102 RISCVISD::SETCC_VL, DL, MaskVT,
3103 {Abs, MaxValSplat, DAG.getCondCode(ISD::SETOLT), Mask, Mask, VL});
3104
3105 // Truncate to integer and convert back to FP.
3106 MVT IntVT = ContainerVT.changeVectorElementTypeToInteger();
3107 MVT XLenVT = Subtarget.getXLenVT();
3108 SDValue Truncated;
3109
3110 switch (Op.getOpcode()) {
3111 default:
3112 llvm_unreachable("Unexpected opcode");
3113 case ISD::STRICT_FCEIL:
3114 case ISD::STRICT_FFLOOR:
3115 case ISD::STRICT_FROUND:
3116  case ISD::STRICT_FROUNDEVEN: {
3117    RISCVFPRndMode::RoundingMode FRM = matchRoundingOp(Op.getOpcode());
3118    assert(FRM != RISCVFPRndMode::Invalid);
3119    Truncated = DAG.getNode(
3120 RISCVISD::STRICT_VFCVT_RM_X_F_VL, DL, DAG.getVTList(IntVT, MVT::Other),
3121 {Chain, Src, Mask, DAG.getTargetConstant(FRM, DL, XLenVT), VL});
3122 break;
3123 }
3124 case ISD::STRICT_FTRUNC:
3125 Truncated =
3126        DAG.getNode(RISCVISD::STRICT_VFCVT_RTZ_X_F_VL, DL,
3127                    DAG.getVTList(IntVT, MVT::Other), Chain, Src, Mask, VL);
3128 break;
3129  case ISD::STRICT_FNEARBYINT:
3130    Truncated = DAG.getNode(RISCVISD::STRICT_VFROUND_NOEXCEPT_VL, DL,
3131                            DAG.getVTList(ContainerVT, MVT::Other), Chain, Src,
3132 Mask, VL);
3133 break;
3134 }
3135 Chain = Truncated.getValue(1);
3136
3137 // VFROUND_NOEXCEPT_VL includes SINT_TO_FP_VL.
3138 if (Op.getOpcode() != ISD::STRICT_FNEARBYINT) {
3139 Truncated = DAG.getNode(RISCVISD::STRICT_SINT_TO_FP_VL, DL,
3140 DAG.getVTList(ContainerVT, MVT::Other), Chain,
3141 Truncated, Mask, VL);
3142 Chain = Truncated.getValue(1);
3143 }
3144
3145 // Restore the original sign so that -0.0 is preserved.
3146 Truncated = DAG.getNode(RISCVISD::FCOPYSIGN_VL, DL, ContainerVT, Truncated,
3147 Src, Src, Mask, VL);
3148
3149 if (VT.isFixedLengthVector())
3150 Truncated = convertFromScalableVector(VT, Truncated, DAG, Subtarget);
3151 return DAG.getMergeValues({Truncated, Chain}, DL);
3152}
3153
3154static SDValue
3155lowerFTRUNC_FCEIL_FFLOOR_FROUND(SDValue Op, SelectionDAG &DAG,
3156                                const RISCVSubtarget &Subtarget) {
3157 MVT VT = Op.getSimpleValueType();
3158 if (VT.isVector())
3159 return lowerVectorFTRUNC_FCEIL_FFLOOR_FROUND(Op, DAG, Subtarget);
3160
3161 if (DAG.shouldOptForSize())
3162 return SDValue();
3163
3164 SDLoc DL(Op);
3165 SDValue Src = Op.getOperand(0);
3166
3167 // Create an integer the size of the mantissa with the MSB set. This and all
3168 // values larger than it don't have any fractional bits so don't need to be
3169 // converted.
3170 const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
3171 unsigned Precision = APFloat::semanticsPrecision(FltSem);
3172 APFloat MaxVal = APFloat(FltSem);
3173 MaxVal.convertFromAPInt(APInt::getOneBitSet(Precision, Precision - 1),
3174 /*IsSigned*/ false, APFloat::rmNearestTiesToEven);
3175 SDValue MaxValNode = DAG.getConstantFP(MaxVal, DL, VT);
3176
3178 return DAG.getNode(RISCVISD::FROUND, DL, VT, Src, MaxValNode,
3179 DAG.getTargetConstant(FRM, DL, Subtarget.getXLenVT()));
3180}
3181
3182// Expand vector LRINT and LLRINT by converting to the integer domain.
3183static SDValue lowerVectorXRINT(SDValue Op, SelectionDAG &DAG,
3184                                const RISCVSubtarget &Subtarget) {
3185 MVT VT = Op.getSimpleValueType();
3186 assert(VT.isVector() && "Unexpected type");
3187
3188 SDLoc DL(Op);
3189 SDValue Src = Op.getOperand(0);
3190 MVT ContainerVT = VT;
3191
3192 if (VT.isFixedLengthVector()) {
3193 ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
3194 Src = convertToScalableVector(ContainerVT, Src, DAG, Subtarget);
3195 }
3196
3197 auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
3198 SDValue Truncated =
3199 DAG.getNode(RISCVISD::VFCVT_X_F_VL, DL, ContainerVT, Src, Mask, VL);
3200
3201 if (!VT.isFixedLengthVector())
3202 return Truncated;
3203
3204 return convertFromScalableVector(VT, Truncated, DAG, Subtarget);
3205}
3206
3207static SDValue
3208getVSlidedown(SelectionDAG &DAG, const RISCVSubtarget &Subtarget,
3209              const SDLoc &DL, EVT VT, SDValue Passthru, SDValue Op,
3210              SDValue Offset, SDValue Mask, SDValue VL,
3211              unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED) {
3212  if (Passthru.isUndef())
3213    Policy = RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC;
3214 SDValue PolicyOp = DAG.getTargetConstant(Policy, DL, Subtarget.getXLenVT());
3215 SDValue Ops[] = {Passthru, Op, Offset, Mask, VL, PolicyOp};
3216 return DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, VT, Ops);
3217}
3218
3219static SDValue
3220getVSlideup(SelectionDAG &DAG, const RISCVSubtarget &Subtarget, const SDLoc &DL,
3221 EVT VT, SDValue Passthru, SDValue Op, SDValue Offset, SDValue Mask,
3222 SDValue VL,
3223            unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED) {
3224  if (Passthru.isUndef())
3225    Policy = RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC;
3226 SDValue PolicyOp = DAG.getTargetConstant(Policy, DL, Subtarget.getXLenVT());
3227 SDValue Ops[] = {Passthru, Op, Offset, Mask, VL, PolicyOp};
3228 return DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, VT, Ops);
3229}
3230
3231static MVT getLMUL1VT(MVT VT) {
3232  assert(VT.getVectorElementType().getSizeInBits() <= 64 &&
3233         "Unexpected vector MVT");
3234  return MVT::getScalableVectorVT(
3235      VT.getVectorElementType(),
3236      RISCV::RVVBitsPerBlock / VT.getVectorElementType().getSizeInBits());
3237}
3238
3239struct VIDSequence {
3240  int64_t StepNumerator;
3241  unsigned StepDenominator;
3242  int64_t Addend;
3243};
3244
3245static std::optional<APInt> getExactInteger(const APFloat &APF,
3246                                            unsigned BitWidth) {
3247  // We will use a SINT_TO_FP to materialize this constant so we should use a
3248 // signed APSInt here.
3249 APSInt ValInt(BitWidth, /*IsUnsigned*/ false);
3250 // We use an arbitrary rounding mode here. If a floating-point is an exact
3251 // integer (e.g., 1.0), the rounding mode does not affect the output value. If
3252 // the rounding mode changes the output value, then it is not an exact
3253 // integer.
3254  constexpr APFloat::roundingMode ArbitraryRM = APFloat::rmTowardZero;
3255  bool IsExact;
3256 // If it is out of signed integer range, it will return an invalid operation.
3257 // If it is not an exact integer, IsExact is false.
3258 if ((APF.convertToInteger(ValInt, ArbitraryRM, &IsExact) ==
3259       APFloatBase::opInvalidOp) ||
3260      !IsExact)
3261 return std::nullopt;
3262 return ValInt.extractBits(BitWidth, 0);
3263}
3264
3265// Try to match an arithmetic-sequence BUILD_VECTOR [X,X+S,X+2*S,...,X+(N-1)*S]
3266// to the (non-zero) step S and start value X. This can be then lowered as the
3267// RVV sequence (VID * S) + X, for example.
3268// The step S is represented as an integer numerator divided by a positive
3269// denominator. Note that the implementation currently only identifies
3270// sequences in which either the numerator is +/- 1 or the denominator is 1. It
3271// cannot detect 2/3, for example.
3272// Note that this method will also match potentially unappealing index
3273// sequences, like <i32 0, i32 50939494>, however it is left to the caller to
3274// determine whether this is worth generating code for.
3275//
3276// EltSizeInBits is the size of the type that the sequence will be calculated
3277// in, i.e. SEW for build_vectors or XLEN for address calculations.
3278static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
3279 unsigned EltSizeInBits) {
3280 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unexpected BUILD_VECTOR");
3281 if (!cast<BuildVectorSDNode>(Op)->isConstant())
3282 return std::nullopt;
3283 bool IsInteger = Op.getValueType().isInteger();
3284
3285 std::optional<unsigned> SeqStepDenom;
3286 std::optional<APInt> SeqStepNum;
3287 std::optional<APInt> SeqAddend;
3288 std::optional<std::pair<APInt, unsigned>> PrevElt;
3289 assert(EltSizeInBits >= Op.getValueType().getScalarSizeInBits());
3290
3291 // First extract the ops into a list of constant integer values. This may not
3292 // be possible for floats if they're not all representable as integers.
3293  SmallVector<std::optional<APInt>> Elts(Op.getNumOperands());
3294  const unsigned OpSize = Op.getScalarValueSizeInBits();
3295 for (auto [Idx, Elt] : enumerate(Op->op_values())) {
3296 if (Elt.isUndef()) {
3297 Elts[Idx] = std::nullopt;
3298 continue;
3299 }
3300 if (IsInteger) {
3301 Elts[Idx] = Elt->getAsAPIntVal().trunc(OpSize).zext(EltSizeInBits);
3302 } else {
3303 auto ExactInteger =
3304 getExactInteger(cast<ConstantFPSDNode>(Elt)->getValueAPF(), OpSize);
3305 if (!ExactInteger)
3306 return std::nullopt;
3307 Elts[Idx] = *ExactInteger;
3308 }
3309 }
3310
3311 for (auto [Idx, Elt] : enumerate(Elts)) {
3312 // Assume undef elements match the sequence; we just have to be careful
3313 // when interpolating across them.
3314 if (!Elt)
3315 continue;
3316
3317 if (PrevElt) {
3318 // Calculate the step since the last non-undef element, and ensure
3319 // it's consistent across the entire sequence.
3320 unsigned IdxDiff = Idx - PrevElt->second;
3321 APInt ValDiff = *Elt - PrevElt->first;
3322
3323      // A zero value difference means that we're somewhere in the middle
3324 // of a fractional step, e.g. <0,0,0*,0,1,1,1,1>. Wait until we notice a
3325 // step change before evaluating the sequence.
3326 if (ValDiff == 0)
3327 continue;
3328
3329 int64_t Remainder = ValDiff.srem(IdxDiff);
3330 // Normalize the step if it's greater than 1.
3331 if (Remainder != ValDiff.getSExtValue()) {
3332 // The difference must cleanly divide the element span.
3333 if (Remainder != 0)
3334 return std::nullopt;
3335 ValDiff = ValDiff.sdiv(IdxDiff);
3336 IdxDiff = 1;
3337 }
3338
3339 if (!SeqStepNum)
3340 SeqStepNum = ValDiff;
3341 else if (ValDiff != SeqStepNum)
3342 return std::nullopt;
3343
3344 if (!SeqStepDenom)
3345 SeqStepDenom = IdxDiff;
3346 else if (IdxDiff != *SeqStepDenom)
3347 return std::nullopt;
3348 }
3349
3350 // Record this non-undef element for later.
3351 if (!PrevElt || PrevElt->first != *Elt)
3352 PrevElt = std::make_pair(*Elt, Idx);
3353 }
3354
3355 // We need to have logged a step for this to count as a legal index sequence.
3356 if (!SeqStepNum || !SeqStepDenom)
3357 return std::nullopt;
3358
3359 // Loop back through the sequence and validate elements we might have skipped
3360 // while waiting for a valid step. While doing this, log any sequence addend.
3361 for (auto [Idx, Elt] : enumerate(Elts)) {
3362 if (!Elt)
3363 continue;
3364 APInt ExpectedVal =
3365 (APInt(EltSizeInBits, Idx) * *SeqStepNum).sdiv(*SeqStepDenom);
3366
3367 APInt Addend = *Elt - ExpectedVal;
3368 if (!SeqAddend)
3369 SeqAddend = Addend;
3370 else if (Addend != SeqAddend)
3371 return std::nullopt;
3372 }
3373
3374 assert(SeqAddend && "Must have an addend if we have a step");
3375
3376 return VIDSequence{SeqStepNum->getSExtValue(), *SeqStepDenom,
3377 SeqAddend->getSExtValue()};
3378}
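// --- Editorial sketch (not part of RISCVISelLowering.cpp) ---------------------
// What a matched VIDSequence means: element i of the build_vector equals
// (i * StepNumerator) / StepDenominator + Addend (signed division), which is
// what the VID_VL expansion in lowerBUILD_VECTOR below materializes with
// vid.v plus a shift/multiply and an add.
#include <cassert>
#include <cstdint>

static int64_t vidElement(unsigned I, int64_t StepNum, unsigned StepDenom,
                          int64_t Addend) {
  return (int64_t)I * StepNum / (int64_t)StepDenom + Addend;
}

static void vidSequenceExamples() {
  // <1, 3, 5, 7>  ->  {StepNumerator 2, StepDenominator 1, Addend 1}
  for (unsigned I = 0; I < 4; ++I)
    assert(vidElement(I, 2, 1, 1) == 1 + 2 * (int64_t)I);
  // <0, 0, 1, 1>  ->  {StepNumerator 1, StepDenominator 2, Addend 0}
  const int64_t Expected[4] = {0, 0, 1, 1};
  for (unsigned I = 0; I < 4; ++I)
    assert(vidElement(I, 1, 2, 0) == Expected[I]);
}
// ------------------------------------------------------------------------------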
3379
3380// Match a splatted value (SPLAT_VECTOR/BUILD_VECTOR) of an EXTRACT_VECTOR_ELT
3381// and lower it as a VRGATHER_VX_VL from the source vector.
3382static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL,
3383 SelectionDAG &DAG,
3384 const RISCVSubtarget &Subtarget) {
3385 if (SplatVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
3386 return SDValue();
3387 SDValue Vec = SplatVal.getOperand(0);
3388 // Only perform this optimization on vectors of the same size for simplicity.
3389 // Don't perform this optimization for i1 vectors.
3390 // FIXME: Support i1 vectors, maybe by promoting to i8?
3391 if (Vec.getValueType() != VT || VT.getVectorElementType() == MVT::i1)
3392 return SDValue();
3393 SDValue Idx = SplatVal.getOperand(1);
3394 // The index must be a legal type.
3395 if (Idx.getValueType() != Subtarget.getXLenVT())
3396 return SDValue();
3397
3398 MVT ContainerVT = VT;
3399 if (VT.isFixedLengthVector()) {
3400 ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
3401 Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
3402 }
3403
3404 auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
3405
3406 SDValue Gather = DAG.getNode(RISCVISD::VRGATHER_VX_VL, DL, ContainerVT, Vec,
3407 Idx, DAG.getUNDEF(ContainerVT), Mask, VL);
3408
3409 if (!VT.isFixedLengthVector())
3410 return Gather;
3411
3412 return convertFromScalableVector(VT, Gather, DAG, Subtarget);
3413}
3414
3415
3416/// Try and optimize BUILD_VECTORs with "dominant values" - these are values
3417/// which constitute a large proportion of the elements. In such cases we can
3418/// splat a vector with the dominant element and make up the shortfall with
3419/// INSERT_VECTOR_ELTs. Returns SDValue if not profitable.
3420/// Note that this includes vectors of 2 elements by association. The
3421/// upper-most element is the "dominant" one, allowing us to use a splat to
3422/// "insert" the upper element, and an insert of the lower element at position
3423/// 0, which improves codegen.
3425 const RISCVSubtarget &Subtarget) {
3426 MVT VT = Op.getSimpleValueType();
3427 assert(VT.isFixedLengthVector() && "Unexpected vector!");
3428
3429 MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
3430
3431 SDLoc DL(Op);
3432 auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
3433
3434 MVT XLenVT = Subtarget.getXLenVT();
3435 unsigned NumElts = Op.getNumOperands();
3436
3437 SDValue DominantValue;
3438 unsigned MostCommonCount = 0;
3439 DenseMap<SDValue, unsigned> ValueCounts;
3440 unsigned NumUndefElts =
3441 count_if(Op->op_values(), [](const SDValue &V) { return V.isUndef(); });
3442
3443 // Track the number of scalar loads we know we'd be inserting, estimated as
3444 // any non-zero floating-point constant. Other kinds of element are either
3445 // already in registers or are materialized on demand. The threshold at which
3446 // a vector load is more desirable than several scalar materializion and
3447  // a vector load is more desirable than several scalar materialization and
3448 unsigned NumScalarLoads = 0;
3449
3450 for (SDValue V : Op->op_values()) {
3451 if (V.isUndef())
3452 continue;
3453
3454 ValueCounts.insert(std::make_pair(V, 0));
3455 unsigned &Count = ValueCounts[V];
3456 if (0 == Count)
3457 if (auto *CFP = dyn_cast<ConstantFPSDNode>(V))
3458 NumScalarLoads += !CFP->isExactlyValue(+0.0);
3459
3460 // Is this value dominant? In case of a tie, prefer the highest element as
3461 // it's cheaper to insert near the beginning of a vector than it is at the
3462 // end.
3463 if (++Count >= MostCommonCount) {
3464 DominantValue = V;
3465 MostCommonCount = Count;
3466 }
3467 }
3468
3469 assert(DominantValue && "Not expecting an all-undef BUILD_VECTOR");
3470 unsigned NumDefElts = NumElts - NumUndefElts;
3471 unsigned DominantValueCountThreshold = NumDefElts <= 2 ? 0 : NumDefElts - 2;
3472
3473 // Don't perform this optimization when optimizing for size, since
3474 // materializing elements and inserting them tends to cause code bloat.
3475 if (!DAG.shouldOptForSize() && NumScalarLoads < NumElts &&
3476 (NumElts != 2 || ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) &&
3477 ((MostCommonCount > DominantValueCountThreshold) ||
3478 (ValueCounts.size() <= Log2_32(NumDefElts)))) {
3479 // Start by splatting the most common element.
3480 SDValue Vec = DAG.getSplatBuildVector(VT, DL, DominantValue);
3481
3482 DenseSet<SDValue> Processed{DominantValue};
3483
3484 // We can handle an insert into the last element (of a splat) via
3485 // v(f)slide1down. This is slightly better than the vslideup insert
3486 // lowering as it avoids the need for a vector group temporary. It
3487 // is also better than using vmerge.vx as it avoids the need to
3488 // materialize the mask in a vector register.
3489 if (SDValue LastOp = Op->getOperand(Op->getNumOperands() - 1);
3490 !LastOp.isUndef() && ValueCounts[LastOp] == 1 &&
3491 LastOp != DominantValue) {
3492 Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
3493 auto OpCode =
3494          VT.isFloatingPoint() ? RISCVISD::VFSLIDE1DOWN_VL : RISCVISD::VSLIDE1DOWN_VL;
3495      if (!VT.isFloatingPoint())
3496 LastOp = DAG.getNode(ISD::ANY_EXTEND, DL, XLenVT, LastOp);
3497 Vec = DAG.getNode(OpCode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
3498 LastOp, Mask, VL);
3499 Vec = convertFromScalableVector(VT, Vec, DAG, Subtarget);
3500 Processed.insert(LastOp);
3501 }
3502
3503 MVT SelMaskTy = VT.changeVectorElementType(MVT::i1);
3504 for (const auto &OpIdx : enumerate(Op->ops())) {
3505 const SDValue &V = OpIdx.value();
3506 if (V.isUndef() || !Processed.insert(V).second)
3507 continue;
3508 if (ValueCounts[V] == 1) {
3509 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V,
3510 DAG.getVectorIdxConstant(OpIdx.index(), DL));
3511 } else {
3512 // Blend in all instances of this value using a VSELECT, using a
3513 // mask where each bit signals whether that element is the one
3514 // we're after.
3515        SmallVector<SDValue> Ops;
3516        transform(Op->op_values(), std::back_inserter(Ops), [&](SDValue V1) {
3517 return DAG.getConstant(V == V1, DL, XLenVT);
3518 });
3519 Vec = DAG.getNode(ISD::VSELECT, DL, VT,
3520 DAG.getBuildVector(SelMaskTy, DL, Ops),
3521 DAG.getSplatBuildVector(VT, DL, V), Vec);
3522 }
3523 }
3524
3525 return Vec;
3526 }
3527
3528 return SDValue();
3529}
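// --- Editorial sketch (not part of RISCVISelLowering.cpp) ---------------------
// The dominant-value strategy above on a concrete input: for v4i32 <7, 7, 3, 7>
// the value 7 occurs three times, which exceeds the threshold
// NumDefElts - 2 = 2, so the lowering becomes "splat 7, then insert 3 at
// index 2" instead of four scalar inserts. A plain-integer model of just the
// dominance test (the real code also checks code-size and scalar-load limits):
static unsigned log2u(unsigned V) { // floor(log2(V)), V > 0
  unsigned L = 0;
  while (V >>= 1)
    ++L;
  return L;
}

static bool preferSplatPlusInserts(unsigned MostCommonCount, unsigned NumDefElts,
                                   unsigned NumDistinctValues) {
  unsigned Threshold = NumDefElts <= 2 ? 0 : NumDefElts - 2;
  return MostCommonCount > Threshold || NumDistinctValues <= log2u(NumDefElts);
}
// For the example: preferSplatPlusInserts(3, 4, 2) returns true.
// ------------------------------------------------------------------------------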
3530
3532 const RISCVSubtarget &Subtarget) {
3533 MVT VT = Op.getSimpleValueType();
3534 assert(VT.isFixedLengthVector() && "Unexpected vector!");
3535
3536 MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
3537
3538 SDLoc DL(Op);
3539 auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
3540
3541 MVT XLenVT = Subtarget.getXLenVT();
3542 unsigned NumElts = Op.getNumOperands();
3543
3544 if (VT.getVectorElementType() == MVT::i1) {
3545 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
3546 SDValue VMClr = DAG.getNode(RISCVISD::VMCLR_VL, DL, ContainerVT, VL);
3547 return convertFromScalableVector(VT, VMClr, DAG, Subtarget);
3548 }
3549
3550 if (ISD::isBuildVectorAllOnes(Op.getNode())) {
3551 SDValue VMSet = DAG.getNode(RISCVISD::VMSET_VL, DL, ContainerVT, VL);
3552 return convertFromScalableVector(VT, VMSet, DAG, Subtarget);
3553 }
3554
3555 // Lower constant mask BUILD_VECTORs via an integer vector type, in
3556 // scalar integer chunks whose bit-width depends on the number of mask
3557 // bits and XLEN.
3558 // First, determine the most appropriate scalar integer type to use. This
3559 // is at most XLenVT, but may be shrunk to a smaller vector element type
3560 // according to the size of the final vector - use i8 chunks rather than
3561 // XLenVT if we're producing a v8i1. This results in more consistent
3562 // codegen across RV32 and RV64.
3563 unsigned NumViaIntegerBits = std::clamp(NumElts, 8u, Subtarget.getXLen());
3564 NumViaIntegerBits = std::min(NumViaIntegerBits, Subtarget.getELen());
3565 // If we have to use more than one INSERT_VECTOR_ELT then this
3566    // optimization is likely to increase code size; avoid performing it in
3567 // such a case. We can use a load from a constant pool in this case.
3568 if (DAG.shouldOptForSize() && NumElts > NumViaIntegerBits)
3569 return SDValue();
3570 // Now we can create our integer vector type. Note that it may be larger
3571 // than the resulting mask type: v4i1 would use v1i8 as its integer type.
3572 unsigned IntegerViaVecElts = divideCeil(NumElts, NumViaIntegerBits);
3573 MVT IntegerViaVecVT =
3574 MVT::getVectorVT(MVT::getIntegerVT(NumViaIntegerBits),
3575 IntegerViaVecElts);
3576
3577 uint64_t Bits = 0;
3578 unsigned BitPos = 0, IntegerEltIdx = 0;
3579 SmallVector<SDValue, 8> Elts(IntegerViaVecElts);
3580
3581 for (unsigned I = 0; I < NumElts;) {
3582 SDValue V = Op.getOperand(I);
3583 bool BitValue = !V.isUndef() && V->getAsZExtVal();
3584 Bits |= ((uint64_t)BitValue << BitPos);
3585 ++BitPos;
3586 ++I;
3587
3588 // Once we accumulate enough bits to fill our scalar type or process the
3589 // last element, insert into our vector and clear our accumulated data.
3590 if (I % NumViaIntegerBits == 0 || I == NumElts) {
3591 if (NumViaIntegerBits <= 32)
3592 Bits = SignExtend64<32>(Bits);
3593 SDValue Elt = DAG.getConstant(Bits, DL, XLenVT);
3594 Elts[IntegerEltIdx] = Elt;
3595 Bits = 0;
3596 BitPos = 0;
3597 IntegerEltIdx++;
3598 }
3599 }
3600
3601 SDValue Vec = DAG.getBuildVector(IntegerViaVecVT, DL, Elts);
3602
3603 if (NumElts < NumViaIntegerBits) {
3604 // If we're producing a smaller vector than our minimum legal integer
3605 // type, bitcast to the equivalent (known-legal) mask type, and extract
3606 // our final mask.
3607 assert(IntegerViaVecVT == MVT::v1i8 && "Unexpected mask vector type");
3608 Vec = DAG.getBitcast(MVT::v8i1, Vec);
3609 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Vec,
3610 DAG.getConstant(0, DL, XLenVT));
3611 } else {
3612 // Else we must have produced an integer type with the same size as the
3613 // mask type; bitcast for the final result.
3614 assert(VT.getSizeInBits() == IntegerViaVecVT.getSizeInBits());
3615 Vec = DAG.getBitcast(VT, Vec);
3616 }
3617
3618 return Vec;
3619 }
3620
3621 if (SDValue Splat = cast<BuildVectorSDNode>(Op)->getSplatValue()) {
3622 unsigned Opc = VT.isFloatingPoint() ? RISCVISD::VFMV_V_F_VL
3623                                          : RISCVISD::VMV_V_X_VL;
3624    if (!VT.isFloatingPoint())
3625 Splat = DAG.getNode(ISD::ANY_EXTEND, DL, XLenVT, Splat);
3626 Splat =
3627 DAG.getNode(Opc, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Splat, VL);
3628 return convertFromScalableVector(VT, Splat, DAG, Subtarget);
3629 }
3630
3631 // Try and match index sequences, which we can lower to the vid instruction
3632 // with optional modifications. An all-undef vector is matched by
3633 // getSplatValue, above.
3634 if (auto SimpleVID = isSimpleVIDSequence(Op, Op.getScalarValueSizeInBits())) {
3635 int64_t StepNumerator = SimpleVID->StepNumerator;
3636 unsigned StepDenominator = SimpleVID->StepDenominator;
3637 int64_t Addend = SimpleVID->Addend;
3638
3639 assert(StepNumerator != 0 && "Invalid step");
3640 bool Negate = false;
3641 int64_t SplatStepVal = StepNumerator;
3642 unsigned StepOpcode = ISD::MUL;
3643 // Exclude INT64_MIN to avoid passing it to std::abs. We won't optimize it
3644 // anyway as the shift of 63 won't fit in uimm5.
3645 if (StepNumerator != 1 && StepNumerator != INT64_MIN &&
3646 isPowerOf2_64(std::abs(StepNumerator))) {
3647 Negate = StepNumerator < 0;
3648 StepOpcode = ISD::SHL;
3649 SplatStepVal = Log2_64(std::abs(StepNumerator));
3650 }
3651
3652    // Only emit VIDs with suitably-small steps/addends. We use imm5 as a
3653    // threshold since it's the immediate value many RVV instructions accept.
3654    // There is no vmul.vi instruction so ensure the multiply constant can fit
3655    // in a single addi instruction.
3656 if (((StepOpcode == ISD::MUL && isInt<12>(SplatStepVal)) ||
3657 (StepOpcode == ISD::SHL && isUInt<5>(SplatStepVal))) &&
3658 isPowerOf2_32(StepDenominator) &&
3659 (SplatStepVal >= 0 || StepDenominator == 1) && isInt<5>(Addend)) {
3660      MVT VIDVT =
3661          VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT;
3662 MVT VIDContainerVT =
3663 getContainerForFixedLengthVector(DAG, VIDVT, Subtarget);
3664 SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, VIDContainerVT, Mask, VL);
3665 // Convert right out of the scalable type so we can use standard ISD
3666 // nodes for the rest of the computation. If we used scalable types with
3667 // these, we'd lose the fixed-length vector info and generate worse
3668 // vsetvli code.
3669 VID = convertFromScalableVector(VIDVT, VID, DAG, Subtarget);
3670 if ((StepOpcode == ISD::MUL && SplatStepVal != 1) ||
3671 (StepOpcode == ISD::SHL && SplatStepVal != 0)) {
3672 SDValue SplatStep = DAG.getConstant(SplatStepVal, DL, VIDVT);
3673 VID = DAG.getNode(StepOpcode, DL, VIDVT, VID, SplatStep);
3674 }
3675 if (StepDenominator != 1) {
3676 SDValue SplatStep =
3677 DAG.getConstant(Log2_64(StepDenominator), DL, VIDVT);
3678 VID = DAG.getNode(ISD::SRL, DL, VIDVT, VID, SplatStep);
3679 }
3680 if (Addend != 0 || Negate) {
3681 SDValue SplatAddend = DAG.getConstant(Addend, DL, VIDVT);
3682 VID = DAG.getNode(Negate ? ISD::SUB : ISD::ADD, DL, VIDVT, SplatAddend,
3683 VID);
3684 }
3685 if (VT.isFloatingPoint()) {
3686 // TODO: Use vfwcvt to reduce register pressure.
3687 VID = DAG.getNode(ISD::SINT_TO_FP, DL, VT, VID);
3688 }
3689 return VID;
3690 }
3691 }
3692
3693 // For very small build_vectors, use a single scalar insert of a constant.
3694 // TODO: Base this on constant rematerialization cost, not size.
3695 const unsigned EltBitSize = VT.getScalarSizeInBits();
3696 if (VT.getSizeInBits() <= 32 &&
3697      ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
3698    MVT ViaIntVT = MVT::getIntegerVT(VT.getSizeInBits());
3699 assert((ViaIntVT == MVT::i16 || ViaIntVT == MVT::i32) &&
3700 "Unexpected sequence type");
3701 // If we can use the original VL with the modified element type, this
3702 // means we only have a VTYPE toggle, not a VL toggle. TODO: Should this
3703 // be moved into InsertVSETVLI?
3704 unsigned ViaVecLen =
3705 (Subtarget.getRealMinVLen() >= VT.getSizeInBits() * NumElts) ? NumElts : 1;
3706 MVT ViaVecVT = MVT::getVectorVT(ViaIntVT, ViaVecLen);
3707
3708 uint64_t EltMask = maskTrailingOnes<uint64_t>(EltBitSize);
3709 uint64_t SplatValue = 0;
3710 // Construct the amalgamated value at this larger vector type.
3711 for (const auto &OpIdx : enumerate(Op->op_values())) {
3712 const auto &SeqV = OpIdx.value();
3713 if (!SeqV.isUndef())
3714 SplatValue |=
3715 ((SeqV->getAsZExtVal() & EltMask) << (OpIdx.index() * EltBitSize));
3716 }
3717
3718    // On RV64, sign-extend from 32 to 64 bits where possible in order to
3719    // achieve better constant materialization.
3720 if (Subtarget.is64Bit() && ViaIntVT == MVT::i32)
3721 SplatValue = SignExtend64<32>(SplatValue);
3722
3723 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ViaVecVT,
3724 DAG.getUNDEF(ViaVecVT),
3725 DAG.getConstant(SplatValue, DL, XLenVT),
3726 DAG.getVectorIdxConstant(0, DL));
3727 if (ViaVecLen != 1)
3728      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
3729                        MVT::getVectorVT(ViaIntVT, 1), Vec,
3730 DAG.getConstant(0, DL, XLenVT));
3731 return DAG.getBitcast(VT, Vec);
3732 }
3733
3734
3735 // Attempt to detect "hidden" splats, which only reveal themselves as splats
3736 // when re-interpreted as a vector with a larger element type. For example,
3737 // v4i16 = build_vector i16 0, i16 1, i16 0, i16 1
3738 // could be instead splat as
3739 // v2i32 = build_vector i32 0x00010000, i32 0x00010000
3740 // TODO: This optimization could also work on non-constant splats, but it
3741 // would require bit-manipulation instructions to construct the splat value.
3742 SmallVector<SDValue> Sequence;
3743 const auto *BV = cast<BuildVectorSDNode>(Op);
3744 if (VT.isInteger() && EltBitSize < Subtarget.getELen() &&
3745      ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
3746      BV->getRepeatedSequence(Sequence) &&
3747 (Sequence.size() * EltBitSize) <= Subtarget.getELen()) {
3748 unsigned SeqLen = Sequence.size();
3749 MVT ViaIntVT = MVT::getIntegerVT(EltBitSize * SeqLen);
3750 assert((ViaIntVT == MVT::i16 || ViaIntVT == MVT::i32 ||
3751 ViaIntVT == MVT::i64) &&
3752 "Unexpected sequence type");
3753
3754 // If we can use the original VL with the modified element type, this
3755 // means we only have a VTYPE toggle, not a VL toggle. TODO: Should this
3756 // be moved into InsertVSETVLI?
3757 const unsigned RequiredVL = NumElts / SeqLen;
3758 const unsigned ViaVecLen =
3759 (Subtarget.getRealMinVLen() >= ViaIntVT.getSizeInBits() * NumElts) ?
3760 NumElts : RequiredVL;
3761 MVT ViaVecVT = MVT::getVectorVT(ViaIntVT, ViaVecLen);
3762
3763 unsigned EltIdx = 0;
3764 uint64_t EltMask = maskTrailingOnes<uint64_t>(EltBitSize);
3765 uint64_t SplatValue = 0;
3766 // Construct the amalgamated value which can be splatted as this larger
3767 // vector type.
3768 for (const auto &SeqV : Sequence) {
3769 if (!SeqV.isUndef())
3770 SplatValue |=
3771 ((SeqV->getAsZExtVal() & EltMask) << (EltIdx * EltBitSize));
3772 EltIdx++;
3773 }
3774
3775 // On RV64, sign-extend from 32 to 64 bits where possible in order to
3776 // achieve better constant materialization.
3777 if (Subtarget.is64Bit() && ViaIntVT == MVT::i32)
3778 SplatValue = SignExtend64<32>(SplatValue);
3779
3780 // Since we can't introduce illegal i64 types at this stage, we can only
3781 // perform an i64 splat on RV32 if it is its own sign-extended value. That
3782 // way we can use RVV instructions to splat.
3783 assert((ViaIntVT.bitsLE(XLenVT) ||
3784 (!Subtarget.is64Bit() && ViaIntVT == MVT::i64)) &&
3785 "Unexpected bitcast sequence");
3786 if (ViaIntVT.bitsLE(XLenVT) || isInt<32>(SplatValue)) {
3787 SDValue ViaVL =
3788 DAG.getConstant(ViaVecVT.getVectorNumElements(), DL, XLenVT);
3789 MVT ViaContainerVT =
3790 getContainerForFixedLengthVector(DAG, ViaVecVT, Subtarget);
3791 SDValue Splat =
3792 DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ViaContainerVT,
3793 DAG.getUNDEF(ViaContainerVT),
3794 DAG.getConstant(SplatValue, DL, XLenVT), ViaVL);
3795 Splat = convertFromScalableVector(ViaVecVT, Splat, DAG, Subtarget);
3796 if (ViaVecLen != RequiredVL)
3797 Splat = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
3798 MVT::getVectorVT(ViaIntVT, RequiredVL), Splat,
3799 DAG.getConstant(0, DL, XLenVT));
3800 return DAG.getBitcast(VT, Splat);
3801 }
3802 }
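// Continuing the example above: for v4i16 <0, 1, 0, 1> the repeated sequence
// is {0, 1} and ViaIntVT is i32, so the amalgamated i32 value 0x00010000 is
// splatted with VMV_V_X_VL and the result is bitcast back to v4i16.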
3803
3804 // If the number of sign bits allows, see if we can lower as a <N x i8>.
3805 // Our main goal here is to reduce LMUL (and thus work) required to
3806 // build the constant, but we will also narrow if the resulting
3807 // narrow vector is known to materialize cheaply.
3808 // TODO: We really should be costing the smaller vector. There are
3809 // profitable cases this misses.
3810 if (EltBitSize > 8 && VT.isInteger() &&
3811 (NumElts <= 4 || VT.getSizeInBits() > Subtarget.getRealMinVLen())) {
3812 unsigned SignBits = DAG.ComputeNumSignBits(Op);
3813 if (EltBitSize - SignBits < 8) {
3814 SDValue Source = DAG.getBuildVector(VT.changeVectorElementType(MVT::i8),
3815 DL, Op->ops());
3816 Source = convertToScalableVector(ContainerVT.changeVectorElementType(MVT::i8),
3817 Source, DAG, Subtarget);
3818 SDValue Res = DAG.getNode(RISCVISD::VSEXT_VL, DL, ContainerVT, Source, Mask, VL);
3819 return convertFromScalableVector(VT, Res, DAG, Subtarget);
3820 }
3821 }
3822
3823 if (SDValue Res = lowerBuildVectorViaDominantValues(Op, DAG, Subtarget))
3824 return Res;
3825
3826 // For constant vectors, use generic constant pool lowering. Otherwise,
3827 // we'd have to materialize constants in GPRs just to move them into the
3828 // vector.
3829 return SDValue();
3830}
3831
3832static unsigned getPACKOpcode(unsigned DestBW,
3833 const RISCVSubtarget &Subtarget) {
3834 switch (DestBW) {
3835 default:
3836 llvm_unreachable("Unsupported pack size");
3837 case 16:
3838 return RISCV::PACKH;
3839 case 32:
3840 return Subtarget.is64Bit() ? RISCV::PACKW : RISCV::PACK;
3841 case 64:
3842 assert(Subtarget.is64Bit());
3843 return RISCV::PACK;
3844 }
3845}
3846
3847/// Double the element size of the build vector to reduce the number
3848 /// of vslide1down operations in the build vector chain. In the worst
3849 /// case, this trades three scalar operations for one vector operation.
3850 /// Scalar operations are generally lower latency, and for out-of-order
3851 /// cores we also benefit from additional parallelism.
3852 static SDValue lowerBuildVectorViaPacking(SDValue Op, SelectionDAG &DAG,
3853 const RISCVSubtarget &Subtarget) {
3854 SDLoc DL(Op);
3855 MVT VT = Op.getSimpleValueType();
3856 assert(VT.isFixedLengthVector() && "Unexpected vector!");
3857 MVT ElemVT = VT.getVectorElementType();
3858 if (!ElemVT.isInteger())
3859 return SDValue();
3860
3861 // TODO: Relax these architectural restrictions, possibly with costing
3862 // of the actual instructions required.
3863 if (!Subtarget.hasStdExtZbb() || !Subtarget.hasStdExtZba())
3864 return SDValue();
3865
3866 unsigned NumElts = VT.getVectorNumElements();
3867 unsigned ElemSizeInBits = ElemVT.getSizeInBits();
3868 if (ElemSizeInBits >= std::min(Subtarget.getELen(), Subtarget.getXLen()) ||
3869 NumElts % 2 != 0)
3870 return SDValue();
3871
3872 // Produce [B,A] packed into a type twice as wide. Note that all
3873 // scalars are XLenVT, possibly masked (see below).
3874 MVT XLenVT = Subtarget.getXLenVT();
3875 SDValue Mask = DAG.getConstant(
3876 APInt::getLowBitsSet(XLenVT.getSizeInBits(), ElemSizeInBits), DL, XLenVT);
3877 auto pack = [&](SDValue A, SDValue B) {
3878 // Bias the scheduling of the inserted operations to near the
3879 // definition of the element - this tends to reduce register
3880 // pressure overall.
3881 SDLoc ElemDL(B);
3882 if (Subtarget.hasStdExtZbkb())
3883 // Note that we're relying on the high bits of the result being
3884 // don't care. For PACKW, the result is *sign* extended.
3885 return SDValue(
3886 DAG.getMachineNode(getPACKOpcode(ElemSizeInBits * 2, Subtarget),
3887 ElemDL, XLenVT, A, B),
3888 0);
3889
3890 A = DAG.getNode(ISD::AND, SDLoc(A), XLenVT, A, Mask);
3891 B = DAG.getNode(ISD::AND, SDLoc(B), XLenVT, B, Mask);
3892 SDValue ShtAmt = DAG.getConstant(ElemSizeInBits, ElemDL, XLenVT);
3893 SDNodeFlags Flags;
3894 Flags.setDisjoint(true);
3895 return DAG.getNode(ISD::OR, ElemDL, XLenVT, A,
3896 DAG.getNode(ISD::SHL, ElemDL, XLenVT, B, ShtAmt), Flags);
3897 };
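// For example, with ElemSizeInBits == 8, pack(A, B) places A's low byte in
// bits [7:0] and B's low byte in bits [15:8] (0xBBAA for A = 0xAA, B = 0xBB),
// either via PACKH or via the masked shift-and-or fallback above.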
3898
3899 SmallVector<SDValue> NewOperands;
3900 NewOperands.reserve(NumElts / 2);
3901 for (unsigned i = 0; i < VT.getVectorNumElements(); i += 2)
3902 NewOperands.push_back(pack(Op.getOperand(i), Op.getOperand(i + 1)));
3903 assert(NumElts == NewOperands.size() * 2);
3904 MVT WideVT = MVT::getIntegerVT(ElemSizeInBits * 2);
3905 MVT WideVecVT = MVT::getVectorVT(WideVT, NumElts / 2);
3906 return DAG.getNode(ISD::BITCAST, DL, VT,
3907 DAG.getBuildVector(WideVecVT, DL, NewOperands));
3908}
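// Roughly speaking, with Zba and Zbb available (and ideally Zbkb for PACK*),
// a v8i8 build_vector is repacked here as a v4i16 build_vector of paired
// elements, halving the length of any subsequent vslide1down chain.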
3909
3910 // Convert a vXf16 build_vector to vXi16 with bitcasts.
3911 static SDValue lowerBUILD_VECTORvXf16(SDValue Op, SelectionDAG &DAG) {
3912 MVT VT = Op.getSimpleValueType();
3913 MVT IVT = VT.changeVectorElementType(MVT::i16);
3914 SmallVector<SDValue, 16> NewOps(Op.getNumOperands());
3915 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
3916 NewOps[I] = DAG.getBitcast(MVT::i16, Op.getOperand(I));
3917 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), IVT, NewOps);
3918 return DAG.getBitcast(VT, Res);
3919}
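// For example, when scalar f16 is unavailable (no Zfhmin), a v4f16
// build_vector is rebuilt as a v4i16 build_vector of the bitcast i16 scalars
// and the result is bitcast back to v4f16 as a whole.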
3920
3921 static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
3922 const RISCVSubtarget &Subtarget) {
3923 MVT VT = Op.getSimpleValueType();
3924 assert(VT.isFixedLengthVector() && "Unexpected vector!");
3925
3926 // If we don't have scalar f16, we need to bitcast to an i16 vector.
3927 if (VT.getVectorElementType() == MVT::f16 &&
3928 !Subtarget.hasStdExtZfhmin())
3929 return lowerBUILD_VECTORvXf16(Op, DAG);
3930
3931 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
3932 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()))
3933 return lowerBuildVectorOfConstants(Op, DAG, Subtarget);
3934
3935 MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
3936
3937 SDLoc DL(Op);
3938 auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
3939
3940 MVT XLenVT = Subtarget.getXLenVT();
3941
3942 if (VT.getVectorElementType() == MVT::i1) {
3943 // A BUILD_VECTOR can be lowered as a SETCC. For each fixed-length mask
3944 // vector type, we have a legal equivalently-sized i8 type, so we can use
3945 // that.
3946 MVT WideVecVT = VT.changeVectorElementType(MVT::i8);
3947 SDValue VecZero = DAG.getConstant(0, DL, WideVecVT);
3948
3949 SDValue WideVec;
3950 if (SDValue Splat = cast<BuildVectorSDNode>(Op)->getSplatValue()) {
3951 // For a splat, perform a scalar truncate before creating the wider
3952 // vector.
3953 Splat = DAG.getNode(ISD::AND, DL, Splat.getValueType(), Splat,
3954 DAG.getConstant(1, DL, Splat.getValueType()));
3955 WideVec = DAG.getSplatBuildVector(WideVecVT, DL, Splat);
3956 } else {
3957 SmallVector<SDValue, 8> Ops(Op->op_values());
3958 WideVec = DAG.getBuildVector(WideVecVT, DL, Ops);
3959 SDValue VecOne = DAG.getConstant(1, DL, WideVecVT);
3960 WideVec = DAG.getNode(ISD::AND, DL, WideVecVT, WideVec, VecOne);
3961 }
3962
3963 return DAG.getSetCC(DL, VT, WideVec, VecZero, ISD::SETNE);
3964 }
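// For example, the mask vector v4i1 <1, 0, 1, 1> is built as the i8 vector
// <1, 0, 1, 1>, masked down to bit 0 with an AND, and converted to a mask via
// a setcc-ne against zero.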
3965
3966 if (SDValue Splat = cast<BuildVectorSDNode>(Op)->getSplatValue()) {
3967 if (auto Gather = matchSplatAsGather(Splat, VT, DL, DAG, Subtarget))
3968 return Gather;
3969 unsigned Opc = VT.isFloatingPoint() ? RISCVISD::VFMV_V_F_VL
3970 : RISCVISD::VMV_V_X_VL;
3971 if (!VT.isFloatingPoint())
3972 Splat = DAG.getNode(ISD::ANY_EXTEND, DL, XLenVT, Splat);
3973 Splat =
3974 DAG.getNode(Opc, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Splat, VL);
3975 return convertFromScalableVector(VT, Splat, DAG, Subtarget);
3976 }
3977
3978 if (SDValue Res = lowerBuildVectorViaDominantValues(Op, DAG, Subtarget))
3979 return Res;
3980
3981 // If we're compiling for an exact VLEN value, we can split our work per
3982 // register in the register group.
3983 if (const auto VLen = Subtarget.getRealVLen();
3984 VLen && VT.getSizeInBits().getKnownMinValue() > *VLen) {
3985 MVT ElemVT = VT.getVectorElementType();
3986 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
3987 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
3988 MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
3989 MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
3990 assert(M1VT == getLMUL1VT(M1VT));
3991
3992 // The following semantically builds up a fixed length concat_vector
3993 // of the component build_vectors. We eagerly lower to scalable and
3994 // insert_subvector here to avoid DAG combining it back to a large
3995 // build_vector.
3996 SmallVector<SDValue> BuildVectorOps(Op->ops());
3997 unsigned NumOpElts = M1VT.getVectorMinNumElements();
3998 SDValue Vec = DAG.getUNDEF(ContainerVT);
3999 for (unsigned i = 0; i < VT.getVectorNumElements(); i += ElemsPerVReg) {
4000 auto OneVRegOfOps = ArrayRef(BuildVectorOps).slice(i, ElemsPerVReg);
4001 SDValue SubBV =
4002 DAG.getNode(ISD::BUILD_VECTOR, DL, OneRegVT, OneVRegOfOps);
4003 SubBV = convertToScalableVector(M1VT, SubBV, DAG, Subtarget);
4004 unsigned InsertIdx = (i / ElemsPerVReg) * NumOpElts;
4005 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, SubBV,
4006 DAG.getVectorIdxConstant(InsertIdx, DL));
4007 }
4008 return convertFromScalableVector(VT, Vec, DAG, Subtarget);
4009 }
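// For example, with a known VLEN of 128 a v8i64 build_vector is built as four
// one-register v2i64 build_vectors, each converted to scalable form and
// inserted at consecutive register boundaries of the wider container.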
4010
4011 // If we're about to resort to vslide1down (or stack usage), pack our
4012 // elements into the widest scalar type we can. This will force a VL/VTYPE
4013 // toggle, but reduces the critical path, the number of vslide1down ops
4014 // required, and possibly enables scalar folds of the values.
4015 if (SDValue Res = lowerBuildVectorViaPacking(Op, DAG, Subtarget))
4016 return Res;
4017
4018 // For m1 vectors, if we have non-undef values in both halves of our vector,
4019 // split the vector into low and high halves, build them separately, then
4020 // use a vselect to combine them. For long vectors, this cuts the critical
4021 // path of the vslide1down sequence in half, and gives us an opportunity
4022 // to special case each half independently. Note that we don't change the
4023 // length of the sub-vectors here, so if both fall back to the generic
4024 // vslide1down path, we should be able to fold the vselect into the final
4025 // vslidedown (for the undef tail) for the first half w/ masking.
4026 unsigned NumElts = VT.getVectorNumElements();
4027 unsigned NumUndefElts =
4028 count_if(Op->op_values(), [](const SDValue &V) { return V.isUndef(); });
4029 unsigned NumDefElts = NumElts - NumUndefElts;
4030 if (NumDefElts >= 8 && NumDefElts > NumElts / 2 &&
4031 ContainerVT.bitsLE(getLMUL1VT(ContainerVT))) {
4032 SmallVector<SDValue> SubVecAOps, SubVecBOps;
4033 SmallVector<SDValue> MaskVals;
4034 SDValue UndefElem = DAG.getUNDEF(Op->getOperand(0)->getValueType(0));
4035 SubVecAOps.reserve(NumElts);
4036 SubVecBOps.reserve(NumElts);
4037 for (unsigned i = 0; i < NumElts; i++) {
4038 SDValue Elem = Op->getOperand(i);
4039 if (i < NumElts / 2) {
4040 SubVecAOps.push_back(Elem);
4041 SubVecBOps.push_back(UndefElem);
4042 } else {
4043 SubVecAOps.push_back(UndefElem);
4044 SubVecBOps.push_back(Elem);
4045 }
4046 bool SelectMaskVal = (i < NumElts / 2);
4047 MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
4048 }
4049 assert(SubVecAOps.size() == NumElts && SubVecBOps.size() == NumElts &&
4050 MaskVals.size() == NumElts);
4051
4052 SDValue SubVecA = DAG.getBuildVector(VT, DL, SubVecAOps);
4053 SDValue SubVecB = DAG.getBuildVector(VT, DL, SubVecBOps);
4054 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4055 SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
4056 return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, SubVecA, SubVecB);
4057 }
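// For example, a fully-defined v16i8 is split so elements 0-7 populate SubVecA
// and elements 8-15 populate SubVecB (undef elsewhere); the constant mask then
// selects the low half from SubVecA and the high half from SubVecB.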
4058
4059 // Cap the cost at a value linear to the number of elements in the vector.
4060 // The default lowering is to use the stack. The vector store + scalar loads
4061 // is linear in VL. However, at high LMULs vslide1down and vslidedown end up
4062 // being (at least) linear in LMUL. As a result, using the vslidedown
4063 // lowering for every element ends up costing VL*LMUL.
4064 // TODO: Should we be directly costing the stack alternative? Doing so might
4065 // give us a more accurate upper bound.
4066 InstructionCost LinearBudget = VT.getVectorNumElements() * 2;
4067
4068 // TODO: unify with TTI getSlideCost.
4069 InstructionCost PerSlideCost = 1;
4070 switch (RISCVTargetLowering::getLMUL(ContainerVT)) {
4071 default: break;
4072 case RISCVII::VLMUL::LMUL_2:
4073 PerSlideCost = 2;
4074 break;
4075 case RISCVII::VLMUL::LMUL_4:
4076 PerSlideCost = 4;
4077 break;
4078 case RISCVII::VLMUL::LMUL_8:
4079 PerSlideCost = 8;
4080 break;
4081 }
4082
4083 // TODO: Should we be using the build instseq then cost + evaluate scheme
4084 // we use for integer constants here?
4085 unsigned UndefCount = 0;
4086 for (const SDValue &V : Op->ops()) {
4087 if (V.isUndef()) {
4088 UndefCount++;
4089 continue;
4090 }
4091 if (UndefCount) {
4092 LinearBudget -= PerSlideCost;
4093 UndefCount = 0;
4094 }
4095 LinearBudget -= PerSlideCost;
4096 }
4097 if (UndefCount) {
4098 LinearBudget -= PerSlideCost;
4099 }
4100
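// For example, an 8-element vector with no undef elements has LinearBudget =
// 16 and spends PerSlideCost per element: 8 at LMUL1 (within budget), but 64
// at LMUL8, which exhausts the budget and falls back to the default (stack)
// lowering below.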
4101 if (LinearBudget < 0)
4102 return SDValue();
4103
4104 assert((!VT.isFloatingPoint() ||
4105 VT.getVectorElementType().getSizeInBits() <= Subtarget.getFLen()) &&
4106 "Illegal type which will result in reserved encoding");
4107
4108 const unsigned Policy = RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC;
4109
4110 SDValue Vec;
4111 UndefCount = 0;
4112 for (SDValue V : Op->ops()) {
4113 if (V.isUndef()) {
4114 UndefCount++;
4115 continue;
4116 }
4117
4118 // Start our sequence with a TA splat in the hopes that hardware is able to
4119 // recognize there's no dependency on the prior value of our temporary
4120 // register.
4121 if (!Vec) {
4122 Vec = DAG.getSplatVector(VT, DL, V);
4123 Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
4124 UndefCount = 0;
4125 continue;
4126 }
4127
4128 if (UndefCount) {
4129 const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
4130 Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
4131 Vec, Offset, Mask, VL, Policy);
4132 UndefCount = 0;
4133 }
4134 auto OpCode =
4135 VT.isFloatingPoint() ? RISCVISD::VFSLIDE1DOWN_VL : RISCVISD::VSLIDE1DOWN_VL;
4136 if (!VT.isFloatingPoint())
4137 V = DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getXLenVT(), V);
4138 Vec = DAG.getNode(OpCode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
4139 V, Mask, VL);
4140 }
4141 if (UndefCount) {
4142 const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
4143 Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
4144 Vec, Offset, Mask, VL, Policy);
4145 }
4146 return convertFromScalableVector(VT, Vec, DAG, Subtarget);
4147}
4148
4149 static SDValue splatPartsI64WithVL(const SDLoc &DL, MVT VT, SDValue Passthru,
4150 SDValue Lo, SDValue Hi, SDValue VL,
4151 SelectionDAG &DAG) {
4152 if (!Passthru)
4153 Passthru = DAG.getUNDEF(VT);
4154 if (isa<ConstantSDNode>(Lo) && isa<ConstantSDNode>(Hi)) {
4155 int32_t LoC = cast<ConstantSDNode>(Lo)->getSExtValue();
4156 int32_t HiC = cast<ConstantSDNode>(Hi)->getSExtValue();
4157 // If the Hi constant is just Lo's sign bit replicated (a sign-extended Lo),
4158 // lower this as a custom node to try to match RVV vector/scalar instructions.
4159 if ((LoC >> 31) == HiC)
4160 return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Passthru, Lo, VL);
4161
4162 // If VL is equal to VLMAX or fits in 4 bits, and the Hi constant is equal
4163 // to Lo, we can use a vmv.v.x with EEW = 32 to lower it. This lets us use a
4164 // vlmax vsetvli or vsetivli to change the VL.
4165 // FIXME: Support larger constants?
4166 // FIXME: Support non-constant VLs by saturating?
4167 if (LoC == HiC) {
4168 SDValue NewVL;
4169 if (isAllOnesConstant(VL) ||
4170 (isa<RegisterSDNode>(VL) &&
4171 cast<RegisterSDNode>(VL)->getReg() == RISCV::X0))
4172 NewVL = DAG.getRegister(RISCV::X0, MVT::i32);
4173 else if (isa<ConstantSDNode>(VL) && isUInt<4>(VL->getAsZExtVal()))
4174 NewVL = DAG.getNode(ISD::ADD, DL, VL.getValueType(), VL, VL);
4175
4176 if (NewVL) {
4177 MVT InterVT =
4178 MVT::getVectorVT(MVT::i32, VT.getVectorElementCount() * 2);
4179 auto InterVec = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, InterVT,
4180 DAG.getUNDEF(InterVT), Lo, NewVL);
4181 return DAG.getNode(ISD::BITCAST, DL, VT, InterVec);
4182 }
4183 }
4184 }
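// For example, splatting the i64 constant 5 on RV32 has Lo = 5 and Hi = 0;
// since (5 >> 31) == 0 == HiC, the VMV_V_X_VL path above handles it without
// any 64-bit materialization.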
4185
4186 // Detect cases where Hi is (SRA Lo, 31) which means Hi is Lo sign extended.
4187 if (Hi.getOpcode() == ISD::SRA && Hi.getOperand(0) == Lo &&
4188 isa<ConstantSDNode>(Hi.getOperand(1)) &&
4189 Hi.getConstantOperandVal(1) == 31)
4190 return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Passthru, Lo, VL);
4191
4192 // If the hi bits of the splat are undefined, then it's fine to just splat Lo
4193 // even if it might be sign extended.
4194 if (Hi.isUndef())
4195 return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Passthru, Lo, VL);
4196
4197 // Fall back to a stack store and stride x0 vector load.
4198 return DAG.getNode(RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL, DL, VT, Passthru, Lo,
4199 Hi, VL);
4200}
4201
4202// Called by type legalization to handle splat of i64 on RV32.
4203// FIXME: We can optimize this when the type has sign or zero bits in one
4204// of the halves.
4205static SDValue splatSplitI64WithVL(const SDLoc &DL, MVT VT, SDValue Passthru,