LLVM 23.0.0git
X86ISelLowering.cpp
Go to the documentation of this file.
1// I
2//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
3//
4// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5// See https://llvm.org/LICENSE.txt for license information.
6// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#include "X86ISelLowering.h"
17#include "X86.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
22#include "X86TargetMachine.h"
24#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus be split in multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
// Hidden command-line knob: when enabled (the default), the X86 lowering
// replaces narrow shift operations with equivalent wider shifts.
static cl::opt<bool>
    WidenShift("x86-widen-shift", cl::init(true),
               cl::desc("Replace narrow shifts with wider shifts."),
               cl::Hidden);
102
104 "x86-br-merging-likely-bias", cl::init(0),
105 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
111 "branches."),
112 cl::Hidden);
113
115 "x86-br-merging-unlikely-bias", cl::init(-1),
116 cl::desc(
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if its known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
123 "branches."),
124 cl::Hidden);
125
127 "mul-constant-optimization", cl::init(true),
128 cl::desc("Replace 'mul x, Const' with more effective instructions like "
129 "SHIFT, LEA, etc."),
130 cl::Hidden);
131
133 const X86Subtarget &STI)
134 : TargetLowering(TM, STI), Subtarget(STI) {
135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
136 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
137
138 // Set up the TargetLowering object.
139
140 // X86 is weird. It always uses i8 for shift amounts and setcc results.
142 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
144
145 // X86 instruction cache is coherent with its data cache so we can use the
146 // default expansion to a no-op.
148
149 // For 64-bit, since we have so many registers, use the ILP scheduler.
150 // For 32-bit, use the register pressure specific scheduling.
151 // For Atom, always use ILP scheduling.
152 if (Subtarget.isAtom())
154 else if (Subtarget.is64Bit())
156 else
158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
159 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
160
161 // Bypass expensive divides and use cheaper ones.
162 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
163 if (Subtarget.hasSlowDivide32())
164 addBypassSlowDiv(32, 8);
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
166 addBypassSlowDiv(64, 32);
167 }
168
169 if (Subtarget.canUseCMPXCHG16B())
171 else if (Subtarget.canUseCMPXCHG8B())
173 else
175
176 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
177
179
180 // Set up the register classes.
181 addRegisterClass(MVT::i8, &X86::GR8RegClass);
182 addRegisterClass(MVT::i16, &X86::GR16RegClass);
183 addRegisterClass(MVT::i32, &X86::GR32RegClass);
184 if (Subtarget.is64Bit())
185 addRegisterClass(MVT::i64, &X86::GR64RegClass);
186
187 for (MVT VT : MVT::integer_valuetypes())
189
190 // We don't accept any truncstore of integer registers.
191 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
193 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
194 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
197
198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
199
200 // SETOEQ and SETUNE require checking two conditions.
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204 }
205
206 // Integer absolute.
207 if (Subtarget.canUseCMOV()) {
208 setOperationAction(ISD::ABS , MVT::i16 , Custom);
209 setOperationAction(ISD::ABS , MVT::i32 , Custom);
210 if (Subtarget.is64Bit())
211 setOperationAction(ISD::ABS , MVT::i64 , Custom);
212 }
213
214 // Absolute difference.
215 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
216 setOperationAction(Op , MVT::i8 , Custom);
217 setOperationAction(Op , MVT::i16 , Custom);
218 setOperationAction(Op , MVT::i32 , Custom);
219 if (Subtarget.is64Bit())
220 setOperationAction(Op , MVT::i64 , Custom);
221 }
222
223 // Signed saturation subtraction.
227 if (Subtarget.is64Bit())
229
230 // Funnel shifts.
231 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
232 // For slow shld targets we only lower for code size.
233 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
234
235 setOperationAction(ShiftOp , MVT::i8 , Custom);
236 setOperationAction(ShiftOp , MVT::i16 , Custom);
237 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
238 if (Subtarget.is64Bit())
239 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
240 }
241
242 if (!Subtarget.useSoftFloat()) {
243 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
244 // operation.
249 // We have an algorithm for SSE2, and we turn this into a 64-bit
250 // FILD or VCVTUSI2SS/SD for other targets.
253 // We have an algorithm for SSE2->double, and we turn this into a
254 // 64-bit FILD followed by conditional FADD for other targets.
257
258 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
259 // this operation.
262 // SSE has no i16 to fp conversion, only i32. We promote in the handler
263 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
273
274 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
275 // this operation.
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
283 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
284 // are Legal, f80 is custom lowered.
287
288 // Handle FP_TO_UINT by promoting the destination to a larger signed
289 // conversion.
291 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300
305
306 if (!Subtarget.is64Bit() && Subtarget.hasX87()) {
309 }
310 }
311
312 if (Subtarget.hasSSE2()) {
313 // Custom lowering for saturating float to int conversions.
314 // We handle promotion to larger result types manually.
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 }
321 if (Subtarget.is64Bit()) {
324 }
325 }
326 if (Subtarget.hasAVX10_2()) {
331 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
332 MVT::v4i64}) {
335 }
336 if (Subtarget.is64Bit()) {
339 }
340 }
341
342 // Handle address space casts between mixed sized pointers.
345
346 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
347 if (!Subtarget.hasSSE2()) {
352 if (Subtarget.is64Bit()) {
354 // Without SSE, i64->f64 goes through memory.
356 }
357 } else if (!Subtarget.is64Bit())
359
360 // Scalar integer divide and remainder are lowered to use operations that
361 // produce two results, to match the available instructions. This exposes
362 // the two-result form to trivial CSE, which is able to combine x/y and x%y
363 // into a single instruction.
364 //
365 // Scalar integer multiply-high is also lowered to use two-result
366 // operations, to match the available instructions. However, plain multiply
367 // (low) operations are left as Legal, as there are single-result
368 // instructions for this in x86. Using the two-result multiply instructions
369 // when both high and low results are needed must be arranged by dagcombine.
370 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
377 }
378
379 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
381 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
382 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
385 }
386 if (Subtarget.is64Bit())
391
396
397 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
403 }
404
405 // Promote the i8 variants and force them on up to i32 which has a shorter
406 // encoding.
407 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
409 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
410 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
411 // promote that too.
412 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
414
415 if (!Subtarget.hasBMI()) {
416 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
418 if (Subtarget.is64Bit()) {
419 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
421 }
422 }
423
424 if (Subtarget.hasLZCNT()) {
425 // When promoting the i8 variants, force them to i32 for a shorter
426 // encoding.
427 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
429 } else {
430 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
431 if (VT == MVT::i64 && !Subtarget.is64Bit())
432 continue;
435 }
436 }
437
440 // Special handling for half-precision floating point conversions.
441 // If we don't have F16C support, then lower half float conversions
442 // into library calls.
444 Op, MVT::f32,
445 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
446 // There's never any support for operations beyond MVT::f32.
447 setOperationAction(Op, MVT::f64, Expand);
448 setOperationAction(Op, MVT::f80, Expand);
449 setOperationAction(Op, MVT::f128, Expand);
450 }
451
452 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
455 }
456
457 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
459 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
460 setTruncStoreAction(VT, MVT::f16, Expand);
461 setTruncStoreAction(VT, MVT::bf16, Expand);
462
465 }
466
470 if (Subtarget.is64Bit())
472 if (Subtarget.hasPOPCNT()) {
473 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
474 // popcntw is longer to encode than popcntl and also has a false dependency
475 // on the dest that popcntl hasn't had since Cannon Lake.
476 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
477 } else {
482 }
483
485
486 if (!Subtarget.hasMOVBE())
488
489 // X86 wants to expand cmov itself.
490 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
495 }
496 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
497 if (VT == MVT::i64 && !Subtarget.is64Bit())
498 continue;
501 }
502
504
505 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
508
510 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
511 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
515
516 // Darwin ABI issue.
517 for (auto VT : { MVT::i32, MVT::i64 }) {
518 if (VT == MVT::i64 && !Subtarget.is64Bit())
519 continue;
526 }
527
528 // 64-bit shl, sra, srl (iff 32-bit x86)
529 for (auto VT : { MVT::i32, MVT::i64 }) {
530 if (VT == MVT::i64 && !Subtarget.is64Bit())
531 continue;
535 }
536
537 if (Subtarget.hasSSEPrefetch())
539
541
542 // Expand certain atomics
543 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
551 }
552
553 if (!Subtarget.is64Bit())
555
556 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
557 // All CPUs supporting AVX will atomically load/store aligned 128-bit
558 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
561 }
562
563 if (Subtarget.canUseCMPXCHG16B())
565
566 // FIXME - use subtarget debug flags
567 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
568 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
569 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
571 }
572
575
578
579 setOperationAction(ISD::TRAP, MVT::Other, Legal);
581 if (Subtarget.isTargetPS())
583 else
585
586 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
588 setOperationAction(ISD::VAEND , MVT::Other, Expand);
589 bool Is64Bit = Subtarget.is64Bit();
590 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
591 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
592
595
597
598 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
601
603
604 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
605 setOperationAction(ISD::FABS, VT, Action);
606 setOperationAction(ISD::FNEG, VT, Action);
608 setOperationAction(ISD::FREM, VT, Action);
609 setOperationAction(ISD::FMA, VT, Action);
610 setOperationAction(ISD::FMINNUM, VT, Action);
611 setOperationAction(ISD::FMAXNUM, VT, Action);
616 setOperationAction(ISD::FSIN, VT, Action);
617 setOperationAction(ISD::FCOS, VT, Action);
618 setOperationAction(ISD::FSINCOS, VT, Action);
619 setOperationAction(ISD::FTAN, VT, Action);
620 setOperationAction(ISD::FSQRT, VT, Action);
621 setOperationAction(ISD::FPOW, VT, Action);
622 setOperationAction(ISD::FPOWI, VT, Action);
623 setOperationAction(ISD::FLOG, VT, Action);
624 setOperationAction(ISD::FLOG2, VT, Action);
625 setOperationAction(ISD::FLOG10, VT, Action);
626 setOperationAction(ISD::FEXP, VT, Action);
627 setOperationAction(ISD::FEXP2, VT, Action);
628 setOperationAction(ISD::FEXP10, VT, Action);
629 setOperationAction(ISD::FCEIL, VT, Action);
630 setOperationAction(ISD::FFLOOR, VT, Action);
632 setOperationAction(ISD::FRINT, VT, Action);
633 setOperationAction(ISD::BR_CC, VT, Action);
634 setOperationAction(ISD::SETCC, VT, Action);
637 setOperationAction(ISD::FROUND, VT, Action);
639 setOperationAction(ISD::FTRUNC, VT, Action);
640 setOperationAction(ISD::FLDEXP, VT, Action);
642 };
643
644 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
645 // f16, f32 and f64 use SSE.
646 // Set up the FP register classes.
647 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
648 : &X86::FR16RegClass);
649 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
650 : &X86::FR32RegClass);
651 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
652 : &X86::FR64RegClass);
653
654 // Disable f32->f64 extload as we can only generate this in one instruction
655 // under optsize. So its easier to pattern match (fpext (load)) for that
656 // case instead of needing to emit 2 instructions for extload in the
657 // non-optsize case.
658 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
659
660 for (auto VT : { MVT::f32, MVT::f64 }) {
661 // Use ANDPD to simulate FABS.
663
664 // Use XORP to simulate FNEG.
666
667 // Use ANDPD and ORPD to simulate FCOPYSIGN.
669
670 // These might be better off as horizontal vector ops.
673
674 // We don't support sin/cos/fmod
678 }
679
680 // Half type will be promoted by default.
681 setF16Action(MVT::f16, Promote);
692
723
728
733
734 // Lower this to MOVMSK plus an AND.
737
738 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
739 (UseX87 || Is64Bit)) {
740 // Use SSE for f32, x87 for f64.
741 // Set up the FP register classes.
742 addRegisterClass(MVT::f32, &X86::FR32RegClass);
743 if (UseX87)
744 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
745
746 // Use ANDPS to simulate FABS.
748
749 // Use XORP to simulate FNEG.
751
752 if (UseX87)
754
755 // Use ANDPS and ORPS to simulate FCOPYSIGN.
756 if (UseX87)
759
760 // We don't support sin/cos/fmod
764
765 if (UseX87) {
766 // Always expand sin/cos functions even though x87 has an instruction.
770 }
771 } else if (UseX87) {
772 // f32 and f64 in x87.
773 // Set up the FP register classes.
774 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
775 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
776
777 for (auto VT : { MVT::f32, MVT::f64 }) {
780
781 // Always expand sin/cos functions even though x87 has an instruction.
785 }
786 }
787
788 // Expand FP32 immediates into loads from the stack, save special cases.
789 if (isTypeLegal(MVT::f32)) {
790 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
791 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
792 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
793 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
794 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
795 } else // SSE immediates.
796 addLegalFPImmediate(APFloat(+0.0f)); // xorps
797 }
798 // Expand FP64 immediates into loads from the stack, save special cases.
799 if (isTypeLegal(MVT::f64)) {
800 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
801 addLegalFPImmediate(APFloat(+0.0)); // FLD0
802 addLegalFPImmediate(APFloat(+1.0)); // FLD1
803 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
804 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
805 } else // SSE immediates.
806 addLegalFPImmediate(APFloat(+0.0)); // xorpd
807 }
808 // Support fp16 0 immediate.
809 if (isTypeLegal(MVT::f16))
810 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
811
812 // Handle constrained floating-point operations of scalar.
825
826 // We don't support FMA.
829
830 // f80 always uses X87.
831 if (UseX87) {
832 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
835 {
837 addLegalFPImmediate(TmpFlt); // FLD0
838 TmpFlt.changeSign();
839 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
840
841 bool ignored;
842 APFloat TmpFlt2(+1.0);
844 &ignored);
845 addLegalFPImmediate(TmpFlt2); // FLD1
846 TmpFlt2.changeSign();
847 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
848 }
849
850 // Always expand sin/cos functions even though x87 has an instruction.
851 // clang-format off
863 // clang-format on
864
876
877 // Handle constrained floating-point operations of scalar.
884 if (isTypeLegal(MVT::f16)) {
887 } else {
889 }
890 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
891 // as Custom.
893 }
894
895 // f128 uses xmm registers, but most operations require libcalls.
896 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
897 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
898 : &X86::VR128RegClass);
899
900 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
901
912
916
917 // clang-format off
925 // clang-format on
926 // No STRICT_FSINCOS
929
932 // We need to custom handle any FP_ROUND with an f128 input, but
933 // LegalizeDAG uses the result type to know when to run a custom handler.
934 // So we have to list all legal floating point result types here.
935 if (isTypeLegal(MVT::f32)) {
938 }
939 if (isTypeLegal(MVT::f64)) {
942 }
943 if (isTypeLegal(MVT::f80)) {
947 }
948
950
951 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
952 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
953 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
954 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
955 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
956 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
957 }
958
959 // Always use a library call for pow.
960 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
961 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
962 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
963 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
964
973
974 // Some FP actions are always expanded for vector types.
975 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
976 MVT::v4f32, MVT::v8f32, MVT::v16f32,
977 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
978 // clang-format off
992 // clang-format on
993 }
994
995 // First set operation action for all vector types to either promote
996 // (for widening) or expand (for scalarization). Then we will selectively
997 // turn on ones that can be effectively codegen'd.
1037 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1038 setTruncStoreAction(InnerVT, VT, Expand);
1039
1040 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1041 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1042
1043 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1044 // types, we have to deal with them whether we ask for Expansion or not.
1045 // Setting Expand causes its own optimisation problems though, so leave
1046 // them legal.
1047 if (VT.getVectorElementType() == MVT::i1)
1048 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1049
1050 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1051 // split/scalarized right now.
1052 if (VT.getVectorElementType() == MVT::f16 ||
1053 VT.getVectorElementType() == MVT::bf16)
1054 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1055 }
1056 }
1057
1058 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1059 // with -msoft-float, disable use of MMX as well.
1060 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1061 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1062 // No operations on x86mmx supported, everything uses intrinsics.
1063 }
1064
1065 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1066 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1067 : &X86::VR128RegClass);
1068
1073
1074 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1075 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1083
1084 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1085 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1087
1093 }
1094
1095 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1096 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1097 : &X86::VR128RegClass);
1098
1099 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1100 // registers cannot be used even for integer operations.
1101 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1102 : &X86::VR128RegClass);
1103 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1104 : &X86::VR128RegClass);
1105 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1106 : &X86::VR128RegClass);
1107 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1108 : &X86::VR128RegClass);
1109 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1110 : &X86::VR128RegClass);
1111
1112 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1117 }
1118
1119 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1120 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1125 }
1126
1127 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1128 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1129 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1130
1131 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1132 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1133 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1134 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1135 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1136 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1137 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1138 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1139 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1140 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1143
1144 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1145 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1146 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1147
1148 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1150 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1152
1153 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1154 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1155
1156 setOperationAction(ISD::AND, MVT::i128, Custom);
1157 setOperationAction(ISD::OR, MVT::i128, Custom);
1158 setOperationAction(ISD::XOR, MVT::i128, Custom);
1160
1161 if (Subtarget.hasPCLMUL()) {
1162 for (auto VT : {MVT::i64, MVT::v4i32, MVT::v2i64}) {
1165 }
1169 }
1170
1171 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1172 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1173 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1174 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1175 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1176 }
1177
1188
1193
1194 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1200
1201 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1202 // setcc all the way to isel and prefer SETGT in some isel patterns.
1205 }
1206
1207 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1208 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1213
1214 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1220 }
1221
1222 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1226
1227 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1228 continue;
1229
1232 }
1233 setF16Action(MVT::v8f16, Expand);
1234 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1235 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1236 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1237 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1238 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1239 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1241
1242 // Custom lower v2i64 and v2f64 selects.
1249
1256
1257 // Custom legalize these to avoid over promotion or custom promotion.
1258 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1263 }
1264
1269
1272
1275
1276 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1281
1286
1287 // We want to legalize this to an f64 load rather than an i64 load on
1288 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1289 // store.
1290 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1291 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1292 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1293 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1294 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1296
1297 // Add 32-bit vector stores to help vectorization opportunities.
1298 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1300
1304 if (!Subtarget.hasAVX512())
1306
1310
1312
1329
1330 // In the customized shift lowering, the legal v4i32/v2i64 cases
1331 // in AVX2 will be recognized.
1332 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1336 if (VT == MVT::v2i64) continue;
1341 }
1342
1348 }
1349
1350 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1355
1356 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1358 }
1359
1360 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1361 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
1362 }
1363
1364 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1365 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1366 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1367 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1368
1369 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1372 }
1373
1374 // These might be better off as horizontal vector ops.
1379 }
1380
1381 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1382 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1385 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1389 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1395
1397 }
1398
1399 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1400 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1401 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1402 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1403 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1404 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1405 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1406 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1407
1411
1412 // FIXME: Do we need to handle scalar-to-vector here?
1413 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1414 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1415
1416 // We directly match byte blends in the backend as they match the VSELECT
1417 // condition form.
1419
1420 // SSE41 brings specific instructions for doing vector sign extend even in
1421 // cases where we don't have SRA.
1422 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1425 }
1426
1427 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1428 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1429 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1430 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1431 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1432 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1433 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1434 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1435 }
1436
1437 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1438 // We need to scalarize v4i64->v432 uint_to_fp using cvtsi2ss, but we can
1439 // do the pre and post work in the vector domain.
1442 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1443 // so that DAG combine doesn't try to turn it into uint_to_fp.
1446 }
1447 }
1448
1449 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1451 }
1452
1453 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1454 for (MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1455 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1458 }
1459
1460 // XOP can efficiently perform BITREVERSE with VPPERM.
1461 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1463 }
1464
1465 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1466 bool HasInt256 = Subtarget.hasInt256();
1467
1468 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1469 : &X86::VR256RegClass);
1470 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1471 : &X86::VR256RegClass);
1472 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1473 : &X86::VR256RegClass);
1474 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1475 : &X86::VR256RegClass);
1476 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1477 : &X86::VR256RegClass);
1478 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1479 : &X86::VR256RegClass);
1480 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1481 : &X86::VR256RegClass);
1482
1483 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1496
1498
1502
1508 }
1509
1510 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1511 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1512
1513 setOperationAction(ISD::AND, MVT::i256, Custom);
1514 setOperationAction(ISD::OR, MVT::i256, Custom);
1515 setOperationAction(ISD::XOR, MVT::i256, Custom);
1517
1518 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1519 // even though v8i16 is a legal type.
1520 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1521 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1522 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1523 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1527
1534
1546
1547 if (!Subtarget.hasAVX512())
1549
1550 // In the customized shift lowering, the legal v8i32/v4i64 cases
1551 // in AVX2 will be recognized.
1552 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1558 if (VT == MVT::v4i64) continue;
1563 }
1564
1565 // These types need custom splitting if their input is a 128-bit vector.
1570
1574 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1575 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1578
1579 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1583 }
1584
1589
1590 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1595
1596 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1597 // setcc all the way to isel and prefer SETGT in some isel patterns.
1600 }
1601
1602 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1603 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1608
1609 if (Subtarget.hasAnyFMA()) {
1610 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1611 MVT::v2f64, MVT::v4f64 }) {
1614 }
1615 }
1616
1617 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1618 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1619 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1620 }
1621
1622 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1623 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1624 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1625 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1626
1627 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1628 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1629 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1630 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1631 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1632 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1633 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1634 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1635
1636 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1637 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1638
1639 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1640 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1641 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1642 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1643 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1644
1645 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1646 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1647 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1648 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1649 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1650 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1651 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1652 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1657
1658 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1659 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1660 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1661 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1662 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1663 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1664 }
1665
1666 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1669 }
1670
1671 if (HasInt256) {
1672 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1673 // when we have a 256bit-wide blend with immediate.
1676
1677 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1678 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1679 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1680 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1681 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1682 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1683 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1684 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1685 }
1686 }
1687
1688 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1689 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1690 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1692 }
1693
1694 // Extract subvector is special because the value type
1695 // (result) is 128-bit but the source is 256-bit wide.
1696 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1697 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1699 }
1700
1701 // Custom lower several nodes for 256-bit types.
1702 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1703 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1713 }
1714 setF16Action(MVT::v16f16, Expand);
1715 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1716 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1718 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1719 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1720 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1721 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1722
1723 // Only PCLMUL required as we always unroll clmul vectors.
1724 if (Subtarget.hasPCLMUL()) {
1725 for (auto VT : {MVT::v8i32, MVT::v4i64}) {
1728 }
1729 }
1730
1731 if (HasInt256) {
1733
1734 // Custom legalize 2x32 to get a little better code.
1737
1738 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1739 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1741 }
1742
1743 if (Subtarget.hasGFNI()) {
1744 setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
1745 setOperationAction(ISD::CTTZ, MVT::v32i8, Custom);
1746 }
1747 }
1748
1749 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1750 Subtarget.hasF16C()) {
1751 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1754 }
1755 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1758 }
1759 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1760 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1761 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1762 }
1763 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1764 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1765 }
1766
1767 // This block controls legalization of the mask vector sizes that are
1768 // available with AVX512. 512-bit vectors are in a separate block controlled
1769 // by useAVX512Regs.
1770 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1771 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1772 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1773 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1774 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1775 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1776
1780
1781 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1782 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1783 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1784 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1785 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1786 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1787 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1788 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1796
1797 // There is no byte sized k-register load or store without AVX512DQ.
1798 if (!Subtarget.hasDQI()) {
1799 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1800 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1801 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1802 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1803
1808 }
1809
1810 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1811 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1815 }
1816
1817 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1819
1820 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1824
1831 }
1832
1833 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1835 }
1836 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1837 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1840 }
1841 }
1842
1843 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1844 // elements. 512-bits can be disabled based on prefer-vector-width and
1845 // required-vector-width function attributes.
1846 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1847 bool HasBWI = Subtarget.hasBWI();
1848
1849 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1850 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1851 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1852 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1853 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1854 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1855 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1856
1857 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1858 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1859 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1860 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1861 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1862 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1863 if (HasBWI)
1864 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1865 }
1866
1867 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1878 }
1879 setOperationAction(ISD::LRINT, MVT::v16f32,
1880 Subtarget.hasDQI() ? Legal : Custom);
1881 setOperationAction(ISD::LRINT, MVT::v8f64,
1882 Subtarget.hasDQI() ? Legal : Custom);
1883 if (Subtarget.hasDQI())
1884 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1885
1886 setOperationAction(ISD::AND, MVT::i512, Custom);
1887 setOperationAction(ISD::OR, MVT::i512, Custom);
1888 setOperationAction(ISD::XOR, MVT::i512, Custom);
1889 setOperationAction(ISD::ADD, MVT::i512, Custom);
1890 setOperationAction(ISD::SUB, MVT::i512, Custom);
1891 setOperationAction(ISD::SRL, MVT::i512, Custom);
1892 setOperationAction(ISD::SHL, MVT::i512, Custom);
1893 setOperationAction(ISD::SRA, MVT::i512, Custom);
1894 setOperationAction(ISD::FSHR, MVT::i512, Custom);
1895 setOperationAction(ISD::FSHL, MVT::i512, Custom);
1896 setOperationAction(ISD::FSHR, MVT::i256, Custom);
1897 setOperationAction(ISD::FSHL, MVT::i256, Custom);
1899
1900 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1905 }
1906
1907 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1912 }
1913
1920
1932
1933 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1934 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1935 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1936 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1937 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1938 if (HasBWI)
1939 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1940
1941 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1942 // to 512-bit rather than use the AVX2 instructions so that we can use
1943 // k-masks.
1944 if (!Subtarget.hasVLX()) {
1945 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1946 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1949 }
1950 }
1951
1953 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1954 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1964
1965 if (HasBWI) {
1966 // Extends from v64i1 masks to 512-bit vectors.
1970 }
1971
1972 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1985
1987 }
1988
1989 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1992 }
1993
1994 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1995 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1996 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1997 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1998
1999 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
2000 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
2001 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
2002 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
2003
2004 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
2005 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
2006 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
2007 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
2008 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
2009 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
2010 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
2011 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
2012
2013 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
2014 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
2015
2016 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
2026
2027 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
2028 // setcc all the way to isel and prefer SETGT in some isel patterns.
2031 }
2032
2033 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
2034 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
2039
2040 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
2049 }
2050
2051 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2052 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
2053 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
2055 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
2056 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
2057 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
2058 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2063 }
2064
2065 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2066 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2067 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2068 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2069 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2070 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2071
2072 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2076 setOperationAction(Opc, MVT::v8i64, Custom);
2077
2078 if (Subtarget.hasDQI())
2079 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2080
2081 if (Subtarget.hasCDI()) {
2082 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2083 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2085 }
2086 } // Subtarget.hasCDI()
2087
2088 if (Subtarget.hasVPOPCNTDQ()) {
2089 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2092 }
2093
2094 // Extract subvector is special because the value type
2095 // (result) is 256-bit but the source is 512-bit wide.
2096 // 128-bit was made Legal under AVX1.
2097 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2098 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2100
2101 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2102 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2112 }
2113 setF16Action(MVT::v32f16, Expand);
2118 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2119 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2120 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2121
2122 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2127 }
2128 if (HasBWI) {
2129 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2132 }
2133 } else {
2134 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2135 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2136 }
2137
2138 if (Subtarget.hasVBMI2()) {
2139 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2142 }
2143
2144 setOperationAction(ISD::ROTL, MVT::v32i16, Legal);
2145 setOperationAction(ISD::ROTR, MVT::v32i16, Legal);
2146 }
2147
2148 // Only PCLMUL required as we always unroll clmul vectors.
2149 if (Subtarget.hasPCLMUL()) {
2150 for (auto VT : {MVT::v16i32, MVT::v8i64}) {
2153 }
2154 }
2155
2156 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2157 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2159
2160 if (Subtarget.hasGFNI()) {
2161 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
2162 setOperationAction(ISD::CTTZ, MVT::v64i8, Custom);
2163 }
2164 }// useAVX512Regs
2165
2166 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2167 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2168 MVT::v4i64}) {
2171 }
2172
2173 setOperationAction(ISD::ROTL, MVT::v16i16, Legal);
2174 setOperationAction(ISD::ROTR, MVT::v16i16, Legal);
2175 setOperationAction(ISD::ROTL, MVT::v8i16, Legal);
2176 setOperationAction(ISD::ROTR, MVT::v8i16, Legal);
2177 }
2178
2179 // This block controls legalization for operations that don't have
2180 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2181 // narrower widths.
2182 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2183 for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32,
2184 MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16,
2185 MVT::v16f32, MVT::v8f64})
2187
2188 // These operations are handled on non-VLX by artificially widening in
2189 // isel patterns.
2193
2194 if (Subtarget.hasDQI()) {
2195 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2196 // v2f32 UINT_TO_FP is already custom under SSE2.
2199 "Unexpected operation action!");
2200 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2205 }
2206
2207 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2213 }
2214
2215 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2218 }
2219
2220 // Custom legalize 2x32 to get a little better code.
2223
2224 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2225 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2227
2228 if (Subtarget.hasDQI()) {
2232 setOperationAction(Opc, MVT::v2i64, Custom);
2233 setOperationAction(Opc, MVT::v4i64, Custom);
2234 }
2235 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2236 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2237 }
2238
2239 if (Subtarget.hasCDI()) {
2240 for (auto VT : {MVT::i256, MVT::i512}) {
2241 if (VT == MVT::i512 && !Subtarget.useAVX512Regs())
2242 continue;
2247 }
2248 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2250 }
2251 } // Subtarget.hasCDI()
2252
2253 if (Subtarget.hasVPOPCNTDQ()) {
2254 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64})
2257 }
2258
2259 // We can try to convert vectors to different sizes to leverage legal
2260 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2261 // then specialize to Legal below.
2262 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2263 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2264 MVT::v16i16, MVT::v8i8})
2266
2267 // Legal vpcompress depends on various AVX512 extensions.
2268 // Legal in AVX512F
2269 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2271
2272 // Legal in AVX512F + AVX512VL
2273 if (Subtarget.hasVLX())
2274 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2275 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2277
2278 // Legal in AVX512F + AVX512VBMI2
2279 if (Subtarget.hasVBMI2())
2280 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2282
2283 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2284 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2285 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2287 }
2288
2289 // This block controls legalization of v32i1/v64i1 which are available with
2290 // AVX512BW.
2291 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2292 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2293 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2294
2295 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2306 }
2307
2308 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2310
2311 // Extends from v32i1 masks to 256-bit vectors.
2315
2316 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2317 MVT::v16f16, MVT::v8f16}) {
2318 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2319 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2320 }
2321
2322 // These operations are handled on non-VLX by artificially widening in
2323 // isel patterns.
2324 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2325
2326 if (Subtarget.hasBITALG()) {
2327 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2329 }
2330 }
2331
2332 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2333 auto setGroup = [&] (MVT VT) {
2344
2357
2359
2362
2368
2374
2378 };
2379
2380 // AVX512_FP16 scalar operations
2381 setGroup(MVT::f16);
2399
2402
2403 if (Subtarget.useAVX512Regs()) {
2404 setGroup(MVT::v32f16);
2410 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2417
2422 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2424 MVT::v32i16);
2425 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2427 MVT::v32i16);
2428 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2430 MVT::v32i16);
2431 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2433 MVT::v32i16);
2434
2438
2439 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2440 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2441
2446 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2447 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2448 }
2449
2454
2455 if (Subtarget.hasVLX()) {
2456 setGroup(MVT::v8f16);
2457 setGroup(MVT::v16f16);
2458
2469
2476
2477 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2480
2484
2485 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2486 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2487 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2488 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2489
2490 // Need to custom widen these to prevent scalarization.
2491 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2492 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2493
2498
2503 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2504 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2505 }
2506 }
2507
2508 if (!Subtarget.useSoftFloat() &&
2509 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2510 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2511 : &X86::VR128RegClass);
2512 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2513 : &X86::VR256RegClass);
2514 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2515 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2516 // Set the operation action Custom to do the customization later.
2519 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2520 setF16Action(VT, Expand);
2521 if (!Subtarget.hasBF16())
2527 }
2528 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2529 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2530 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2531 }
2532 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2533 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2535 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2536 }
2537
2538 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2539 Subtarget.useAVX512Regs()) {
2540 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2541 setF16Action(MVT::v32bf16, Expand);
2542 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2543 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2544 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2546 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2550 }
2551
2552 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2553 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2554 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2555 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2556 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2557 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2558 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2559 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2560 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2561 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2564 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2576 }
2577 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2580 }
2581 }
2582
2583 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2584 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2585 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2586 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2587 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2588 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2589
2590 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2591 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2592 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2593 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2594 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2595
2596 if (Subtarget.hasBWI()) {
2597 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2598 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2599 }
2600
2601 if (Subtarget.hasFP16()) {
2602 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2611 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2620 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2625 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2630 }
2631 }
2632
2633 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2634 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2635 }
2636
2637 // We want to custom lower some of our intrinsics.
2641 if (!Subtarget.is64Bit()) {
2643 }
2644
2645 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2646 // handle type legalization for these operations here.
2647 //
2648 // FIXME: We really should do custom legalization for addition and
2649 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2650 // than generic legalization for 64-bit multiplication-with-overflow, though.
2651 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2652 if (VT == MVT::i64 && !Subtarget.is64Bit())
2653 continue;
2654 // Add/Sub/Mul with overflow operations are custom lowered.
2661
2662 // Support carry in as value rather than glue.
2668 }
2669
2670 // Combine sin / cos into _sincos_stret if it is available.
2673
2674 if (Subtarget.isTargetWin64()) {
2675 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2676 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2677 setOperationAction(ISD::SREM, MVT::i128, Custom);
2678 setOperationAction(ISD::UREM, MVT::i128, Custom);
2687 }
2688
2689 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2690 // is. We should promote the value to 64-bits to solve this.
2691 // This is what the CRT headers do - `fmodf` is an inline header
2692 // function casting to f64 and calling `fmod`.
2693 if (Subtarget.is32Bit() &&
2694 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2695 // clang-format off
2696 for (ISD::NodeType Op :
2714 // TODO: Add ISD:::STRICT_FMODF too once implemented.
2715 ISD::FMODF})
2716 if (isOperationExpandOrLibCall(Op, MVT::f32))
2717 setOperationAction(Op, MVT::f32, Promote);
2718 // clang-format on
2719
2720 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2721 // it, but it's just a wrapper around ldexp.
2722 if (Subtarget.isOSWindows()) {
2724 if (isOperationExpand(Op, MVT::f32))
2725 setOperationAction(Op, MVT::f32, Promote);
2726 }
2727
2728 setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
2729 setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
2730 setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
2731
2732 // We have target-specific dag combine patterns for the following nodes:
2743 ISD::SHL,
2744 ISD::SRA,
2745 ISD::SRL,
2746 ISD::OR,
2747 ISD::AND,
2753 ISD::ADD,
2756 ISD::FADD,
2757 ISD::FSUB,
2758 ISD::FNEG,
2759 ISD::FMA,
2763 ISD::SUB,
2764 ISD::LOAD,
2765 ISD::LRINT,
2767 ISD::MLOAD,
2768 ISD::STORE,
2785 ISD::SETCC,
2786 ISD::MUL,
2787 ISD::XOR,
2795 ISD::ROTL,
2796 ISD::ROTR,
2797 ISD::FSHL,
2798 ISD::FSHR,
2802
2803 computeRegisterProperties(Subtarget.getRegisterInfo());
2804
2805 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2807 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2809 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2811
2812 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2813 // that needs to be benchmarked and balanced with the potential use of vector
2814 // load/store types (PR33329, PR33914).
2817
2818 // Default loop alignment, which can be overridden by -align-loops.
2820
2821 // An out-of-order CPU can speculatively execute past a predictable branch,
2822 // but a conditional move could be stalled by an expensive earlier operation.
2823 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2824 EnableExtLdPromotion = true;
2826
2828
2829 // Default to having -disable-strictnode-mutation on
2830 IsStrictFPEnabled = true;
2831}
2832
2833// This has so far only been implemented for 64-bit MachO.
2835 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2836}
2837
2839 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2840 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2841}
2842
2844 const SDLoc &DL) const {
2845 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2846 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2847 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2848 return SDValue(Node, 0);
2849}
2850
2853 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2854 !Subtarget.hasBWI())
2855 return TypeSplitVector;
2856
2857 // Since v8f16 is legal, widen anything over v4f16.
2858 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2859 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2860 VT.getVectorElementType() == MVT::f16)
2861 return TypeSplitVector;
2862
2863 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2864 VT.getVectorElementType() != MVT::i1)
2865 return TypeWidenVector;
2866
2868}
2869
2871 FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo,
2872 const LibcallLoweringInfo *libcallLowering) const {
2873 return X86::createFastISel(funcInfo, libInfo, libcallLowering);
2874}
2875
2876//===----------------------------------------------------------------------===//
2877// Other Lowering Hooks
2878//===----------------------------------------------------------------------===//
2879
2881 bool AssumeSingleUse, bool IgnoreAlignment) {
2882 if (!AssumeSingleUse && !Op.hasOneUse())
2883 return false;
2884 if (!ISD::isNormalLoad(Op.getNode()))
2885 return false;
2886
2887 // If this is an unaligned vector, make sure the target supports folding it.
2888 auto *Ld = cast<LoadSDNode>(Op.getNode());
2889 if (!IgnoreAlignment && !Subtarget.hasAVX() &&
2890 !Subtarget.hasSSEUnalignedMem() && Ld->getValueSizeInBits(0) == 128 &&
2891 Ld->getAlign() < Align(16))
2892 return false;
2893
2894 // TODO: If this is a non-temporal load and the target has an instruction
2895 // for it, it should not be folded. See "useNonTemporalLoad()".
2896
2897 return true;
2898}
2899
2901 const X86Subtarget &Subtarget,
2902 bool AssumeSingleUse) {
2903 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2904 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2905 return false;
2906
2907 // We can not replace a wide volatile load with a broadcast-from-memory,
2908 // because that would narrow the load, which isn't legal for volatiles.
2909 auto *Ld = cast<LoadSDNode>(Op.getNode());
2910 return !Ld->isVolatile() ||
2911 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2912}
2913
2915 if (!Op.hasOneUse())
2916 return false;
2917 // Peek through (oneuse) bitcast users
2918 SDNode *User = *Op->user_begin();
2919 while (User->getOpcode() == ISD::BITCAST) {
2920 if (!User->hasOneUse())
2921 return false;
2922 User = *User->user_begin();
2923 }
2924 return ISD::isNormalStore(User);
2925}
2926
2928 if (Op.hasOneUse()) {
2929 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2930 return (ISD::ZERO_EXTEND == Opcode);
2931 }
2932 return false;
2933}
2934
2935// Return true if its cheap to bitcast this to a vector type.
2937 const X86Subtarget &Subtarget) {
2938 if (peekThroughBitcasts(Op).getValueType().isVector())
2939 return true;
2941 return true;
2942
2943 EVT VT = Op.getValueType();
2944 unsigned Opcode = Op.getOpcode();
2945 if ((VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512) &&
2946 DAG.getTargetLoweringInfo().getOperationAction(Opcode, VT) ==
2948 // Check for larger than legal scalar integer ops that might have been
2949 // custom lowered to vector instruction.
2950 switch (Opcode) {
2951 case ISD::SHL:
2952 case ISD::SRL:
2953 case ISD::SRA:
2954 return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget);
2955 case ISD::AND:
2956 case ISD::OR:
2957 case ISD::XOR:
2958 case ISD::ADD:
2959 case ISD::SUB:
2960 case ISD::FSHL:
2961 case ISD::FSHR:
2962 return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget) &&
2963 mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget);
2964 case ISD::SELECT:
2965 return mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget) &&
2966 mayFoldIntoVector(Op.getOperand(2), DAG, Subtarget);
2967 }
2968 }
2969 return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/true,
2970 /*IgnoreAlignment=*/true);
2971}
2972
2973static bool isLogicOp(unsigned Opcode) {
2974 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2975 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2976}
2977
/// Return true if \p Opcode is one of the X86-specific DAG opcodes that
/// represent a vector shuffle/permute/broadcast style operation.
/// NOTE(review): presumably callers use this to gate target-shuffle mask
/// decoding/combining — confirm against the callers, which are outside this
/// view. Variable-mask shuffles (see isTargetShuffleVariableMask below) are
/// also included here.
static bool isTargetShuffle(unsigned Opcode) {
  switch(Opcode) {
  default: return false;
  // Immediate-controlled blends and in-lane permutes.
  case X86ISD::BLENDI:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::SHUFP:
  case X86ISD::INSERTPS:
  case X86ISD::EXTRQI:
  case X86ISD::INSERTQI:
  case X86ISD::VALIGN:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  // Element-move / duplicate style ops.
  case X86ISD::MOVLHPS:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::MOVSH:
  case X86ISD::UNPCKL:
  case X86ISD::UNPCKH:
  case X86ISD::VBROADCAST:
  // Cross-lane and variable-mask permutes.
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::SHUF128:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMI:
  case X86ISD::VPPERM:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VZEXT_MOVL:
  // Compress/expand are also modelled as shuffles here.
  case X86ISD::COMPRESS:
  case X86ISD::EXPAND:
    return true;
  }
}
3020
/// Return true if \p Opcode is a target shuffle whose mask is not a fixed
/// immediate. NOTE(review): presumably "variable" means the mask is supplied
/// in a register/memory operand rather than encoded in the instruction —
/// confirm with the callers. Also returns true for the bitwise ops that the
/// shuffle combiner treats as 'faux' shuffles.
static bool isTargetShuffleVariableMask(unsigned Opcode) {
  switch (Opcode) {
  default: return false;
  // Target Shuffles.
  case X86ISD::PSHUFB:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERMIL2:
  case X86ISD::VPPERM:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
    return true;
  // 'Faux' Target Shuffles.
  case ISD::OR:
  case ISD::AND:
  case X86ISD::ANDNP:
    return true;
  }
}
3039
3042 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3044 int ReturnAddrIndex = FuncInfo->getRAIndex();
3045
3046 if (ReturnAddrIndex == 0) {
3047 // Set up a frame object for the return address.
3048 unsigned SlotSize = RegInfo->getSlotSize();
3049 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
3050 -(int64_t)SlotSize,
3051 false);
3052 FuncInfo->setRAIndex(ReturnAddrIndex);
3053 }
3054
3055 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
3056}
3057
3059 bool HasSymbolicDisplacement) {
3060 // Offset should fit into 32 bit immediate field.
3061 if (!isInt<32>(Offset))
3062 return false;
3063
3064 // If we don't have a symbolic displacement - we don't have any extra
3065 // restrictions.
3066 if (!HasSymbolicDisplacement)
3067 return true;
3068
3069 // We can fold large offsets in the large code model because we always use
3070 // 64-bit offsets.
3071 if (CM == CodeModel::Large)
3072 return true;
3073
3074 // For kernel code model we know that all objects reside in the negative half
3075 // of the 32-bit address space. We may not accept negative offsets, since they
3076 // may be just off and we may accept pretty large positive ones.
3077 if (CM == CodeModel::Kernel)
3078 return Offset >= 0;
3079
3080 // For other non-large code models we assume that latest small object is 16MB
3081 // before end of 31 bits boundary. We may also accept pretty large negative
3082 // constants knowing that all objects are in the positive half of address
3083 // space.
3084 return Offset < 16 * 1024 * 1024;
3085}
3086
/// Return true if the condition is a signed comparison operation.
/// Equality (E/NE) and the unsigned comparisons (B/A/BE/AE) return false;
/// the signed comparisons (G/GE/L/LE) return true. Any other condition code
/// is not an integer comparison and trips llvm_unreachable.
static bool isX86CCSigned(X86::CondCode X86CC) {
  switch (X86CC) {
  default:
    llvm_unreachable("Invalid integer condition!");
  // Equality and unsigned comparisons.
  case X86::COND_E:
  case X86::COND_NE:
  case X86::COND_B:
  case X86::COND_A:
  case X86::COND_BE:
  case X86::COND_AE:
    return false;
  // Signed comparisons.
  case X86::COND_G:
  case X86::COND_GE:
  case X86::COND_L:
  case X86::COND_LE:
    return true;
  }
}
3106
3108 switch (SetCCOpcode) {
3109 // clang-format off
3110 default: llvm_unreachable("Invalid integer condition!");
3111 case ISD::SETEQ: return X86::COND_E;
3112 case ISD::SETGT: return X86::COND_G;
3113 case ISD::SETGE: return X86::COND_GE;
3114 case ISD::SETLT: return X86::COND_L;
3115 case ISD::SETLE: return X86::COND_LE;
3116 case ISD::SETNE: return X86::COND_NE;
3117 case ISD::SETULT: return X86::COND_B;
3118 case ISD::SETUGT: return X86::COND_A;
3119 case ISD::SETULE: return X86::COND_BE;
3120 case ISD::SETUGE: return X86::COND_AE;
3121 // clang-format on
3122 }
3123}
3124
3125/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
3126/// condition code, returning the condition code and the LHS/RHS of the
3127/// comparison to make.
3129 bool isFP, SDValue &LHS, SDValue &RHS,
3130 SelectionDAG &DAG) {
3131 if (!isFP) {
3133 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
3134 // X > -1 -> X == 0, jump !sign.
3135 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3136 return X86::COND_NS;
3137 }
3138 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
3139 // X < 0 -> X == 0, jump on sign.
3140 return X86::COND_S;
3141 }
3142 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3143 // X >= 0 -> X == 0, jump on !sign.
3144 return X86::COND_NS;
3145 }
3146 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3147 // X < 1 -> X <= 0
3148 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3149 return X86::COND_LE;
3150 }
3151 }
3152
3153 return TranslateIntegerX86CC(SetCCOpcode);
3154 }
3155
3156 // First determine if it is required or is profitable to flip the operands.
3157
3158 // If LHS is a foldable load, but RHS is not, flip the condition.
3159 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3160 !ISD::isNON_EXTLoad(RHS.getNode())) {
3161 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3162 std::swap(LHS, RHS);
3163 }
3164
3165 switch (SetCCOpcode) {
3166 default: break;
3167 case ISD::SETOLT:
3168 case ISD::SETOLE:
3169 case ISD::SETUGT:
3170 case ISD::SETUGE:
3171 std::swap(LHS, RHS);
3172 break;
3173 }
3174
3175 // On a floating point condition, the flags are set as follows:
3176 // ZF PF CF op
3177 // 0 | 0 | 0 | X > Y
3178 // 0 | 0 | 1 | X < Y
3179 // 1 | 0 | 0 | X == Y
3180 // 1 | 1 | 1 | unordered
3181 switch (SetCCOpcode) {
3182 // clang-format off
3183 default: llvm_unreachable("Condcode should be pre-legalized away");
3184 case ISD::SETUEQ:
3185 case ISD::SETEQ: return X86::COND_E;
3186 case ISD::SETOLT: // flipped
3187 case ISD::SETOGT:
3188 case ISD::SETGT: return X86::COND_A;
3189 case ISD::SETOLE: // flipped
3190 case ISD::SETOGE:
3191 case ISD::SETGE: return X86::COND_AE;
3192 case ISD::SETUGT: // flipped
3193 case ISD::SETULT:
3194 case ISD::SETLT: return X86::COND_B;
3195 case ISD::SETUGE: // flipped
3196 case ISD::SETULE:
3197 case ISD::SETLE: return X86::COND_BE;
3198 case ISD::SETONE:
3199 case ISD::SETNE: return X86::COND_NE;
3200 case ISD::SETUO: return X86::COND_P;
3201 case ISD::SETO: return X86::COND_NP;
3202 case ISD::SETOEQ:
3203 case ISD::SETUNE: return X86::COND_INVALID;
3204 // clang-format on
3205 }
3206}
3207
/// Is there a floating point cmov for the specific X86 condition code?
/// Current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    // No FCMOVcc form exists for the remaining condition codes.
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}
3226
3227static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3228 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3229 VT.is512BitVector();
3230}
3231
3234 MachineFunction &MF, unsigned Intrinsic) const {
3235 IntrinsicInfo Info;
3237 Info.offset = 0;
3238
3240 if (!IntrData) {
3241 switch (Intrinsic) {
3242 case Intrinsic::x86_aesenc128kl:
3243 case Intrinsic::x86_aesdec128kl:
3244 Info.opc = ISD::INTRINSIC_W_CHAIN;
3245 Info.ptrVal = I.getArgOperand(1);
3246 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3247 Info.align = Align(1);
3248 Info.flags |= MachineMemOperand::MOLoad;
3249 Infos.push_back(Info);
3250 return;
3251 case Intrinsic::x86_aesenc256kl:
3252 case Intrinsic::x86_aesdec256kl:
3253 Info.opc = ISD::INTRINSIC_W_CHAIN;
3254 Info.ptrVal = I.getArgOperand(1);
3255 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3256 Info.align = Align(1);
3257 Info.flags |= MachineMemOperand::MOLoad;
3258 Infos.push_back(Info);
3259 return;
3260 case Intrinsic::x86_aesencwide128kl:
3261 case Intrinsic::x86_aesdecwide128kl:
3262 Info.opc = ISD::INTRINSIC_W_CHAIN;
3263 Info.ptrVal = I.getArgOperand(0);
3264 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3265 Info.align = Align(1);
3266 Info.flags |= MachineMemOperand::MOLoad;
3267 Infos.push_back(Info);
3268 return;
3269 case Intrinsic::x86_aesencwide256kl:
3270 case Intrinsic::x86_aesdecwide256kl:
3271 Info.opc = ISD::INTRINSIC_W_CHAIN;
3272 Info.ptrVal = I.getArgOperand(0);
3273 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3274 Info.align = Align(1);
3275 Info.flags |= MachineMemOperand::MOLoad;
3276 Infos.push_back(Info);
3277 return;
3278 case Intrinsic::x86_cmpccxadd32:
3279 case Intrinsic::x86_cmpccxadd64:
3280 case Intrinsic::x86_atomic_bts:
3281 case Intrinsic::x86_atomic_btc:
3282 case Intrinsic::x86_atomic_btr: {
3283 Info.opc = ISD::INTRINSIC_W_CHAIN;
3284 Info.ptrVal = I.getArgOperand(0);
3285 unsigned Size = I.getType()->getScalarSizeInBits();
3286 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3287 Info.align = Align(Size);
3290 Infos.push_back(Info);
3291 return;
3292 }
3293 case Intrinsic::x86_atomic_bts_rm:
3294 case Intrinsic::x86_atomic_btc_rm:
3295 case Intrinsic::x86_atomic_btr_rm: {
3296 Info.opc = ISD::INTRINSIC_W_CHAIN;
3297 Info.ptrVal = I.getArgOperand(0);
3298 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3299 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3300 Info.align = Align(Size);
3303 Infos.push_back(Info);
3304 return;
3305 }
3306 case Intrinsic::x86_aadd32:
3307 case Intrinsic::x86_aadd64:
3308 case Intrinsic::x86_aand32:
3309 case Intrinsic::x86_aand64:
3310 case Intrinsic::x86_aor32:
3311 case Intrinsic::x86_aor64:
3312 case Intrinsic::x86_axor32:
3313 case Intrinsic::x86_axor64:
3314 case Intrinsic::x86_atomic_add_cc:
3315 case Intrinsic::x86_atomic_sub_cc:
3316 case Intrinsic::x86_atomic_or_cc:
3317 case Intrinsic::x86_atomic_and_cc:
3318 case Intrinsic::x86_atomic_xor_cc: {
3319 Info.opc = ISD::INTRINSIC_W_CHAIN;
3320 Info.ptrVal = I.getArgOperand(0);
3321 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3322 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3323 Info.align = Align(Size);
3326 Infos.push_back(Info);
3327 return;
3328 }
3329 }
3330 return;
3331 }
3332
3333 switch (IntrData->Type) {
3336 case TRUNCATE_TO_MEM_VI32: {
3337 Info.opc = ISD::INTRINSIC_VOID;
3338 Info.ptrVal = I.getArgOperand(0);
3339 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3341 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3342 ScalarVT = MVT::i8;
3343 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3344 ScalarVT = MVT::i16;
3345 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3346 ScalarVT = MVT::i32;
3347
3348 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3349 Info.align = Align(1);
3350 Info.flags |= MachineMemOperand::MOStore;
3351 Infos.push_back(Info);
3352 return;
3353 }
3354 case GATHER:
3355 case GATHER_AVX2: {
3356 Info.opc = ISD::INTRINSIC_W_CHAIN;
3357 Info.ptrVal = nullptr;
3358 MVT DataVT = MVT::getVT(I.getType());
3359 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3360 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3361 IndexVT.getVectorNumElements());
3362 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3363 Info.align = Align(1);
3364 Info.flags |= MachineMemOperand::MOLoad;
3365 Infos.push_back(Info);
3366 return;
3367 }
3368 case SCATTER: {
3369 Info.opc = ISD::INTRINSIC_VOID;
3370 Info.ptrVal = nullptr;
3371 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3372 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3373 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3374 IndexVT.getVectorNumElements());
3375 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3376 Info.align = Align(1);
3377 Info.flags |= MachineMemOperand::MOStore;
3378 Infos.push_back(Info);
3379 return;
3380 }
3381 default:
3382 return;
3383 }
3384}
3385
3386/// Returns true if the target can instruction select the
3387/// specified FP immediate natively. If false, the legalizer will
3388/// materialize the FP immediate as a load from a constant pool.
3390 bool ForCodeSize) const {
3391 for (const APFloat &FPImm : LegalFPImmediates)
3392 if (Imm.bitwiseIsEqual(FPImm))
3393 return true;
3394 return false;
3395}
3396
3398 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3399 std::optional<unsigned> ByteOffset) const {
3400 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3401
3402 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3403 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3404 N = *N->user_begin();
3405 return N;
3406 };
3407
3408 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3409 // relocation target a movq or addq instruction: don't let the load shrink.
3410 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3411 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3412 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3413 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3414
3415 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
3416 // those uses are extracted directly into a store, then the extract + store
3417 // can be store-folded, or (4) any use will be used by legal full width
3418 // instruction. Then, it's probably not worth splitting the load.
3419 EVT VT = Load->getValueType(0);
3420 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3421 !SDValue(Load, 0).hasOneUse()) {
3422 bool FullWidthUse = false;
3423 bool AllExtractStores = true;
3424 for (SDUse &Use : Load->uses()) {
3425 // Skip uses of the chain value. Result 0 of the node is the load value.
3426 if (Use.getResNo() != 0)
3427 continue;
3428
3429 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3430
3431 // If this use is an extract + store, it's probably not worth splitting.
3432 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3433 all_of(User->uses(), [&](const SDUse &U) {
3434 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3435 return Inner->getOpcode() == ISD::STORE;
3436 }))
3437 continue;
3438
3439 AllExtractStores = false;
3440
3441 // If any use is a full width legal/target bin op, then assume its legal
3442 // and won't split.
3443 if (isBinOp(User->getOpcode()) &&
3444 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3445 User->getOpcode() > ISD::BUILTIN_OP_END))
3446 FullWidthUse = true;
3447 }
3448
3449 if (AllExtractStores)
3450 return false;
3451
3452 // If we have a user that uses the full vector width, then this use is
3453 // only worth splitting if the offset isn't 0 (to avoid an
3454 // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3455 if (FullWidthUse)
3456 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3457 }
3458
3459 return true;
3460}
3461
3462/// Returns true if it is beneficial to convert a load of a constant
3463/// to just the constant itself.
3465 Type *Ty) const {
3466 assert(Ty->isIntegerTy());
3467
3468 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3469 if (BitSize == 0 || BitSize > 64)
3470 return false;
3471 return true;
3472}
3473
3475 // If we are using XMM registers in the ABI and the condition of the select is
3476 // a floating-point compare and we have blendv or conditional move, then it is
3477 // cheaper to select instead of doing a cross-register move and creating a
3478 // load that depends on the compare result.
3479 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3480 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3481}
3482
3484 // TODO: It might be a win to ease or lift this restriction, but the generic
3485 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3486 if (VT.isVector() && Subtarget.hasAVX512())
3487 return false;
3488
3489 return true;
3490}
3491
3493 SDValue C) const {
3494 // TODO: We handle scalars using custom code, but generic combining could make
3495 // that unnecessary.
3496 APInt MulC;
3497 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3498 return false;
3499
3500 if (VT.isVector() && VT.getScalarSizeInBits() == 8) {
3501 // Check whether a vXi8 multiply can be decomposed into two shifts
3502 // (decomposing 2^m ± 2^n as 2^(a+b) ± 2^b). Similar to
3503 // DAGCombiner::visitMUL, consider the constant `2` decomposable as
3504 // (2^0 + 1).
3505 APInt ShiftedMulC = MulC.abs();
3506 unsigned TZeros = ShiftedMulC == 2 ? 0 : ShiftedMulC.countr_zero();
3507 ShiftedMulC.lshrInPlace(TZeros);
3508 if ((ShiftedMulC - 1).isPowerOf2() || (ShiftedMulC + 1).isPowerOf2())
3509 return true;
3510 }
3511
3512 // Find the type this will be legalized too. Otherwise we might prematurely
3513 // convert this to shl+add/sub and then still have to type legalize those ops.
3514 // Another choice would be to defer the decision for illegal types until
3515 // after type legalization. But constant splat vectors of i64 can't make it
3516 // through type legalization on 32-bit targets so we would need to special
3517 // case vXi64.
3518 while (getTypeAction(Context, VT) != TypeLegal)
3519 VT = getTypeToTransformTo(Context, VT);
3520
3521 // If vector multiply is legal, assume that's faster than shl + add/sub.
3522 // Multiply is a complex op with higher latency and lower throughput in
3523 // most implementations, sub-vXi32 vector multiplies are always fast,
3524 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3525 // is always going to be slow.
3526 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3527 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3528 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3529 return false;
3530
3531 // shl+add, shl+sub, shl+add+neg
3532 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3533 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3534}
3535
3537 unsigned Index) const {
3539 return false;
3540
3541 // Mask vectors support all subregister combinations and operations that
3542 // extract half of vector.
3543 if (ResVT.getVectorElementType() == MVT::i1)
3544 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3545 (Index == ResVT.getVectorNumElements()));
3546
3547 return (Index % ResVT.getVectorNumElements()) == 0;
3548}
3549
3551 unsigned Opc = VecOp.getOpcode();
3552
3553 // Assume target opcodes can't be scalarized.
3554 // TODO - do we have any exceptions?
3555 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3556 return false;
3557
3558 // If the vector op is not supported, try to convert to scalar.
3559 EVT VecVT = VecOp.getValueType();
3561 return true;
3562
3563 // If the vector op is supported, but the scalar op is not, the transform may
3564 // not be worthwhile.
3565 EVT ScalarVT = VecVT.getScalarType();
3566 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3567}
3568
3570 bool) const {
3571 // TODO: Allow vectors?
3572 if (VT.isVector())
3573 return false;
3574 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3575}
3576
3578 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3579 // i32/i64 or can rely on BSF passthrough value.
3580 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3581 Subtarget.hasBitScanPassThrough() ||
3582 (!Ty->isVectorTy() &&
3583 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3584}
3585
3587 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3588 // passthrough value.
3589 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3590 Subtarget.hasBitScanPassThrough();
3591}
3592
3594 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3595 // expensive than a straight movsd. On the other hand, it's important to
3596 // shrink long double fp constant since fldt is very slow.
3597 return !Subtarget.hasSSE2() || VT == MVT::f80;
3598}
3599
3601 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3602 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3603}
3604
3606 const SelectionDAG &DAG,
3607 const MachineMemOperand &MMO) const {
3608 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3609 BitcastVT.getVectorElementType() == MVT::i1)
3610 return false;
3611
3612 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3613 return false;
3614
3615 if (LoadVT.isVector() && BitcastVT.isVector()) {
3616 // If both types are legal vectors, it's always ok to convert them.
3617 // Don't convert to an illegal type.
3618 if (isTypeLegal(LoadVT))
3619 return isTypeLegal(BitcastVT);
3620 }
3621
3622 // If we have a large vector type (even if illegal), don't bitcast to large
3623 // (illegal) scalar types. Better to load fewer vectors and extract.
3624 if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() &&
3625 BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0)
3626 return false;
3627
3628 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3629}
3630
3632 const MachineFunction &MF) const {
3633 // Do not merge to float value size (128 bytes) if no implicit
3634 // float attribute is set.
3635 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3636
3637 if (NoFloat) {
3638 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3639 return (MemVT.getSizeInBits() <= MaxIntSize);
3640 }
3641 // Make sure we don't merge greater than our preferred vector
3642 // width.
3643 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3644 return false;
3645
3646 return true;
3647}
3648
3650 return Subtarget.hasFastLZCNT();
3651}
3652
3654 const Instruction &AndI) const {
3655 return true;
3656}
3657
3659 // Scalar integer and-not compares are efficiently handled by NOT+TEST (or
3660 // BMI ANDN).
3661 return Y.getValueType().isScalarInteger();
3662}
3663
3665 EVT VT = Y.getValueType();
3666
3667 if (!VT.isVector()) {
3668 if (!Subtarget.hasBMI())
3669 return false;
3670
3671 // There are only 32-bit and 64-bit forms for 'andn'.
3672 if (VT != MVT::i32 && VT != MVT::i64)
3673 return false;
3674 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3675 }
3676
3677 // Vector.
3678 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3679 return false;
3680
3681 if (VT == MVT::v4i32)
3682 return true;
3683
3684 return Subtarget.hasSSE2();
3685}
3686
3688 return X.getValueType().isScalarInteger(); // 'bt'
3689}
3690
3694 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3695 SelectionDAG &DAG) const {
3696 // Does baseline recommend not to perform the fold by default?
3698 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3699 return false;
3700 // For scalars this transform is always beneficial.
3701 if (X.getValueType().isScalarInteger())
3702 return true;
3703 // If all the shift amounts are identical, then transform is beneficial even
3704 // with rudimentary SSE2 shifts.
3705 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3706 return true;
3707 // If we have AVX2 with its powerful shift operations, then it's also good.
3708 if (Subtarget.hasAVX2())
3709 return true;
3710 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3711 return NewShiftOpcode == ISD::SHL;
3712}
3713
3715 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3716 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3717 if (!VT.isInteger())
3718 return ShiftOpc;
3719
3720 bool PreferRotate = false;
3721 if (VT.isVector()) {
3722 // For vectors, if we have rotate instruction support, then it's definitely
3723 // best. Otherwise it's not clear which is best, so just don't make changes.
3724 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3725 VT.getScalarType() == MVT::i64);
3726 } else {
3727 // For scalar, if we have bmi prefer rotate for rorx. Otherwise prefer
3728 // rotate unless we have a zext mask+shr.
3729 PreferRotate = Subtarget.hasBMI2();
3730 if (!PreferRotate) {
3731 unsigned MaskBits =
3732 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3733 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3734 }
3735 }
3736
3737 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3738 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3739
3740 if (PreferRotate && MayTransformRotate)
3741 return ISD::ROTL;
3742
3743 // If vector we don't really get much benefit swapping around constants.
3744 // Maybe we could check if the DAG has the flipped node already in the
3745 // future.
3746 if (VT.isVector())
3747 return ShiftOpc;
3748
3749 // See if the beneficial to swap shift type.
3750 if (ShiftOpc == ISD::SHL) {
3751 // If the current setup has imm64 mask, then inverse will have
3752 // at least imm32 mask (or be zext i32 -> i64).
3753 if (VT == MVT::i64)
3754 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3755 : ShiftOpc;
3756
3757 // We can only benefit if req at least 7-bit for the mask. We
3758 // don't want to replace shl of 1,2,3 as they can be implemented
3759 // with lea/add.
3760 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3761 }
3762
3763 if (VT == MVT::i64)
3764 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3765 // extremely efficient.
3766 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3767
3768 // Keep small shifts as shl so we can generate add/lea.
3769 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3770 }
3771
3772 // We prefer rotate for vectors of if we won't get a zext mask with SRL
3773 // (PreferRotate will be set in the latter case).
3774 if (PreferRotate || !MayTransformRotate || VT.isVector())
3775 return ShiftOpc;
3776
3777 // Non-vector type and we have a zext mask with SRL.
3778 return ISD::SRL;
3779}
3780
3783 const Value *Lhs,
3784 const Value *Rhs) const {
3785 using namespace llvm::PatternMatch;
3786 int BaseCost = BrMergingBaseCostThresh.getValue();
3787 // With CCMP, branches can be merged in a more efficient way.
3788 if (BaseCost >= 0 && Subtarget.hasCCMP())
3789 BaseCost += BrMergingCcmpBias;
3790 // a == b && a == c is a fast pattern on x86.
3791 if (BaseCost >= 0 && Opc == Instruction::And &&
3794 BaseCost += 1;
3795
3796 // For OR conditions with EQ comparisons, prefer splitting into branches
3797 // (unless CCMP is available). OR+EQ cannot be optimized via bitwise ops,
3798 // unlike OR+NE which becomes (P|Q)!=0. Similarly, don't split signed
3799 // comparisons (SLT, SGT) that can be optimized.
3800 if (BaseCost >= 0 && !Subtarget.hasCCMP() && Opc == Instruction::Or &&
3803 return {-1, -1, -1};
3804
3805 return {BaseCost, BrMergingLikelyBias.getValue(),
3806 BrMergingUnlikelyBias.getValue()};
3807}
3808
3810 return N->getOpcode() != ISD::FP_EXTEND;
3811}
3812
3814 const SDNode *N) const {
3815 assert(((N->getOpcode() == ISD::SHL &&
3816 N->getOperand(0).getOpcode() == ISD::SRL) ||
3817 (N->getOpcode() == ISD::SRL &&
3818 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3819 "Expected shift-shift mask");
3820 // TODO: Should we always create i64 masks? Or only folded immediates?
3821 EVT VT = N->getValueType(0);
3822 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3823 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3824 // Only fold if the shift values are equal - so it folds to AND.
3825 // TODO - we should fold if either is a non-uniform vector but we don't do
3826 // the fold for non-splats yet.
3827 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3828 }
3830}
3831
3833 EVT VT = Y.getValueType();
3834
3835 // For vectors, we don't have a preference, but we probably want a mask.
3836 if (VT.isVector())
3837 return false;
3838
3839 unsigned MaxWidth = Subtarget.is64Bit() ? 64 : 32;
3840 return VT.getScalarSizeInBits() <= MaxWidth;
3841}
3842
3845 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3847 !Subtarget.isOSWindows())
3850 ExpansionFactor);
3851}
3852
3854 // Any legal vector type can be splatted more efficiently than
3855 // loading/spilling from memory.
3856 return isTypeLegal(VT);
3857}
3858
  // Prefer a plain legal integer type of the requested width when available.
  MVT VT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(VT))
    return VT;

  // PMOVMSKB can handle this.
  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
    return MVT::v16i8;

  // VPMOVMSKB can handle this.
  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
    return MVT::v32i8;

  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().

  // NOTE(review): the final fallback return statement is missing from this
  // extraction - confirm against upstream.
}
3878
3879/// Val is the undef sentinel value or equal to the specified value.
3880static bool isUndefOrEqual(int Val, int CmpVal) {
3881 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3882}
3883
3884/// Return true if every element in Mask is the undef sentinel value or equal to
3885/// the specified value.
3886static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3887 return llvm::all_of(Mask, [CmpVal](int M) {
3888 return (M == SM_SentinelUndef) || (M == CmpVal);
3889 });
3890}
3891
3892/// Return true if every element in Mask, beginning from position Pos and ending
3893/// in Pos+Size is the undef sentinel value or equal to the specified value.
3894static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3895 unsigned Size) {
3896 return llvm::all_of(Mask.slice(Pos, Size),
3897 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3898}
3899
3900/// Val is either the undef or zero sentinel value.
3901static bool isUndefOrZero(int Val) {
3902 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3903}
3904
3905/// Return true if every element in Mask, beginning from position Pos and ending
3906/// in Pos+Size is the undef sentinel value.
3907static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3908 return llvm::all_of(Mask.slice(Pos, Size), equal_to(SM_SentinelUndef));
3909}
3910
3911/// Return true if the mask creates a vector whose lower half is undefined.
3913 unsigned NumElts = Mask.size();
3914 return isUndefInRange(Mask, 0, NumElts / 2);
3915}
3916
3917/// Return true if the mask creates a vector whose upper half is undefined.
3919 unsigned NumElts = Mask.size();
3920 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3921}
3922
/// Return true if Val falls within the half-open range [Low, Hi).
static bool isInRange(int Val, int Low, int Hi) {
  return (Val >= Low && Val < Hi);
}
3927
/// Return true if the value of any element in Mask falls within the specified
/// half-open range [Low, Hi).
static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
  return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
}
3933
3934/// Return true if the value of any element in Mask is the zero sentinel value.
3935static bool isAnyZero(ArrayRef<int> Mask) {
3936 return llvm::any_of(Mask, equal_to(SM_SentinelZero));
3937}
3938
/// Return true if Val is undef or if its value falls within the
/// specified half-open range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
}
3944
/// Return true if every element in Mask is undef or if its value
/// falls within the specified half-open range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  return llvm::all_of(
      Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
}
3951
/// Return true if Val is undef, zero or if its value falls within the
/// specified half-open range [Low, Hi).
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
  return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
}
3957
/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified half-open range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  return llvm::all_of(
      Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
}
3964
3965/// Return true if every element in Mask, is an in-place blend/select mask or is
3966/// undef.
3967[[maybe_unused]] static bool isBlendOrUndef(ArrayRef<int> Mask) {
3968 unsigned NumElts = Mask.size();
3969 for (auto [I, M] : enumerate(Mask))
3970 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3971 return false;
3972 return true;
3973}
3974
3975/// Return true if every element in Mask, beginning
3976/// from position Pos and ending in Pos + Size, falls within the specified
3977/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3978static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3979 unsigned Size, int Low, int Step = 1) {
3980 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3981 if (!isUndefOrEqual(Mask[i], Low))
3982 return false;
3983 return true;
3984}
3985
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, matches the sequence
/// Low, Low+Step, ..., or is undef or is zero.
                                         unsigned Size, int Low,
                                         int Step = 1) {
  // Walk the sub-range, advancing the expected value by Step each lane.
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
    if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
      return false;
  return true;
}
3997
3998/// Return true if every element in Mask, beginning
3999/// from position Pos and ending in Pos+Size is undef or is zero.
4000static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4001 unsigned Size) {
4002 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
4003}
4004
/// Return true if every element of a single input is referenced by the shuffle
/// mask. i.e. it just permutes them all.
  unsigned NumElts = Mask.size();
  APInt DemandedElts = APInt::getZero(NumElts);
  // Record every in-bounds source element the mask references.
  for (int M : Mask)
    if (isInRange(M, 0, NumElts))
      DemandedElts.setBit(M);
  // A complete permute references all NumElts source elements.
  return DemandedElts.isAllOnes();
}
4015
/// Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
                                    SmallVectorImpl<int> &WidenedMask) {
  // Process the mask two (narrow) elements at a time; each adjacent pair must
  // collapse into a single wide element or the widening fails.
  WidenedMask.assign(Mask.size() / 2, 0);
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    int M0 = Mask[i];
    int M1 = Mask[i + 1];

    // If both elements are undef, its trivial.
    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
      WidenedMask[i / 2] = SM_SentinelUndef;
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
      WidenedMask[i / 2] = M1 / 2;
      continue;
    }
    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
        WidenedMask[i / 2] = SM_SentinelZero;
        continue;
      }
      // Mixing a zeroed lane with a real element cannot be widened.
      return false;
    }

    // Finally check if the two mask values are adjacent and aligned with
    // a pair.
    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }
  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");

  return true;
}
4074
                                    const APInt &Zeroable,
                                    bool V2IsZero,
                                    SmallVectorImpl<int> &WidenedMask) {
  // Create an alternative mask with info about zeroable elements.
  // Here we do not set undef elements as zeroable.
  SmallVector<int, 64> ZeroableMask(Mask);
  if (V2IsZero) {
    assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
    // Mark every zeroable (but not undef) lane as an explicit zero so the
    // pairing logic in the base overload can widen through it.
    for (int i = 0, Size = Mask.size(); i != Size; ++i)
      if (Mask[i] != SM_SentinelUndef && Zeroable[i])
        ZeroableMask[i] = SM_SentinelZero;
  }
  return canWidenShuffleElements(ZeroableMask, WidenedMask);
}
4090
  // Convenience overload: only the yes/no answer is needed.
  SmallVector<int, 32> WidenedMask;
  return canWidenShuffleElements(Mask, WidenedMask);
}
4095
4096// Attempt to narrow/widen shuffle mask until it matches the target number of
4097// elements.
4098static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
4099 SmallVectorImpl<int> &ScaledMask) {
4100 unsigned NumSrcElts = Mask.size();
4101 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
4102 "Illegal shuffle scale factor");
4103
4104 // Narrowing is guaranteed to work.
4105 if (NumDstElts >= NumSrcElts) {
4106 int Scale = NumDstElts / NumSrcElts;
4107 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
4108 return true;
4109 }
4110
4111 // We have to repeat the widening until we reach the target size, but we can
4112 // split out the first widening as it sets up ScaledMask for us.
4113 if (canWidenShuffleElements(Mask, ScaledMask)) {
4114 while (ScaledMask.size() > NumDstElts) {
4115 SmallVector<int, 16> WidenedMask;
4116 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
4117 return false;
4118 ScaledMask = std::move(WidenedMask);
4119 }
4120 return true;
4121 }
4122
4123 return false;
4124}
4125
4126static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
4127 SmallVector<int, 32> ScaledMask;
4128 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
4129}
4130
4131// Helper to grow the shuffle mask for a larger value type.
4132// NOTE: This is different to scaleShuffleElements which is a same size type.
4133static void growShuffleMask(ArrayRef<int> SrcMask,
4134 SmallVectorImpl<int> &DstMask,
4135 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
4136 assert(DstMask.empty() && "Expected an empty shuffle mas");
4137 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
4138 unsigned Scale = DstSizeInBits / SrcSizeInBits;
4139 unsigned NumSrcElts = SrcMask.size();
4140 DstMask.assign(SrcMask.begin(), SrcMask.end());
4141 for (int &M : DstMask) {
4142 if (M < 0)
4143 continue;
4144 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
4145 }
4146 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
4147}
4148
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
  // Covers integer 0 and FP +0.0 (isNullFPConstant does not match -0.0).
  return isNullConstant(Elt) || isNullFPConstant(Elt);
}
4153
// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in the 32-bit mode.
                               const SDLoc &dl, bool IsMask = false) {

  bool Split = false;

  // Without a legal i64 type, build a v(2N x i32) vector and bitcast back.
  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0; i < NumElts; ++i) {
    // Negative values denote undef lanes only in mask mode.
    bool IsUndef = Values[i] < 0 && IsMask;
    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
                     DAG.getConstant(Values[i], dl, EltVT);
    Ops.push_back(OpNode);
    if (Split)
      // Upper 32 bits of each split element are filled with zero.
      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
                    DAG.getConstant(0, dl, EltVT));
  }
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  if (Split)
    ConstsNode = DAG.getBitcast(VT, ConstsNode);
  return ConstsNode;
}
4186
static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");
  bool Split = false;

  // Without a legal i64 type, build a v(2N x i32) vector and bitcast back.
  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  MVT EltIntVT = EltVT.changeTypeToInteger();
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    // Undef lanes expand to one (or two, when split) UNDEF elements.
    if (Undefs[i]) {
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }
    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
    if (Split) {
      // Emit low then high 32-bit halves of the 64-bit constant.
      Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
    } else {
      // Build as an integer constant and bitcast to the (possibly FP) element.
      Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
    }
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}
4222
                              SelectionDAG &DAG, const SDLoc &dl) {
  // No-undef convenience wrapper: delegate with an all-zero undef mask.
  APInt Undefs = APInt::getZero(Bits.size());
  return getConstVector(Bits, Undefs, VT, DAG, dl);
}
4228
/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
          VT.getVectorElementType() == MVT::i1) &&
         "Unexpected vector type");

  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
  // type. This ensures they get CSE'd. But if the integer type is not
  // available, use a floating-point +0.0 instead.
  SDValue Vec;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
    // Pre-SSE2 only has FP vector registers, so zero via v4f32.
    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
  } else if (VT.isFloatingPoint() &&
    Vec = DAG.getConstantFP(+0.0, dl, VT);
  } else if (VT.getVectorElementType() == MVT::i1) {
    // AVX-512 mask registers: emit the zero mask directly.
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
           "Unexpected vector type");
    Vec = DAG.getConstant(0, dl, VT);
  } else {
    unsigned Num32BitElts = VT.getSizeInBits() / 32;
    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
  }
  return DAG.getBitcast(VT, Vec);
}
4256
4257// Helper to determine if the ops are all the extracted subvectors come from a
4258// single source. If we allow commute they don't have to be in order (Lo/Hi).
4259static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4260 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4261 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4262 LHS.getValueType() != RHS.getValueType() ||
4263 LHS.getOperand(0) != RHS.getOperand(0))
4264 return SDValue();
4265
4266 SDValue Src = LHS.getOperand(0);
4267 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4268 return SDValue();
4269
4270 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4271 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4272 RHS.getConstantOperandAPInt(1) == NumElts) ||
4273 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4274 LHS.getConstantOperandAPInt(1) == NumElts))
4275 return Src;
4276
4277 return SDValue();
4278}
4279
4280static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4281 const SDLoc &dl, unsigned vectorWidth) {
4282 EVT VT = Vec.getValueType();
4283 EVT ElVT = VT.getVectorElementType();
4284 unsigned ResultNumElts =
4285 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4286 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4287
4288 assert(ResultVT.getSizeInBits() == vectorWidth &&
4289 "Illegal subvector extraction");
4290
4291 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4292 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4293 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4294
4295 // This is the index of the first element of the vectorWidth-bit chunk
4296 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4297 IdxVal &= ~(ElemsPerChunk - 1);
4298
4299 // If the input is a buildvector just emit a smaller one.
4300 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4301 return DAG.getBuildVector(ResultVT, dl,
4302 Vec->ops().slice(IdxVal, ElemsPerChunk));
4303
4304 // Check if we're extracting the upper undef of a widening pattern.
4305 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4306 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4307 isNullConstant(Vec.getOperand(2)))
4308 return DAG.getUNDEF(ResultVT);
4309
4310 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4311}
4312
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
         Vec.getValueType().is512BitVector()) &&
         "Unexpected vector size!");
  // Delegate to the generic helper with a 128-bit chunk width.
  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}
4326
4327/// Generate a DAG to grab 256-bits from a 512-bit vector.
4328static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4329 SelectionDAG &DAG, const SDLoc &dl) {
4330 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4331 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4332}
4333
4334static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4335 SelectionDAG &DAG, const SDLoc &dl,
4336 unsigned vectorWidth) {
4337 assert((vectorWidth == 128 || vectorWidth == 256) &&
4338 "Unsupported vector width");
4339 // Inserting UNDEF is Result
4340 if (Vec.isUndef())
4341 return Result;
4342
4343 // Insert the relevant vectorWidth bits.
4344 EVT VT = Vec.getValueType();
4345 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4346 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4347
4348 // This is the index of the first element of the vectorWidth-bit chunk
4349 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4350 IdxVal &= ~(ElemsPerChunk - 1);
4351 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4352}
4353
4354/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4355/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4356/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4357/// simple superregister reference. Idx is an index in the 128 bits
4358/// we want. It need not be aligned to a 128-bit boundary. That makes
4359/// lowering INSERT_VECTOR_ELT operations easier.
4360static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4361 SelectionDAG &DAG, const SDLoc &dl) {
4362 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4363 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4364}
4365
/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
                              const SDLoc &dl) {
  EVT VecVT = Vec.getValueType();
         VecVT.getScalarType() == VT.getScalarType() &&
         "Unsupported vector widening type");
  // If the upper 128-bits of a build vector are already undef/zero, then try to
  // widen from the lower 128-bits.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
    unsigned NumSrcElts = VecVT.getVectorNumElements();
    ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
    if (all_of(Hi, [&](SDValue V) {
          return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
        }))
      Vec = extract128BitVector(Vec, 0, DAG, dl);
  }
  // Insert the source into the low lanes of a zero (or undef) wide vector.
  SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
                                : DAG.getUNDEF(VT);
  return DAG.getInsertSubvector(dl, Res, Vec, 0);
}
4389
4390/// Widen a vector to a larger size with the same scalar type, with the new
4391/// elements either zero or undef.
4392static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4393 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4394 const SDLoc &dl, unsigned WideSizeInBits) {
4395 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4396 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4397 "Unsupported vector widening type");
4398 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4399 MVT SVT = Vec.getSimpleValueType().getScalarType();
4400 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4401 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4402}
4403
4404/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4405/// and bitcast with integer types.
4406static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4407 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4408 unsigned NumElts = VT.getVectorNumElements();
4409 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4410 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4411 return VT;
4412}
4413
4414/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4415/// bitcast with integer types.
4416static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4417 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4418 const SDLoc &dl) {
4419 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4420 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4421}
4422
// Helper function to collect subvector ops that are concatenated together,
// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
// The subvectors in Ops are guaranteed to be the same type.
                             SelectionDAG &DAG) {
  assert(Ops.empty() && "Expected an empty ops vector");

  // Trivial case: an explicit concat already lists its subvectors.
  if (N->getOpcode() == ISD::CONCAT_VECTORS) {
    Ops.append(N->op_begin(), N->op_end());
    return true;
  }

  // Recognize two-subvector INSERT_SUBVECTOR patterns (Src twice as wide as
  // Sub, inserted at the low or high half).
  if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
    SDValue Src = N->getOperand(0);
    SDValue Sub = N->getOperand(1);
    const APInt &Idx = N->getConstantOperandAPInt(2);
    EVT VT = Src.getValueType();
    EVT SubVT = Sub.getValueType();

    if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
      // insert_subvector(undef, x, lo)
      if (Idx == 0 && Src.isUndef()) {
        Ops.push_back(Sub);
        Ops.push_back(DAG.getUNDEF(SubVT));
        return true;
      }
      if (Idx == (VT.getVectorNumElements() / 2)) {
        // insert_subvector(insert_subvector(undef, x, lo), y, hi)
        if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
            Src.getOperand(1).getValueType() == SubVT &&
            isNullConstant(Src.getOperand(2))) {
          // Attempt to recurse into inner (matching) concats.
          SDValue Lo = Src.getOperand(1);
          SDValue Hi = Sub;
          SmallVector<SDValue, 2> LoOps, HiOps;
          if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
              collectConcatOps(Hi.getNode(), HiOps, DAG) &&
              LoOps.size() == HiOps.size()) {
            Ops.append(LoOps);
            Ops.append(HiOps);
            return true;
          }
          Ops.push_back(Lo);
          Ops.push_back(Hi);
          return true;
        }
        // insert_subvector(x, extract_subvector(x, lo), hi)
        if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
            Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
          // Both halves are the same subvector.
          Ops.append(2, Sub);
          return true;
        }
        // insert_subvector(undef, x, hi)
        if (Src.isUndef()) {
          Ops.push_back(DAG.getUNDEF(SubVT));
          Ops.push_back(Sub);
          return true;
        }
      }
    }
  }

  // An extract of a concat is the aligned slice of its subvector list.
  if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
    EVT VT = N->getValueType(0);
    SDValue Src = N->getOperand(0);
    uint64_t Idx = N->getConstantOperandVal(1);

    // Collect all the subvectors from the source vector and slice off the
    // extraction.
    if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
        VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
        (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
        (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
      unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
      unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
      Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
      return true;
    }
  }

  assert(Ops.empty() && "Expected an empty ops vector");
  return false;
}
4507
// Helper to check if \p V can be split into subvectors and the upper subvectors
// are all undef. In which case return the lower subvector.
                                 SelectionDAG &DAG) {
  SmallVector<SDValue> SubOps;
  if (!collectConcatOps(V.getNode(), SubOps, DAG))
    return SDValue();

  unsigned NumSubOps = SubOps.size();
  unsigned HalfNumSubOps = NumSubOps / 2;
  assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");

  // Bail if any subvector in the upper half is (possibly) used.
  ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
  if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
    return SDValue();

  // Re-concatenate the lower half at half the original width.
  EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
  ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
}
4528
// Helper to check if we can access all the constituent subvectors without any
// extract ops.
// Succeeds only for values collectConcatOps can decompose (CONCAT_VECTORS,
// two-half INSERT_SUBVECTOR chains, or aligned extracts of such).
  return collectConcatOps(V.getNode(), Ops, DAG);
}
4535
static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
                                               const SDLoc &dl) {
  EVT VT = Op.getValueType();
  unsigned NumElems = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();
  assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
         "Can't split odd sized vector");

  // If the value is already a (possibly nested) concatenation, split along
  // the existing subvector boundaries instead of emitting extracts.
  if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
    assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
    unsigned HalfOps = SubOps.size() / 2;
    EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
    SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
    SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
    return std::make_pair(Lo, Hi);
  }

  // If this is a splat value (with no-undefs) then use the lower subvector,
  // which should be a free extraction.
  SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
  if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
    return std::make_pair(Lo, Lo);

  SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
  return std::make_pair(Lo, Hi);
}
4565
/// Break an operation into 2 half sized ops and then concatenate the results.
  unsigned NumOps = Op.getNumOperands();
  EVT VT = Op.getValueType();

  // Extract the LHS Lo/Hi vectors
  for (unsigned I = 0; I != NumOps; ++I) {
    SDValue SrcOp = Op.getOperand(I);
    // Scalar operands (e.g. shift amounts) are used unsplit by both halves.
    if (!SrcOp.getValueType().isVector()) {
      LoOps[I] = HiOps[I] = SrcOp;
      continue;
    }
    std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
  }

  // Rebuild the operation on each half and concatenate the results.
  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
                     DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
}
4589
/// Break an unary integer operation into 2 half sized ops and then
/// concatenate the result back.
                                 const SDLoc &dl) {
  // Make sure we only try to split 256/512-bit types to avoid creating
  // narrow vectors.
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert((Op.getOperand(0).getValueType().is256BitVector() ||
          Op.getOperand(0).getValueType().is512BitVector()) &&
         (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
  // Source and result must have matching element counts for a unary split.
  assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
             VT.getVectorNumElements() &&
         "Unexpected VTs!");
  return splitVectorOp(Op, DAG, dl);
}
4605
/// Break a binary integer operation into 2 half sized ops and then
/// concatenate the result back.
                                  const SDLoc &dl) {
  // Assert that all the types match.
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert(Op.getOperand(0).getValueType() == VT &&
         Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
  // Only 256/512-bit types should be split to avoid creating narrow vectors.
  assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
  return splitVectorOp(Op, DAG, dl);
}
4617
// Helper for splitting operands of an operation to legal target size and
// apply a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
// The argument Builder is a function that will be applied on each split part:
// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
template <typename F>
                                const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
                                F Builder, bool CheckBWI = true,
                                bool AllowAVX512 = true) {
  assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
  // Pick the widest usable register class and derive the split count from it.
  unsigned NumSubs = 1;
  if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
                      (!CheckBWI && Subtarget.useAVX512Regs()))) {
    if (VT.getSizeInBits() > 512) {
      NumSubs = VT.getSizeInBits() / 512;
      assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
    }
  } else if (Subtarget.hasAVX2()) {
    if (VT.getSizeInBits() > 256) {
      NumSubs = VT.getSizeInBits() / 256;
      assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
    }
  } else {
    if (VT.getSizeInBits() > 128) {
      NumSubs = VT.getSizeInBits() / 128;
      assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
    }
  }

  // No splitting required - apply the builder directly.
  if (NumSubs == 1)
    return Builder(DAG, DL, Ops);

  // Apply the builder to each aligned slice of every operand, then
  // re-concatenate the per-slice results.
  for (unsigned i = 0; i != NumSubs; ++i) {
    for (SDValue Op : Ops) {
      EVT OpVT = Op.getValueType();
      unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
      unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
      SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
    }
    Subs.push_back(Builder(DAG, DL, SubOps));
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
}
4666
// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
// targets.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
                             const X86Subtarget &Subtarget) {
  assert(Subtarget.hasAVX512() && "AVX512 target expected");
  MVT SVT = VT.getScalarType();

  // If we have a 32/64 splatted constant, splat it to DstTy to
  // encourage a foldable broadcast'd operand.
  auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
    unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
    // AVX512 broadcasts 32/64-bit operands.
    // TODO: Support float once getAVX512Node is used by fp-ops.
    if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
      return SDValue();
    // If we're not widening, don't bother if we're not bitcasting.
    if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
      return SDValue();
      APInt SplatValue, SplatUndef;
      unsigned SplatBitSize;
      bool HasAnyUndefs;
      // Only rebuild as a DstVT constant when the splat exactly fits the
      // element width with no undef lanes.
      if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                              HasAnyUndefs, OpEltSizeInBits) &&
          !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
        return DAG.getConstant(SplatValue, DL, DstVT);
    }
    return SDValue();
  };

  // Without VLX, sub-512-bit ops must be performed in 512-bit registers.
  bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());

  MVT DstVT = VT;
  if (Widen)
    DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());

  // Canonicalize src operands.
  SmallVector<SDValue> SrcOps(Ops);
  for (SDValue &Op : SrcOps) {
    MVT OpVT = Op.getSimpleValueType();
    // Just pass through scalar operands.
    if (!OpVT.isVector())
      continue;
    assert(OpVT == VT && "Vector type mismatch");

    if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
      Op = BroadcastOp;
      continue;
    }

    // Just widen the subvector by inserting into an undef wide vector.
    if (Widen)
      Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
  }

  SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);

  // Perform the 512-bit op then extract the bottom subvector.
  if (Widen)
    Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
  return Res;
}
4731
4732/// Insert i1-subvector to i1-vector.
4734 const X86Subtarget &Subtarget) {
4735
4736 SDLoc dl(Op);
4737 SDValue Vec = Op.getOperand(0);
4738 SDValue SubVec = Op.getOperand(1);
4739 SDValue Idx = Op.getOperand(2);
4740 unsigned IdxVal = Op.getConstantOperandVal(2);
4741
4742 // Inserting undef is a nop. We can just return the original vector.
4743 if (SubVec.isUndef())
4744 return Vec;
4745
4746 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4747 return Op;
4748
4749 MVT OpVT = Op.getSimpleValueType();
4750 unsigned NumElems = OpVT.getVectorNumElements();
4751 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4752
4753 // Extend to natively supported kshift.
4754 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4755
4756 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4757 // if necessary.
4758 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4759 // May need to promote to a legal type.
4760 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4761 DAG.getConstant(0, dl, WideOpVT),
4762 SubVec, Idx);
4763 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4764 }
4765
4766 MVT SubVecVT = SubVec.getSimpleValueType();
4767 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4768 assert(IdxVal + SubVecNumElems <= NumElems &&
4769 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4770 "Unexpected index value in INSERT_SUBVECTOR");
4771
4772 SDValue Undef = DAG.getUNDEF(WideOpVT);
4773
4774 if (IdxVal == 0) {
4775 // Zero lower bits of the Vec
4776 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4777 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4778 ZeroIdx);
4779 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4780 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4781 // Merge them together, SubVec should be zero extended.
4782 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4783 DAG.getConstant(0, dl, WideOpVT),
4784 SubVec, ZeroIdx);
4785 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4786 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4787 }
4788
4789 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4790 Undef, SubVec, ZeroIdx);
4791
4792 if (Vec.isUndef()) {
4793 assert(IdxVal != 0 && "Unexpected index");
4794 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4795 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4796 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4797 }
4798
4800 assert(IdxVal != 0 && "Unexpected index");
4801 // If upper elements of Vec are known undef, then just shift into place.
4802 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4803 [](SDValue V) { return V.isUndef(); })) {
4804 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4805 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4806 } else {
4807 NumElems = WideOpVT.getVectorNumElements();
4808 unsigned ShiftLeft = NumElems - SubVecNumElems;
4809 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4810 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4811 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4812 if (ShiftRight != 0)
4813 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4814 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4815 }
4816 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4817 }
4818
4819 // Simple case when we put subvector in the upper part
4820 if (IdxVal + SubVecNumElems == NumElems) {
4821 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4822 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4823 if (SubVecNumElems * 2 == NumElems) {
4824 // Special case, use legal zero extending insert_subvector. This allows
4825 // isel to optimize when bits are known zero.
4826 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4827 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4828 DAG.getConstant(0, dl, WideOpVT),
4829 Vec, ZeroIdx);
4830 } else {
4831 // Otherwise use explicit shifts to zero the bits.
4832 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4833 Undef, Vec, ZeroIdx);
4834 NumElems = WideOpVT.getVectorNumElements();
4835 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4836 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4837 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4838 }
4839 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4840 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4841 }
4842
4843 // Inserting into the middle is more complicated.
4844
4845 NumElems = WideOpVT.getVectorNumElements();
4846
4847 // Widen the vector if needed.
4848 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4849
4850 unsigned ShiftLeft = NumElems - SubVecNumElems;
4851 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4852
4853 // Do an optimization for the most frequently used types.
4854 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4855 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4856 Mask0.flipAllBits();
4857 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4858 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4859 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4860 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4861 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4862 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4863 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4864 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4865
4866 // Reduce to original width if needed.
4867 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4868 }
4869
4870 // Clear the upper bits of the subvector and move it to its insert position.
4871 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4872 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4873 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4874 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4875
4876 // Isolate the bits below the insertion point.
4877 unsigned LowShift = NumElems - IdxVal;
4878 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4879 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4880 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4881 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4882
4883 // Isolate the bits after the last inserted bit.
4884 unsigned HighShift = IdxVal + SubVecNumElems;
4885 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4886 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4887 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4888 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4889
4890 // Now OR all 3 pieces together.
4891 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4892 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4893
4894 // Reduce to original width if needed.
4895 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4896}
4897
4899 const SDLoc &dl) {
4900 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4901 EVT SubVT = V1.getValueType();
4902 EVT SubSVT = SubVT.getScalarType();
4903 unsigned SubNumElts = SubVT.getVectorNumElements();
4904 unsigned SubVectorWidth = SubVT.getSizeInBits();
4905 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4906 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4907 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4908}
4909
4910/// Returns a vector of specified type with all bits set.
4911/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4912/// Then bitcast to their original type, ensuring they get CSE'd.
4913static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4914 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4915 "Expected a 128/256/512-bit vector type");
4916 unsigned NumElts = VT.getSizeInBits() / 32;
4917 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4918 return DAG.getBitcast(VT, Vec);
4919}
4920
4921// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
4922static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
4923 switch (Opc) {
4924 case ISD::SHL:
4925 case X86ISD::VSHL:
4926 case X86ISD::VSHLI:
4927 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
4928 case ISD::SRL:
4929 case X86ISD::VSRL:
4930 case X86ISD::VSRLI:
4931 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
4932 case ISD::SRA:
4933 case X86ISD::VSRA:
4934 case X86ISD::VSRAI:
4935 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
4936 }
4937 llvm_unreachable("Unknown target vector shift node");
4938}
4939
4940/// Handle vector element shifts where the shift amount is a constant.
4941/// Takes immediate version of shift as input.
4942static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
4943 SDValue SrcOp, uint64_t ShiftAmt,
4944 SelectionDAG &DAG) {
4945 MVT ElementType = VT.getVectorElementType();
4946
4947 // Bitcast the source vector to the output type, this is mainly necessary for
4948 // vXi8/vXi64 shifts.
4949 if (VT != SrcOp.getSimpleValueType())
4950 SrcOp = DAG.getBitcast(VT, SrcOp);
4951
4952 // Fold this packed shift into its first operand if ShiftAmt is 0.
4953 if (ShiftAmt == 0)
4954 return SrcOp;
4955
4956 // Check for ShiftAmt >= element width
4957 if (ShiftAmt >= ElementType.getSizeInBits()) {
4958 if (Opc == X86ISD::VSRAI)
4959 ShiftAmt = ElementType.getSizeInBits() - 1;
4960 else
4961 return DAG.getConstant(0, dl, VT);
4962 }
4963
4964 assert(
4965 (Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) &&
4966 "Unknown target vector shift-by-constant node");
4967
4968 // Fold this packed vector shift into a build vector if SrcOp is a
4969 // vector of Constants or UNDEFs.
4971 unsigned ShiftOpc;
4972 switch (Opc) {
4973 default:
4974 llvm_unreachable("Unknown opcode!");
4975 case X86ISD::VSHLI:
4976 ShiftOpc = ISD::SHL;
4977 break;
4978 case X86ISD::VSRLI:
4979 ShiftOpc = ISD::SRL;
4980 break;
4981 case X86ISD::VSRAI:
4982 ShiftOpc = ISD::SRA;
4983 break;
4984 }
4985
4986 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
4987 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
4988 return C;
4989 }
4990
4991 return DAG.getNode(Opc, dl, VT, SrcOp,
4992 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
4993}
4994
4995/// Handle vector element shifts by a splat shift amount
4996static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
4997 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
4998 const X86Subtarget &Subtarget,
4999 SelectionDAG &DAG) {
5000 MVT AmtVT = ShAmt.getSimpleValueType();
5001 assert(AmtVT.isVector() && "Vector shift type mismatch");
5002 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
5003 "Illegal vector splat index");
5004
5005 // Move the splat element to the bottom element.
5006 if (ShAmtIdx != 0) {
5007 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
5008 Mask[0] = ShAmtIdx;
5009 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
5010 }
5011
5012 // Peek through any zext node if we can get back to a 128-bit source.
5013 if (AmtVT.getScalarSizeInBits() == 64 &&
5014 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
5016 ShAmt.getOperand(0).getValueType().isSimple() &&
5017 ShAmt.getOperand(0).getValueType().is128BitVector()) {
5018 ShAmt = ShAmt.getOperand(0);
5019 AmtVT = ShAmt.getSimpleValueType();
5020 }
5021
5022 // See if we can mask off the upper elements using the existing source node.
5023 // The shift uses the entire lower 64-bits of the amount vector, so no need to
5024 // do this for vXi64 types.
5025 bool IsMasked = false;
5026 if (AmtVT.getScalarSizeInBits() < 64) {
5027 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
5028 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
5029 // If the shift amount has come from a scalar, then zero-extend the scalar
5030 // before moving to the vector.
5031 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
5032 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
5033 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
5034 AmtVT = MVT::v4i32;
5035 IsMasked = true;
5036 } else if (ShAmt.getOpcode() == ISD::AND) {
5037 // See if the shift amount is already masked (e.g. for rotation modulo),
5038 // then we can zero-extend it by setting all the other mask elements to
5039 // zero.
5040 SmallVector<SDValue> MaskElts(
5041 AmtVT.getVectorNumElements(),
5042 DAG.getConstant(0, dl, AmtVT.getScalarType()));
5043 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
5044 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
5045 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
5046 {ShAmt.getOperand(1), Mask}))) {
5047 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
5048 IsMasked = true;
5049 }
5050 }
5051 }
5052
5053 // Extract if the shift amount vector is larger than 128-bits.
5054 if (AmtVT.getSizeInBits() > 128) {
5055 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
5056 AmtVT = ShAmt.getSimpleValueType();
5057 }
5058
5059 // Zero-extend bottom element to v2i64 vector type, either by extension or
5060 // shuffle masking.
5061 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
5062 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
5063 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
5064 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
5065 } else if (Subtarget.hasSSE41()) {
5066 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
5067 MVT::v2i64, ShAmt);
5068 } else {
5069 SDValue ByteShift = DAG.getTargetConstant(
5070 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
5071 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
5072 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
5073 ByteShift);
5074 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
5075 ByteShift);
5076 }
5077 }
5078
5079 // Change opcode to non-immediate version.
5081
5082 // The return type has to be a 128-bit type with the same element
5083 // type as the input type.
5084 MVT EltVT = VT.getVectorElementType();
5085 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
5086
5087 ShAmt = DAG.getBitcast(ShVT, ShAmt);
5088 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
5089}
5090
5091static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
5092 SDValue In, SelectionDAG &DAG) {
5093 EVT InVT = In.getValueType();
5094 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
5095
5096 // Canonicalize Opcode to general extension version.
5097 switch (Opcode) {
5098 case ISD::ANY_EXTEND:
5100 Opcode = ISD::ANY_EXTEND;
5101 break;
5102 case ISD::SIGN_EXTEND:
5104 Opcode = ISD::SIGN_EXTEND;
5105 break;
5106 case ISD::ZERO_EXTEND:
5108 Opcode = ISD::ZERO_EXTEND;
5109 break;
5110 default:
5111 llvm_unreachable("Unknown extension opcode");
5112 }
5113
5114 // For 256-bit vectors, we only need the lower (128-bit) input half.
5115 // For 512-bit vectors, we only need the lower input half or quarter.
5116 if (InVT.getSizeInBits() > 128) {
5117 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
5118 "Expected VTs to be the same size!");
5119 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5120 In = extractSubVector(In, 0, DAG, DL,
5121 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
5122 InVT = In.getValueType();
5123 }
5124
5125 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
5126 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
5127
5128 return DAG.getNode(Opcode, DL, VT, In);
5129}
5130
5131// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
5133 SDValue Mask, SelectionDAG &DAG) {
5134 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
5135 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
5136 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
5137}
5138
5140 bool Lo, bool Unary) {
5141 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
5142 "Illegal vector type to unpack");
5143 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5144 int NumElts = VT.getVectorNumElements();
5145 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5146 for (int i = 0; i < NumElts; ++i) {
5147 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
5148 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5149 Pos += (Unary ? 0 : NumElts * (i % 2));
5150 Pos += (Lo ? 0 : NumEltsInLane / 2);
5151 Mask.push_back(Pos);
5152 }
5153}
5154
5155/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
5156/// imposed by AVX and specific to the unary pattern. Example:
5157/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
5158/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
5160 bool Lo) {
5161 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5162 int NumElts = VT.getVectorNumElements();
5163 for (int i = 0; i < NumElts; ++i) {
5164 int Pos = i / 2;
5165 Pos += (Lo ? 0 : NumElts / 2);
5166 Mask.push_back(Pos);
5167 }
5168}
5169
5170// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
5171static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
5172 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
5175 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
5176 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
5177 int M = Mask[I];
5178 if (M < 0)
5179 continue;
5180 SDValue V = (M < NumElts) ? V1 : V2;
5181 if (V.isUndef())
5182 continue;
5183 Ops[I] = V.getOperand(M % NumElts);
5184 }
5185 return DAG.getBuildVector(VT, dl, Ops);
5186 }
5187
5188 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5189}
5190
5191/// Returns a vector_shuffle node for an unpackl operation.
5192static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
5193 SDValue V1, SDValue V2) {
5195 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5196 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
5197}
5198
5199/// Returns a vector_shuffle node for an unpackh operation.
5200static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
5201 SDValue V1, SDValue V2) {
5203 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5204 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
5205}
5206
5207/// Returns a node that packs the LHS + RHS nodes together at half width.
5208/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
5209/// TODO: Add subvector splitting if/when we have a need for it.
5210static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
5211 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
5212 bool PackHiHalf = false) {
5213 MVT OpVT = LHS.getSimpleValueType();
5214 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5215 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
5216 assert(OpVT == RHS.getSimpleValueType() &&
5217 VT.getSizeInBits() == OpVT.getSizeInBits() &&
5218 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
5219 "Unexpected PACK operand types");
5220 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
5221 "Unexpected PACK result type");
5222
5223 // Rely on vector shuffles for vXi64 -> vXi32 packing.
5224 if (EltSizeInBits == 32) {
5225 SmallVector<int> PackMask;
5226 int Offset = PackHiHalf ? 1 : 0;
5227 int NumElts = VT.getVectorNumElements();
5228 for (int I = 0; I != NumElts; I += 4) {
5229 PackMask.push_back(I + Offset);
5230 PackMask.push_back(I + Offset + 2);
5231 PackMask.push_back(I + Offset + NumElts);
5232 PackMask.push_back(I + Offset + NumElts + 2);
5233 }
5234 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
5235 DAG.getBitcast(VT, RHS), PackMask);
5236 }
5237
5238 // See if we already have sufficient leading bits for PACKSS/PACKUS.
5239 if (!PackHiHalf) {
5240 if (UsePackUS &&
5241 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
5242 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
5243 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
5244
5245 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
5246 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
5247 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
5248 }
5249
5250 // Fallback to sign/zero extending the requested half and pack.
5251 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
5252 if (UsePackUS) {
5253 if (PackHiHalf) {
5254 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
5255 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
5256 } else {
5257 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
5258 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
5259 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
5260 };
5261 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
5262 };
5263
5264 if (!PackHiHalf) {
5265 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
5266 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
5267 }
5268 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
5269 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
5270 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
5271}
5272
5273/// Return a vector_shuffle of the specified vector of zero or undef vector.
5274/// This produces a shuffle where the low element of V2 is swizzled into the
5275/// zero/undef vector, landing at element Idx.
5276/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5278 bool IsZero,
5279 const X86Subtarget &Subtarget,
5280 SelectionDAG &DAG) {
5281 MVT VT = V2.getSimpleValueType();
5282 SDValue V1 = IsZero
5283 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5284 int NumElems = VT.getVectorNumElements();
5285 SmallVector<int, 16> MaskVec(NumElems);
5286 for (int i = 0; i != NumElems; ++i)
5287 // If this is the insertion idx, put the low elt of V2 here.
5288 MaskVec[i] = (i == Idx) ? NumElems : i;
5289 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5290}
5291
5293 if (Ptr.getOpcode() == X86ISD::Wrapper ||
5294 Ptr.getOpcode() == X86ISD::WrapperRIP)
5295 Ptr = Ptr.getOperand(0);
5296 return dyn_cast<ConstantPoolSDNode>(Ptr);
5297}
5298
5299// TODO: Add support for non-zero offsets.
5302 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
5303 return nullptr;
5304 return CNode->getConstVal();
5305}
5306
5308 if (!Load || !ISD::isNormalLoad(Load))
5309 return nullptr;
5310 return getTargetConstantFromBasePtr(Load->getBasePtr());
5311}
5312
5317
5318const Constant *
5320 assert(LD && "Unexpected null LoadSDNode");
5321 return getTargetConstantFromNode(LD);
5322}
5323
// NOTE(review): the signature line for this predicate was lost during
// extraction (it would be the line preceding the comment below). From the
// body it appears to take an SDNode *N and an X86Subtarget reference and
// return bool; it reports whether N is an AVX512 VSELECT with a vXi1
// condition and an all-zeros false operand -- confirm name and signature
// against upstream LLVM before relying on this description.
5325 // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X)
5326 SDValue Cond = N->getOperand(0);
5327 SDValue RHS = N->getOperand(2);
5328 EVT CondVT = Cond.getValueType();
5329 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
5330 CondVT.getVectorElementType() == MVT::i1 &&
5331 ISD::isBuildVectorAllZeros(RHS.getNode());
5332 }
5333
5334// Extract raw constant bits from constant pools.
5335static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5336 APInt &UndefElts,
5337 SmallVectorImpl<APInt> &EltBits,
5338 bool AllowWholeUndefs = true,
5339 bool AllowPartialUndefs = false) {
5340 assert(EltBits.empty() && "Expected an empty EltBits vector");
5341
5343
5344 EVT VT = Op.getValueType();
5345 unsigned SizeInBits = VT.getSizeInBits();
5346 unsigned NumElts = SizeInBits / EltSizeInBits;
5347
5348 // Can't split constant.
5349 if ((SizeInBits % EltSizeInBits) != 0)
5350 return false;
5351
5352 // Bitcast a source array of element bits to the target size.
5353 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5354 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5355 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5356 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5357 "Constant bit sizes don't match");
5358
5359 // Don't split if we don't allow undef bits.
5360 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5361 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5362 return false;
5363
5364 // If we're already the right size, don't bother bitcasting.
5365 if (NumSrcElts == NumElts) {
5366 UndefElts = UndefSrcElts;
5367 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5368 return true;
5369 }
5370
5371 // Extract all the undef/constant element data and pack into single bitsets.
5372 APInt UndefBits(SizeInBits, 0);
5373 APInt MaskBits(SizeInBits, 0);
5374
5375 for (unsigned i = 0; i != NumSrcElts; ++i) {
5376 unsigned BitOffset = i * SrcEltSizeInBits;
5377 if (UndefSrcElts[i])
5378 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5379 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5380 }
5381
5382 // Split the undef/constant single bitset data into the target elements.
5383 UndefElts = APInt(NumElts, 0);
5384 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5385
5386 for (unsigned i = 0; i != NumElts; ++i) {
5387 unsigned BitOffset = i * EltSizeInBits;
5388 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5389
5390 // Only treat an element as UNDEF if all bits are UNDEF.
5391 if (UndefEltBits.isAllOnes()) {
5392 if (!AllowWholeUndefs)
5393 return false;
5394 UndefElts.setBit(i);
5395 continue;
5396 }
5397
5398 // If only some bits are UNDEF then treat them as zero (or bail if not
5399 // supported).
5400 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5401 return false;
5402
5403 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5404 }
5405 return true;
5406 };
5407
5408 // Collect constant bits and insert into mask/undef bit masks.
5409 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5410 unsigned UndefBitIndex) {
5411 if (!Cst)
5412 return false;
5413 if (isa<UndefValue>(Cst)) {
5414 Undefs.setBit(UndefBitIndex);
5415 return true;
5416 }
5417 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5418 Mask = CInt->getValue();
5419 return true;
5420 }
5421 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5422 Mask = CFP->getValueAPF().bitcastToAPInt();
5423 return true;
5424 }
5425 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5426 Type *Ty = CDS->getType();
5427 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5428 Type *EltTy = CDS->getElementType();
5429 bool IsInteger = EltTy->isIntegerTy();
5430 bool IsFP =
5431 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5432 if (!IsInteger && !IsFP)
5433 return false;
5434 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5435 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5436 if (IsInteger)
5437 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5438 else
5439 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5440 I * EltBits);
5441 return true;
5442 }
5443 return false;
5444 };
5445
5446 // Handle UNDEFs.
5447 if (Op.isUndef()) {
5448 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5449 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5450 return CastBitData(UndefSrcElts, SrcEltBits);
5451 }
5452
5453 // Extract scalar constant bits.
5454 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5455 APInt UndefSrcElts = APInt::getZero(1);
5456 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5457 return CastBitData(UndefSrcElts, SrcEltBits);
5458 }
5459 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5460 APInt UndefSrcElts = APInt::getZero(1);
5461 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5462 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5463 return CastBitData(UndefSrcElts, SrcEltBits);
5464 }
5465
5466 // Extract constant bits from build vector.
5467 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5468 BitVector Undefs;
5469 SmallVector<APInt> SrcEltBits;
5470 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5471 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5472 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5473 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5474 if (Undefs[I])
5475 UndefSrcElts.setBit(I);
5476 return CastBitData(UndefSrcElts, SrcEltBits);
5477 }
5478 }
5479
5480 // Extract constant bits from constant pool vector.
5481 if (auto *Cst = getTargetConstantFromNode(Op)) {
5482 Type *CstTy = Cst->getType();
5483 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5484 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5485 return false;
5486
5487 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5488 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5489 if ((SizeInBits % SrcEltSizeInBits) != 0)
5490 return false;
5491
5492 APInt UndefSrcElts(NumSrcElts, 0);
5493 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5494 for (unsigned i = 0; i != NumSrcElts; ++i)
5495 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5496 UndefSrcElts, i))
5497 return false;
5498
5499 return CastBitData(UndefSrcElts, SrcEltBits);
5500 }
5501
5502 // Extract constant bits from a broadcasted constant pool scalar.
5503 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5504 EltSizeInBits <= VT.getScalarSizeInBits()) {
5505 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5506 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5507 return false;
5508
5509 SDValue Ptr = MemIntr->getBasePtr();
5510 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5511 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5512 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5513
5514 APInt UndefSrcElts(NumSrcElts, 0);
5515 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5516 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5517 if (UndefSrcElts[0])
5518 UndefSrcElts.setBits(0, NumSrcElts);
5519 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5520 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5521 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5522 return CastBitData(UndefSrcElts, SrcEltBits);
5523 }
5524 }
5525 }
5526
5527 // Extract constant bits from a subvector broadcast.
5528 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5529 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5530 SDValue Ptr = MemIntr->getBasePtr();
5531 // The source constant may be larger than the subvector broadcast,
5532 // ensure we extract the correct subvector constants.
5533 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5534 Type *CstTy = Cst->getType();
5535 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5536 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5537 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5538 (SizeInBits % SubVecSizeInBits) != 0)
5539 return false;
5540 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5541 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5542 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5543 APInt UndefSubElts(NumSubElts, 0);
5544 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5545 APInt(CstEltSizeInBits, 0));
5546 for (unsigned i = 0; i != NumSubElts; ++i) {
5547 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5548 UndefSubElts, i))
5549 return false;
5550 for (unsigned j = 1; j != NumSubVecs; ++j)
5551 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5552 }
5553 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5554 UndefSubElts);
5555 return CastBitData(UndefSubElts, SubEltBits);
5556 }
5557 }
5558
5559 // Extract a rematerialized scalar constant insertion.
5560 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5561 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5562 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5563 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5564 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5565
5566 APInt UndefSrcElts(NumSrcElts, 0);
5567 SmallVector<APInt, 64> SrcEltBits;
5568 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5569 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5570 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5571 return CastBitData(UndefSrcElts, SrcEltBits);
5572 }
5573
5574 // Insert constant bits from a base and sub vector sources.
5575 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5576 // If bitcasts to larger elements we might lose track of undefs - don't
5577 // allow any to be safe.
5578 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5579 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5580
5581 APInt UndefSrcElts, UndefSubElts;
5582 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5583 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5584 UndefSubElts, EltSubBits,
5585 AllowWholeUndefs && AllowUndefs,
5586 AllowPartialUndefs && AllowUndefs) &&
5587 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5588 UndefSrcElts, EltSrcBits,
5589 AllowWholeUndefs && AllowUndefs,
5590 AllowPartialUndefs && AllowUndefs)) {
5591 unsigned BaseIdx = Op.getConstantOperandVal(2);
5592 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5593 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5594 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5595 return CastBitData(UndefSrcElts, EltSrcBits);
5596 }
5597 }
5598
5599 // Extract constant bits from a subvector's source.
5600 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5601 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5602 EltBits, AllowWholeUndefs,
5603 AllowPartialUndefs)) {
5604 EVT SrcVT = Op.getOperand(0).getValueType();
5605 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5606 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5607 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5608 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5609 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5610 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5611 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5612
5613 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5614 if ((BaseIdx + NumSubElts) != NumSrcElts)
5615 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5616 if (BaseIdx != 0)
5617 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5618 return true;
5619 }
5620
5621 // Extract constant bits from shuffle node sources.
5622 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5623 // TODO - support shuffle through bitcasts.
5624 if (EltSizeInBits != VT.getScalarSizeInBits())
5625 return false;
5626
5627 ArrayRef<int> Mask = SVN->getMask();
5628 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5629 llvm::any_of(Mask, [](int M) { return M < 0; }))
5630 return false;
5631
5632 APInt UndefElts0, UndefElts1;
5633 SmallVector<APInt, 32> EltBits0, EltBits1;
5634 if (isAnyInRange(Mask, 0, NumElts) &&
5635 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5636 UndefElts0, EltBits0, AllowWholeUndefs,
5637 AllowPartialUndefs))
5638 return false;
5639 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5640 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5641 UndefElts1, EltBits1, AllowWholeUndefs,
5642 AllowPartialUndefs))
5643 return false;
5644
5645 UndefElts = APInt::getZero(NumElts);
5646 for (int i = 0; i != (int)NumElts; ++i) {
5647 int M = Mask[i];
5648 if (M < 0) {
5649 UndefElts.setBit(i);
5650 EltBits.push_back(APInt::getZero(EltSizeInBits));
5651 } else if (M < (int)NumElts) {
5652 if (UndefElts0[M])
5653 UndefElts.setBit(i);
5654 EltBits.push_back(EltBits0[M]);
5655 } else {
5656 if (UndefElts1[M - NumElts])
5657 UndefElts.setBit(i);
5658 EltBits.push_back(EltBits1[M - NumElts]);
5659 }
5660 }
5661 return true;
5662 }
5663
5664 return false;
5665}
5666
5667namespace llvm {
5668namespace X86 {
5669bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5670 APInt UndefElts;
5671 SmallVector<APInt, 16> EltBits;
5673 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5674 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5675 int SplatIndex = -1;
5676 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5677 if (UndefElts[i])
5678 continue;
5679 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5680 SplatIndex = -1;
5681 break;
5682 }
5683 SplatIndex = i;
5684 }
5685 if (0 <= SplatIndex) {
5686 SplatVal = EltBits[SplatIndex];
5687 return true;
5688 }
5689 }
5690
5691 return false;
5692}
5693
5694int getRoundingModeX86(unsigned RM) {
5695 switch (static_cast<::llvm::RoundingMode>(RM)) {
5696 // clang-format off
5697 case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest;
5698 case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward;
5699 case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward;
5700 case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero;
5701 default: return X86::rmInvalid;
5702 // clang-format on
5703 }
5704}
5705
5706} // namespace X86
5707} // namespace llvm
5708
5710 unsigned MaskEltSizeInBits,
5712 APInt &UndefElts) {
5713 // Extract the raw target constant bits.
5714 SmallVector<APInt, 64> EltBits;
5715 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5716 EltBits, /* AllowWholeUndefs */ true,
5717 /* AllowPartialUndefs */ false))
5718 return false;
5719
5720 // Insert the extracted elements into the mask.
5721 for (const APInt &Elt : EltBits)
5722 RawMask.push_back(Elt.getZExtValue());
5723
5724 return true;
5725}
5726
5727static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts,
5728 bool AllowUndefs) {
5729 APInt UndefElts;
5730 SmallVector<APInt, 64> EltBits;
5731 if (!getTargetConstantBitsFromNode(V, EltSizeInBIts, UndefElts, EltBits,
5732 /*AllowWholeUndefs*/ AllowUndefs,
5733 /*AllowPartialUndefs*/ false))
5734 return false;
5735
5736 bool IsPow2OrUndef = true;
5737 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5738 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5739 return IsPow2OrUndef;
5740}
5741
5742// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5744 // TODO: don't always ignore oneuse constraints.
5745 V = peekThroughBitcasts(V);
5746 EVT VT = V.getValueType();
5747
5748 // Match not(xor X, -1) -> X.
5749 if (V.getOpcode() == ISD::XOR &&
5750 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5751 isAllOnesConstant(V.getOperand(1))))
5752 return V.getOperand(0);
5753
5754 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5755 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5756 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5757 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5758 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5759 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5760 V.getOperand(1));
5761 }
5762 }
5763
5764 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5765 if (V.getOpcode() == X86ISD::PCMPGT &&
5766 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5767 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5768 V.getOperand(0).hasOneUse()) {
5769 APInt UndefElts;
5770 SmallVector<APInt> EltBits;
5771 if (getTargetConstantBitsFromNode(V.getOperand(0),
5772 V.getScalarValueSizeInBits(), UndefElts,
5773 EltBits) &&
5774 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5775 // Don't fold min_signed_value -> (min_signed_value - 1)
5776 bool MinSigned = false;
5777 for (APInt &Elt : EltBits) {
5778 MinSigned |= Elt.isMinSignedValue();
5779 Elt -= 1;
5780 }
5781 if (!MinSigned) {
5782 SDLoc DL(V);
5783 MVT VT = V.getSimpleValueType();
5784 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5785 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5786 }
5787 }
5788 }
5789
5790 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5792 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5793 for (SDValue &CatOp : CatOps) {
5794 SDValue NotCat = IsNOT(CatOp, DAG);
5795 if (!NotCat)
5796 return SDValue();
5797 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5798 }
5799 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5800 }
5801
5802 // Match not(or(not(X),not(Y))) -> and(X, Y).
5803 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5804 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5805 // TODO: Handle cases with single NOT operand -> ANDNP
5806 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5807 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5808 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5809 DAG.getBitcast(VT, Op1));
5810 }
5811
5812 return SDValue();
5813}
5814
5815/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5816/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5817/// Note: This ignores saturation, so inputs must be checked first.
5819 bool Unary, unsigned NumStages = 1) {
5820 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5821 unsigned NumElts = VT.getVectorNumElements();
5822 unsigned NumLanes = VT.getSizeInBits() / 128;
5823 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5824 unsigned Offset = Unary ? 0 : NumElts;
5825 unsigned Repetitions = 1u << (NumStages - 1);
5826 unsigned Increment = 1u << NumStages;
5827 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5828
5829 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5830 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5831 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5832 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5833 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5834 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5835 }
5836 }
5837}
5838
5839// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5840static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5841 APInt &DemandedLHS, APInt &DemandedRHS) {
5842 int NumLanes = VT.getSizeInBits() / 128;
5843 int NumElts = DemandedElts.getBitWidth();
5844 int NumInnerElts = NumElts / 2;
5845 int NumEltsPerLane = NumElts / NumLanes;
5846 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5847
5848 DemandedLHS = APInt::getZero(NumInnerElts);
5849 DemandedRHS = APInt::getZero(NumInnerElts);
5850
5851 // Map DemandedElts to the packed operands.
5852 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5853 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5854 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5855 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5856 if (DemandedElts[OuterIdx])
5857 DemandedLHS.setBit(InnerIdx);
5858 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5859 DemandedRHS.setBit(InnerIdx);
5860 }
5861 }
5862}
5863
5864// Split the demanded elts of a HADD/HSUB node between its operands.
5865static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5866 APInt &DemandedLHS, APInt &DemandedRHS) {
5868 DemandedLHS, DemandedRHS);
5869 DemandedLHS |= DemandedLHS << 1;
5870 DemandedRHS |= DemandedRHS << 1;
5871}
5872
5873/// Calculates the shuffle mask corresponding to the target-specific opcode.
5874/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5875/// operands in \p Ops, and returns true.
5876/// Sets \p IsUnary to true if only one source is used. Note that this will set
5877/// IsUnary for shuffles which use a single input multiple times, and in those
5878/// cases it will adjust the mask to only have indices within that single input.
5879/// It is an error to call this with non-empty Mask/Ops vectors.
5880static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5882 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5883 if (!isTargetShuffle(N.getOpcode()))
5884 return false;
5885
5886 MVT VT = N.getSimpleValueType();
5887 unsigned NumElems = VT.getVectorNumElements();
5888 unsigned MaskEltSize = VT.getScalarSizeInBits();
5890 APInt RawUndefs;
5891 uint64_t ImmN;
5892
5893 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5894 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5895
5896 IsUnary = false;
5897 bool IsFakeUnary = false;
5898 switch (N.getOpcode()) {
5899 case X86ISD::BLENDI:
5900 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5901 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5902 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5903 DecodeBLENDMask(NumElems, ImmN, Mask);
5904 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5905 break;
5906 case X86ISD::SHUFP:
5907 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5908 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5909 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5910 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5911 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5912 break;
5913 case X86ISD::INSERTPS:
5914 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5915 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5916 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5917 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5918 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5919 break;
5920 case X86ISD::EXTRQI:
5921 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5922 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5923 isa<ConstantSDNode>(N.getOperand(2))) {
5924 int BitLen = N.getConstantOperandVal(1);
5925 int BitIdx = N.getConstantOperandVal(2);
5926 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5927 IsUnary = true;
5928 }
5929 break;
5930 case X86ISD::INSERTQI:
5931 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5932 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5933 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5934 isa<ConstantSDNode>(N.getOperand(3))) {
5935 int BitLen = N.getConstantOperandVal(2);
5936 int BitIdx = N.getConstantOperandVal(3);
5937 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5938 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5939 }
5940 break;
5941 case X86ISD::UNPCKH:
5942 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5943 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5944 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5945 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5946 break;
5947 case X86ISD::UNPCKL:
5948 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5949 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5950 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5951 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5952 break;
5953 case X86ISD::MOVHLPS:
5954 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5955 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5956 DecodeMOVHLPSMask(NumElems, Mask);
5957 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5958 break;
5959 case X86ISD::MOVLHPS:
5960 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5961 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5962 DecodeMOVLHPSMask(NumElems, Mask);
5963 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5964 break;
5965 case X86ISD::VALIGN:
5966 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5967 "Only 32-bit and 64-bit elements are supported!");
5968 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5969 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5970 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5971 DecodeVALIGNMask(NumElems, ImmN, Mask);
5972 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5973 Ops.push_back(N.getOperand(1));
5974 Ops.push_back(N.getOperand(0));
5975 break;
5976 case X86ISD::PALIGNR:
5977 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5978 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5979 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5980 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5981 DecodePALIGNRMask(NumElems, ImmN, Mask);
5982 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5983 Ops.push_back(N.getOperand(1));
5984 Ops.push_back(N.getOperand(0));
5985 break;
5986 case X86ISD::VSHLDQ:
5987 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5988 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5989 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5990 DecodePSLLDQMask(NumElems, ImmN, Mask);
5991 IsUnary = true;
5992 break;
5993 case X86ISD::VSRLDQ:
5994 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5995 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5996 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5997 DecodePSRLDQMask(NumElems, ImmN, Mask);
5998 IsUnary = true;
5999 break;
6000 case X86ISD::PSHUFD:
6001 case X86ISD::VPERMILPI:
6002 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6003 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6004 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
6005 IsUnary = true;
6006 break;
6007 case X86ISD::PSHUFHW:
6008 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6009 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6010 DecodePSHUFHWMask(NumElems, ImmN, Mask);
6011 IsUnary = true;
6012 break;
6013 case X86ISD::PSHUFLW:
6014 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6015 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6016 DecodePSHUFLWMask(NumElems, ImmN, Mask);
6017 IsUnary = true;
6018 break;
6019 case X86ISD::VZEXT_MOVL:
6020 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6021 DecodeZeroMoveLowMask(NumElems, Mask);
6022 IsUnary = true;
6023 break;
6024 case X86ISD::VBROADCAST:
6025 // We only decode broadcasts of same-sized vectors, peeking through to
6026 // extracted subvectors is likely to cause hasOneUse issues with
6027 // SimplifyDemandedBits etc.
6028 if (N.getOperand(0).getValueType() == VT) {
6029 DecodeVectorBroadcast(NumElems, Mask);
6030 IsUnary = true;
6031 break;
6032 }
6033 return false;
6034 case X86ISD::VPERMILPV: {
6035 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6036 IsUnary = true;
6037 SDValue MaskNode = N.getOperand(1);
6038 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6039 RawUndefs)) {
6040 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
6041 break;
6042 }
6043 return false;
6044 }
6045 case X86ISD::PSHUFB: {
6046 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6047 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6048 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6049 IsUnary = true;
6050 SDValue MaskNode = N.getOperand(1);
6051 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
6052 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
6053 break;
6054 }
6055 return false;
6056 }
6057 case X86ISD::VPERMI:
6058 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6059 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6060 DecodeVPERMMask(NumElems, ImmN, Mask);
6061 IsUnary = true;
6062 break;
6063 case X86ISD::MOVSS:
6064 case X86ISD::MOVSD:
6065 case X86ISD::MOVSH:
6066 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6067 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6068 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
6069 break;
6070 case X86ISD::VPERM2X128:
6071 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6072 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6073 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6074 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
6075 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
6076 break;
6077 case X86ISD::SHUF128:
6078 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6079 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6080 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
6081 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
6082 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
6083 break;
6084 case X86ISD::MOVSLDUP:
6085 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6086 DecodeMOVSLDUPMask(NumElems, Mask);
6087 IsUnary = true;
6088 break;
6089 case X86ISD::MOVSHDUP:
6090 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6091 DecodeMOVSHDUPMask(NumElems, Mask);
6092 IsUnary = true;
6093 break;
6094 case X86ISD::MOVDDUP:
6095 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6096 DecodeMOVDDUPMask(NumElems, Mask);
6097 IsUnary = true;
6098 break;
6099 case X86ISD::VPERMIL2: {
6100 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6101 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6102 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
6103 SDValue MaskNode = N.getOperand(2);
6104 SDValue CtrlNode = N.getOperand(3);
6105 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
6106 unsigned CtrlImm = CtrlOp->getZExtValue();
6107 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6108 RawUndefs)) {
6109 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
6110 Mask);
6111 break;
6112 }
6113 }
6114 return false;
6115 }
6116 case X86ISD::VPPERM: {
6117 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6118 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6119 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
6120 SDValue MaskNode = N.getOperand(2);
6121 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
6122 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
6123 break;
6124 }
6125 return false;
6126 }
6127 case X86ISD::VPERMV: {
6128 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
6129 IsUnary = true;
6130 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
6131 Ops.push_back(N.getOperand(1));
6132 SDValue MaskNode = N.getOperand(0);
6133 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6134 RawUndefs)) {
6135 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
6136 break;
6137 }
6138 return false;
6139 }
6140 case X86ISD::VPERMV3: {
6141 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
6142 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
6143 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
6144 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
6145 Ops.push_back(N.getOperand(0));
6146 Ops.push_back(N.getOperand(2));
6147 SDValue MaskNode = N.getOperand(1);
6148 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6149 RawUndefs)) {
6150 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
6151 break;
6152 }
6153 return false;
6154 }
6155 case X86ISD::COMPRESS: {
6156 SDValue CmpVec = N.getOperand(0);
6157 SDValue PassThru = N.getOperand(1);
6158 SDValue CmpMask = N.getOperand(2);
6159 APInt UndefElts;
6160 SmallVector<APInt> EltBits;
6161 if (!getTargetConstantBitsFromNode(CmpMask, 1, UndefElts, EltBits))
6162 return false;
6163 assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems &&
6164 "Illegal compression mask");
6165 for (unsigned I = 0; I != NumElems; ++I) {
6166 if (!EltBits[I].isZero())
6167 Mask.push_back(I);
6168 }
6169 while (Mask.size() != NumElems) {
6170 Mask.push_back(NumElems + Mask.size());
6171 }
6172 Ops.push_back(CmpVec);
6173 Ops.push_back(PassThru);
6174 return true;
6175 }
6176 case X86ISD::EXPAND: {
6177 SDValue ExpVec = N.getOperand(0);
6178 SDValue PassThru = N.getOperand(1);
6179 SDValue ExpMask = N.getOperand(2);
6180 APInt UndefElts;
6181 SmallVector<APInt> EltBits;
6182 if (!getTargetConstantBitsFromNode(ExpMask, 1, UndefElts, EltBits))
6183 return false;
6184 assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems &&
6185 "Illegal expansion mask");
6186 unsigned ExpIndex = 0;
6187 for (unsigned I = 0; I != NumElems; ++I) {
6188 if (EltBits[I].isZero())
6189 Mask.push_back(I + NumElems);
6190 else
6191 Mask.push_back(ExpIndex++);
6192 }
6193 Ops.push_back(ExpVec);
6194 Ops.push_back(PassThru);
6195 return true;
6196 }
6197 default:
6198 llvm_unreachable("unknown target shuffle node");
6199 }
6200
6201 // Empty mask indicates the decode failed.
6202 if (Mask.empty())
6203 return false;
6204
6205 // Check if we're getting a shuffle mask with zero'd elements.
6206 if (!AllowSentinelZero && isAnyZero(Mask))
6207 return false;
6208
6209 // If we have a fake unary shuffle, the shuffle mask is spread across two
6210 // inputs that are actually the same node. Re-map the mask to always point
6211 // into the first input.
6212 if (IsFakeUnary)
6213 for (int &M : Mask)
6214 if (M >= (int)Mask.size())
6215 M -= Mask.size();
6216
6217 // If we didn't already add operands in the opcode-specific code, default to
6218 // adding 1 or 2 operands starting at 0.
6219 if (Ops.empty()) {
6220 Ops.push_back(N.getOperand(0));
6221 if (!IsUnary || IsFakeUnary)
6222 Ops.push_back(N.getOperand(1));
6223 }
6224
6225 return true;
6226}
6227
6228// Wrapper for getTargetShuffleMask with InUnary;
6229static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
6231 SmallVectorImpl<int> &Mask) {
6232 bool IsUnary;
6233 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
6234}
6235
6236/// Compute whether each element of a shuffle is zeroable.
6237///
6238/// A "zeroable" vector shuffle element is one which can be lowered to zero.
6239/// Either it is an undef element in the shuffle mask, the element of the input
6240/// referenced is undef, or the element of the input referenced is known to be
6241/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
6242/// as many lanes with this technique as possible to simplify the remaining
6243/// shuffle.
6245 SDValue V1, SDValue V2,
6246 APInt &KnownUndef, APInt &KnownZero) {
6247 int Size = Mask.size();
6248 KnownUndef = KnownZero = APInt::getZero(Size);
6249
6250 V1 = peekThroughBitcasts(V1);
6251 V2 = peekThroughBitcasts(V2);
6252
6253 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
6254 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
6255
6256 int VectorSizeInBits = V1.getValueSizeInBits();
6257 int ScalarSizeInBits = VectorSizeInBits / Size;
6258 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
6259
6260 for (int i = 0; i < Size; ++i) {
6261 int M = Mask[i];
6262 // Handle the easy cases.
6263 if (M < 0) {
6264 KnownUndef.setBit(i);
6265 continue;
6266 }
6267 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
6268 KnownZero.setBit(i);
6269 continue;
6270 }
6271
6272 // Determine shuffle input and normalize the mask.
6273 SDValue V = M < Size ? V1 : V2;
6274 M %= Size;
6275
6276 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
6277 if (V.getOpcode() != ISD::BUILD_VECTOR)
6278 continue;
6279
6280 // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
6281 // the (larger) source element must be UNDEF/ZERO.
6282 if ((Size % V.getNumOperands()) == 0) {
6283 int Scale = Size / V->getNumOperands();
6284 SDValue Op = V.getOperand(M / Scale);
6285 if (Op.isUndef())
6286 KnownUndef.setBit(i);
6287 if (X86::isZeroNode(Op))
6288 KnownZero.setBit(i);
6289 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
6290 APInt Val = Cst->getAPIntValue();
6291 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
6292 if (Val == 0)
6293 KnownZero.setBit(i);
6294 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6295 APInt Val = Cst->getValueAPF().bitcastToAPInt();
6296 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
6297 if (Val == 0)
6298 KnownZero.setBit(i);
6299 }
6300 continue;
6301 }
6302
6303 // If the BUILD_VECTOR has more elements then all the (smaller) source
6304 // elements must be UNDEF or ZERO.
6305 if ((V.getNumOperands() % Size) == 0) {
6306 int Scale = V->getNumOperands() / Size;
6307 bool AllUndef = true;
6308 bool AllZero = true;
6309 for (int j = 0; j < Scale; ++j) {
6310 SDValue Op = V.getOperand((M * Scale) + j);
6311 AllUndef &= Op.isUndef();
6312 AllZero &= X86::isZeroNode(Op);
6313 }
6314 if (AllUndef)
6315 KnownUndef.setBit(i);
6316 if (AllZero)
6317 KnownZero.setBit(i);
6318 continue;
6319 }
6320 }
6321}
6322
6323/// Decode a target shuffle mask and inputs and see if any values are
6324/// known to be undef or zero from their inputs.
6325/// Returns true if the target shuffle mask was decoded.
6326/// FIXME: Merge this with computeZeroableShuffleElements?
// NOTE(review): the leading signature lines (6327-6328) are missing from this
// extract; from the call site below (getTargetShuffleInputs) the full
// parameter list is (SDValue N, SmallVectorImpl<int> &Mask,
// SmallVectorImpl<SDValue> &Ops, APInt &KnownUndef, APInt &KnownZero) -
// confirm against the full source.
6329 APInt &KnownUndef, APInt &KnownZero) {
6330 bool IsUnary;
// Only target-specific shuffle opcodes are handled here; ISD::VECTOR_SHUFFLE
// is decoded separately (see getFauxShuffleMask).
6331 if (!isTargetShuffle(N.getOpcode()))
6332 return false;
6333
6334 MVT VT = N.getSimpleValueType();
6335 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
6336 return false;
6337
6338 int Size = Mask.size();
6339 SDValue V1 = Ops[0];
// For unary shuffles both mask halves refer to the same input.
6340 SDValue V2 = IsUnary ? V1 : Ops[1];
6341 KnownUndef = KnownZero = APInt::getZero(Size);
6342
6343 V1 = peekThroughBitcasts(V1);
6344 V2 = peekThroughBitcasts(V2);
6345
6346 assert((VT.getSizeInBits() % Size) == 0 &&
6347 "Illegal split of shuffle value type");
6348 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
6349
6350 // Extract known constant input data.
// Partial undefs are disallowed so every reported constant element is fully
// known; whole-undef elements are tracked in UndefSrcElts instead.
6351 APInt UndefSrcElts[2];
6352 SmallVector<APInt, 32> SrcEltBits[2];
6353 bool IsSrcConstant[2] = {
6354 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
6355 SrcEltBits[0], /*AllowWholeUndefs*/ true,
6356 /*AllowPartialUndefs*/ false),
6357 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
6358 SrcEltBits[1], /*AllowWholeUndefs*/ true,
6359 /*AllowPartialUndefs*/ false)};
6360
// Classify each mask element as known-undef / known-zero where provable.
6361 for (int i = 0; i < Size; ++i) {
6362 int M = Mask[i];
6363
6364 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
6365 if (M < 0) {
6366 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
6367 if (SM_SentinelUndef == M)
6368 KnownUndef.setBit(i);
6369 if (SM_SentinelZero == M)
6370 KnownZero.setBit(i);
6371 continue;
6372 }
6373
6374 // Determine shuffle input and normalize the mask.
6375 unsigned SrcIdx = M / Size;
6376 SDValue V = M < Size ? V1 : V2;
6377 M %= Size;
6378
6379 // We are referencing an UNDEF input.
6380 if (V.isUndef()) {
6381 KnownUndef.setBit(i);
6382 continue;
6383 }
6384
6385 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6386 // TODO: We currently only set UNDEF for integer types - floats use the same
6387 // registers as vectors and many of the scalar folded loads rely on the
6388 // SCALAR_TO_VECTOR pattern.
6389 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6390 (Size % V.getValueType().getVectorNumElements()) == 0) {
6391 int Scale = Size / V.getValueType().getVectorNumElements();
6392 int Idx = M / Scale;
6393 if (Idx != 0 && !VT.isFloatingPoint())
6394 KnownUndef.setBit(i);
6395 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6396 KnownZero.setBit(i);
6397 continue;
6398 }
6399
6400 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6401 // base vectors.
6402 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6403 SDValue Vec = V.getOperand(0);
6404 int NumVecElts = Vec.getValueType().getVectorNumElements();
6405 if (Vec.isUndef() && Size == NumVecElts) {
6406 int Idx = V.getConstantOperandVal(2);
6407 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
// Elements outside the inserted subvector come from the UNDEF base.
6408 if (M < Idx || (Idx + NumSubElts) <= M)
6409 KnownUndef.setBit(i);
6410 }
6411 continue;
6412 }
6413
6414 // Attempt to extract from the source's constant bits.
6415 if (IsSrcConstant[SrcIdx]) {
6416 if (UndefSrcElts[SrcIdx][M])
6417 KnownUndef.setBit(i);
6418 else if (SrcEltBits[SrcIdx][M] == 0)
6419 KnownZero.setBit(i);
6420 }
6421 }
6422
6423 assert(VT.getVectorNumElements() == (unsigned)Size &&
6424 "Different mask size from vector size!");
6425 return true;
6426}
6427
6428// Replace target shuffle mask elements with known undef/zero sentinels.
// NOTE(review): the first signature line (6429) is missing from this extract;
// from the call sites the first parameter is SmallVectorImpl<int> &Mask -
// confirm against the full source.
6430 const APInt &KnownUndef,
6431 const APInt &KnownZero,
6432 bool ResolveKnownZeros= true) {
6433 unsigned NumElts = Mask.size();
6434 assert(KnownUndef.getBitWidth() == NumElts &&
6435 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6436
// Undef takes priority over zero: an element flagged in both bitmasks is
// rewritten as SM_SentinelUndef. Zero resolution is optional via
// ResolveKnownZeros.
6437 for (unsigned i = 0; i != NumElts; ++i) {
6438 if (KnownUndef[i])
6439 Mask[i] = SM_SentinelUndef;
6440 else if (ResolveKnownZeros && KnownZero[i])
6441 Mask[i] = SM_SentinelZero;
6442 }
6443}
6444
6445// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
// Inverse of resolveTargetShuffleFromZeroables: scans the mask for sentinel
// values and records them as bits in the (re-initialized) output APInts.
// NOTE(review): the first signature line (6446) is missing from this extract;
// from the call sites the first parameter is the shuffle mask - confirm
// against the full source.
6447 APInt &KnownUndef,
6448 APInt &KnownZero) {
6449 unsigned NumElts = Mask.size();
6450 KnownUndef = KnownZero = APInt::getZero(NumElts);
6451
6452 for (unsigned i = 0; i != NumElts; ++i) {
6453 int M = Mask[i];
6454 if (SM_SentinelUndef == M)
6455 KnownUndef.setBit(i);
6456 if (SM_SentinelZero == M)
6457 KnownZero.setBit(i);
6458 }
6459}
6460
6461// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
// Succeeds only when the condition is a (whole-element) constant, producing a
// blend mask where element i selects operand1 (index i) or operand2
// (index i + NumElts).
// NOTE(review): the first signature line (6462) is missing from this extract;
// the visible parameters are (SmallVectorImpl<int> &Mask, SDValue Cond,
// bool IsBLENDV) judging from usage - confirm against the full source.
6463 SDValue Cond, bool IsBLENDV = false) {
6464 EVT CondVT = Cond.getValueType();
6465 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6466 unsigned NumElts = CondVT.getVectorNumElements();
6467
6468 APInt UndefElts;
6469 SmallVector<APInt, 32> EltBits;
6470 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6471 /*AllowWholeUndefs*/ true,
6472 /*AllowPartialUndefs*/ false))
6473 return false;
6474
6475 Mask.resize(NumElts, SM_SentinelUndef);
6476
6477 for (int i = 0; i != (int)NumElts; ++i) {
6478 Mask[i] = i;
6479 // Arbitrarily choose from the 2nd operand if the select condition element
6480 // is undef.
6481 // TODO: Can we do better by matching patterns such as even/odd?
// VSELECT selects op2 when the condition element is zero; BLENDV selects op2
// when the sign bit is clear (i.e. the element is non-negative).
6482 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6483 (IsBLENDV && EltBits[i].isNonNegative()))
6484 Mask[i] += NumElts;
6485 }
6486
6487 return true;
6488}
6489
6490// Forward declaration (for getFauxShuffleMask recursive check).
6491static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6494 const SelectionDAG &DAG, unsigned Depth,
6495 bool ResolveKnownElts);
6496
6497// Attempt to decode ops that could be represented as a shuffle mask.
6498// The decoded shuffle mask may contain a different number of elements to the
6499// destination value type.
6500// TODO: Merge into getTargetShuffleInputs()
// Decodes non-shuffle opcodes (logic ops, shifts, extensions, insertions,
// packs, ...) into an equivalent shuffle mask + input operands ("faux"
// shuffle). Returns false when no shuffle interpretation is possible.
6501static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
// NOTE(review): parameter lines 6502-6503 are missing from this extract; from
// the body they are SmallVectorImpl<int> &Mask and SmallVectorImpl<SDValue>
// &Ops - confirm against the full source.
6504 const SelectionDAG &DAG, unsigned Depth,
6505 bool ResolveKnownElts) {
6506 Mask.clear();
6507 Ops.clear();
6508
6509 MVT VT = N.getSimpleValueType();
6510 unsigned NumElts = VT.getVectorNumElements();
6511 unsigned NumSizeInBits = VT.getSizeInBits();
6512 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
// Only byte-aligned element/vector sizes can be decoded (several cases below
// build byte-level masks).
6513 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6514 return false;
6515 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6516 unsigned NumSizeInBytes = NumSizeInBits / 8;
6517 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6518
6519 unsigned Opcode = N.getOpcode();
6520 switch (Opcode) {
6521 case ISD::VECTOR_SHUFFLE: {
6522 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6523 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6524 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6525 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6526 Ops.push_back(N.getOperand(0));
6527 Ops.push_back(N.getOperand(1));
6528 return true;
6529 }
6530 return false;
6531 }
6532 case ISD::AND:
6533 case X86ISD::ANDNP: {
6534 // Attempt to decode as a per-byte mask.
6535 APInt UndefElts;
6536 SmallVector<APInt, 32> EltBits;
6537 SDValue N0 = N.getOperand(0);
6538 SDValue N1 = N.getOperand(1);
6539 bool IsAndN = (X86ISD::ANDNP == Opcode);
// ANDNP inverts its first operand, so an all-ones constant byte selects zero.
6540 uint64_t ZeroMask = IsAndN ? 255 : 0;
6541 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6542 /*AllowWholeUndefs*/ false,
6543 /*AllowPartialUndefs*/ false))
6544 return false;
6545 // We can't assume an undef src element gives an undef dst - the other src
6546 // might be zero.
6547 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6548 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6549 const APInt &ByteBits = EltBits[i];
// Each constant byte must be all-zeros or all-ones to act as a select.
6550 if (ByteBits != 0 && ByteBits != 255)
6551 return false;
6552 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6553 }
6554 Ops.push_back(IsAndN ? N1 : N0);
6555 return true;
6556 }
6557 case ISD::OR: {
6558 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6559 // is a valid shuffle index.
6560 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6561 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6562 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6563 return false;
6564
6565 SmallVector<int, 64> SrcMask0, SrcMask1;
6566 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
// NOTE(review): lines 6567-6568 are missing from this extract; presumably the
// APInt Demand0/Demand1 all-ones demanded-element masks used below - confirm
// against the full source.
6569 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6570 Depth + 1, true) ||
6571 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6572 Depth + 1, true))
6573 return false;
6574
// Scale both masks to a common width so they can be merged per-element.
6575 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6576 SmallVector<int, 64> Mask0, Mask1;
6577 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6578 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6579 for (int i = 0; i != (int)MaskSize; ++i) {
6580 // NOTE: Don't handle demanded SM_SentinelUndef, as we can end up in
6581 // infinite loops converting between OR and BLEND shuffles due to
6582 // canWidenShuffleElements merging away undef elements, meaning we
6583 // fail to recognise the OR as the undef element isn't known zero.
6584 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6585 Mask.push_back(SM_SentinelZero);
6586 else if (Mask1[i] == SM_SentinelZero)
6587 Mask.push_back(i);
6588 else if (Mask0[i] == SM_SentinelZero)
6589 Mask.push_back(i + MaskSize);
6590 else if (MaskSize == NumElts && !DemandedElts[i])
6591 Mask.push_back(SM_SentinelUndef);
6592 else
6593 return false;
6594 }
6595 Ops.push_back(N.getOperand(0));
6596 Ops.push_back(N.getOperand(1));
6597 return true;
6598 }
6599 case ISD::CONCAT_VECTORS: {
6600 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6601 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6602 if (NumBitsPerElt == 64) {
6603 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6604 for (unsigned M = 0; M != NumSubElts; ++M)
6605 Mask.push_back((I * NumElts) + M);
6606 Ops.push_back(N.getOperand(I));
6607 }
6608 return true;
6609 }
6610 return false;
6611 }
6612 case ISD::INSERT_SUBVECTOR: {
6613 SDValue Src = N.getOperand(0);
6614 SDValue Sub = N.getOperand(1);
6615 EVT SubVT = Sub.getValueType();
6616 unsigned NumSubElts = SubVT.getVectorNumElements();
6617 uint64_t InsertIdx = N.getConstantOperandVal(2);
6618 // Subvector isn't demanded - just return the base vector.
6619 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6620 Mask.resize(NumElts);
6621 std::iota(Mask.begin(), Mask.end(), 0);
6622 Ops.push_back(Src);
6623 return true;
6624 }
6625 // Handle CONCAT(SUB0, SUB1).
6626 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6627 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6628 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6629 Src.getOperand(0).isUndef() &&
6630 Src.getOperand(1).getValueType() == SubVT &&
6631 Src.getConstantOperandVal(2) == 0 &&
6632 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6633 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6634 Mask.resize(NumElts);
6635 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6636 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6637 Ops.push_back(Src.getOperand(1));
6638 Ops.push_back(Sub);
6639 return true;
6640 }
6641 if (!N->isOnlyUserOf(Sub.getNode()))
6642 return false;
6643
6644 SmallVector<int, 64> SubMask;
6645 SmallVector<SDValue, 2> SubInputs;
// NOTE(review): line 6646 is missing from this extract; presumably
// SDValue SubSrc = peekThroughBitcasts(Sub); - confirm against the full
// source.
6647 EVT SubSrcVT = SubSrc.getValueType();
6648 if (!SubSrcVT.isVector())
6649 return false;
6650
6651 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6652 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6653 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6654 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6655 SDValue SubSrcSrc = SubSrc.getOperand(0);
6656 unsigned NumSubSrcSrcElts =
6657 SubSrcSrc.getValueType().getVectorNumElements();
6658 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6659 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6660 "Subvector valuetype mismatch");
// Rescale all indices to the widest element count before building the mask.
6661 InsertIdx *= (MaxElts / NumElts);
6662 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6663 NumSubElts *= (MaxElts / NumElts);
6664 bool SrcIsUndef = Src.isUndef();
6665 for (int i = 0; i != (int)MaxElts; ++i)
6666 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6667 for (int i = 0; i != (int)NumSubElts; ++i)
6668 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6669 if (!SrcIsUndef)
6670 Ops.push_back(Src);
6671 Ops.push_back(SubSrcSrc);
6672 return true;
6673 }
6674
6675 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6676 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6677 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6678 Depth + 1, ResolveKnownElts))
6679 return false;
6680
6681 // Subvector shuffle inputs must not be larger than the subvector.
6682 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6683 return SubVT.getFixedSizeInBits() <
6684 SubInput.getValueSizeInBits().getFixedValue();
6685 }))
6686 return false;
6687
// Rescale the subvector mask (either direction) so it lines up with the
// outer vector's element granularity.
6688 if (SubMask.size() != NumSubElts) {
6689 assert(((SubMask.size() % NumSubElts) == 0 ||
6690 (NumSubElts % SubMask.size()) == 0) &&
6691 "Illegal submask scale");
6692 if ((NumSubElts % SubMask.size()) == 0) {
6693 int Scale = NumSubElts / SubMask.size();
6694 SmallVector<int, 64> ScaledSubMask;
6695 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6696 SubMask = ScaledSubMask;
6697 } else {
6698 int Scale = SubMask.size() / NumSubElts;
6699 NumSubElts = SubMask.size();
6700 NumElts *= Scale;
6701 InsertIdx *= Scale;
6702 }
6703 }
6704 Ops.push_back(Src);
6705 Ops.append(SubInputs.begin(), SubInputs.end());
6706 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6707 Mask.append(NumElts, SM_SentinelZero);
6708 else
6709 for (int i = 0; i != (int)NumElts; ++i)
6710 Mask.push_back(i);
// Splice the (re-based) subvector mask over the inserted element range.
6711 for (int i = 0; i != (int)NumSubElts; ++i) {
6712 int M = SubMask[i];
6713 if (0 <= M) {
6714 int InputIdx = M / NumSubElts;
6715 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6716 }
6717 Mask[i + InsertIdx] = M;
6718 }
6719 return true;
6720 }
6721 case X86ISD::PINSRB:
6722 case X86ISD::PINSRW:
// NOTE(review): lines 6723-6724 are missing from this extract; presumably
// the ISD::INSERT_VECTOR_ELT / ISD::SCALAR_TO_VECTOR case labels that this
// shared block also handles (see the Opcode checks below) - confirm against
// the full source.
6725 // Match against a insert_vector_elt/scalar_to_vector of an extract from a
6726 // vector, for matching src/dst vector types.
6727 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6728
6729 unsigned DstIdx = 0;
6730 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6731 // Check we have an in-range constant insertion index.
6732 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6733 N.getConstantOperandAPInt(2).uge(NumElts))
6734 return false;
6735 DstIdx = N.getConstantOperandVal(2);
6736
6737 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6738 if (X86::isZeroNode(Scl)) {
6739 Ops.push_back(N.getOperand(0));
6740 for (unsigned i = 0; i != NumElts; ++i)
6741 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6742 return true;
6743 }
6744 }
6745
6746 // Peek through trunc/aext/zext/bitcast.
6747 // TODO: aext shouldn't require SM_SentinelZero padding.
6748 // TODO: handle shift of scalars.
6749 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6750 while (Scl.getOpcode() == ISD::TRUNCATE ||
6751 Scl.getOpcode() == ISD::ANY_EXTEND ||
6752 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6753 (Scl.getOpcode() == ISD::BITCAST &&
// NOTE(review): lines 6754-6755 are missing from this extract; presumably the
// remainder of the bitcast condition and the loop's opening brace - confirm
// against the full source.
6756 Scl = Scl.getOperand(0);
6757 MinBitsPerElt =
6758 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6759 }
6760 if ((MinBitsPerElt % 8) != 0)
6761 return false;
6762
6763 // Attempt to find the source vector the scalar was extracted from.
6764 SDValue SrcExtract;
6765 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6766 Scl.getOpcode() == X86ISD::PEXTRW ||
6767 Scl.getOpcode() == X86ISD::PEXTRB) &&
6768 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6769 SrcExtract = Scl;
6770 }
6771 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6772 return false;
6773
6774 SDValue SrcVec = SrcExtract.getOperand(0);
6775 EVT SrcVT = SrcVec.getValueType();
6776 if (!SrcVT.getScalarType().isByteSized())
6777 return false;
6778 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6779 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6780 unsigned DstByte = DstIdx * NumBytesPerElt;
6781 MinBitsPerElt =
6782 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6783
6784 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6785 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6786 Ops.push_back(SrcVec);
6787 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6788 } else {
6789 Ops.push_back(SrcVec);
6790 Ops.push_back(N.getOperand(0));
6791 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6792 Mask.push_back(NumSizeInBytes + i);
6793 }
6794
// Copy the extracted bytes into place; any tail bytes of the destination
// element beyond the (possibly truncated) scalar width are known zero.
6795 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6796 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6797 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6798 Mask[DstByte + i] = SrcByte + i;
6799 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6800 Mask[DstByte + i] = SM_SentinelZero;
6801 return true;
6802 }
6803 case X86ISD::PACKSS:
6804 case X86ISD::PACKUS: {
6805 SDValue N0 = N.getOperand(0);
6806 SDValue N1 = N.getOperand(1);
6807 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6808 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6809 "Unexpected input value type");
6810
6811 APInt EltsLHS, EltsRHS;
6812 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6813
6814 // If we know input saturation won't happen (or we don't care for particular
6815 // lanes), we can treat this as a truncation shuffle.
6816 bool Offset0 = false, Offset1 = false;
6817 if (Opcode == X86ISD::PACKSS) {
6818 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6819 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6820 (!(N1.isUndef() || EltsRHS.isZero()) &&
6821 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6822 return false;
6823 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6824 // PACKSS then it was likely being used for sign-extension for a
6825 // truncation, so just peek through and adjust the mask accordingly.
6826 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6827 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6828 Offset0 = true;
6829 N0 = N0.getOperand(0);
6830 }
6831 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6832 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6833 Offset1 = true;
6834 N1 = N1.getOperand(0);
6835 }
6836 } else {
// PACKUS: inputs must have their upper half bits known zero for the pack to
// behave as a plain truncation.
6837 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6838 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6839 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6840 (!(N1.isUndef() || EltsRHS.isZero()) &&
6841 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6842 return false;
6843 }
6844
6845 bool IsUnary = (N0 == N1);
6846
6847 Ops.push_back(N0);
6848 if (!IsUnary)
6849 Ops.push_back(N1);
6850
6851 createPackShuffleMask(VT, Mask, IsUnary);
6852
// If we peeked through a VSRAI above, the pack now reads the *upper* half of
// each source element, so bump the relevant mask indices by one.
6853 if (Offset0 || Offset1) {
6854 for (int &M : Mask)
6855 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6856 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6857 ++M;
6858 }
6859 return true;
6860 }
6861 case ISD::VSELECT:
6862 case X86ISD::BLENDV: {
6863 SDValue Cond = N.getOperand(0);
6864 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6865 Ops.push_back(N.getOperand(1));
6866 Ops.push_back(N.getOperand(2));
6867 return true;
6868 }
6869 return false;
6870 }
6871 case X86ISD::VTRUNC: {
6872 SDValue Src = N.getOperand(0);
6873 EVT SrcVT = Src.getValueType();
6874 if (SrcVT.getSizeInBits() != NumSizeInBits)
6875 return false;
6876 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6877 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6878 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6879 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6880 for (unsigned i = 0; i != NumSrcElts; ++i)
6881 Mask.push_back(i * Scale);
// VTRUNC zeroes the elements above the truncated source width.
6882 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6883 Ops.push_back(Src);
6884 return true;
6885 }
6886 case ISD::SHL:
6887 case ISD::SRL: {
6888 APInt UndefElts;
6889 SmallVector<APInt, 32> EltBits;
6890 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6891 UndefElts, EltBits,
6892 /*AllowWholeUndefs*/ true,
6893 /*AllowPartialUndefs*/ false))
6894 return false;
6895
6896 // We can only decode 'whole byte' bit shifts as shuffles.
6897 for (unsigned I = 0; I != NumElts; ++I)
6898 if (DemandedElts[I] && !UndefElts[I] &&
6899 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6900 return false;
6901
6902 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6903 Ops.push_back(N.getOperand(0));
6904
6905 for (unsigned I = 0; I != NumElts; ++I) {
6906 if (!DemandedElts[I] || UndefElts[I])
6907 continue;
6908 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6909 unsigned Lo = I * NumBytesPerElt;
6910 unsigned Hi = Lo + NumBytesPerElt;
6911 // Clear mask to all zeros and insert the shifted byte indices.
6912 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6913 if (ISD::SHL == Opcode)
6914 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo)
6915 else
6916 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6917 Lo + ByteShift);
6918 }
6919 return true;
6920 }
6921 case X86ISD::VSHLI:
6922 case X86ISD::VSRLI: {
6923 uint64_t ShiftVal = N.getConstantOperandVal(1);
6924 // Out of range bit shifts are guaranteed to be zero.
6925 if (NumBitsPerElt <= ShiftVal) {
6926 Mask.append(NumElts, SM_SentinelZero);
6927 return true;
6928 }
6929
6930 // We can only decode 'whole byte' bit shifts as shuffles.
6931 if ((ShiftVal % 8) != 0)
6932 break;
6933
6934 uint64_t ByteShift = ShiftVal / 8;
6935 Ops.push_back(N.getOperand(0));
6936
6937 // Clear mask to all zeros and insert the shifted byte indices.
6938 Mask.append(NumSizeInBytes, SM_SentinelZero);
6939
// x86 is little-endian, so a left bit-shift moves bytes to higher indices
// (and vice versa for a right shift).
6940 if (X86ISD::VSHLI == Opcode) {
6941 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6942 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6943 Mask[i + j] = i + j - ByteShift;
6944 } else {
6945 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6946 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6947 Mask[i + j - ByteShift] = i + j;
6948 }
6949 return true;
6950 }
6951 case ISD::ROTL:
6952 case ISD::ROTR: {
6953 APInt UndefElts;
6954 SmallVector<APInt, 32> EltBits;
6955 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6956 UndefElts, EltBits,
6957 /*AllowWholeUndefs*/ true,
6958 /*AllowPartialUndefs*/ false))
6959 return false;
6960
6961 // We can only decode 'whole byte' bit rotates as shuffles.
6962 for (unsigned I = 0; I != NumElts; ++I)
6963 if (DemandedElts[I] && !UndefElts[I] &&
6964 (EltBits[I].urem(NumBitsPerElt) % 8) != 0)
6965 return false;
6966
6967 Ops.push_back(N.getOperand(0));
6968 for (unsigned I = 0; I != NumElts; ++I) {
6969 if (!DemandedElts[I] || UndefElts[I]) {
6970 Mask.append(NumBytesPerElt, SM_SentinelUndef);
6971 continue;
6972 }
6973 int Offset = EltBits[I].urem(NumBitsPerElt) / 8;
6974 Offset = (ISD::ROTL == Opcode ? NumBytesPerElt - Offset : Offset);
6975 int BaseIdx = I * NumBytesPerElt;
6976 for (int J = 0; J != (int)NumBytesPerElt; ++J) {
6977 Mask.push_back(BaseIdx + ((Offset + J) % NumBytesPerElt));
6978 }
6979 }
6980 return true;
6981 }
6982 case X86ISD::VROTLI:
6983 case X86ISD::VROTRI: {
6984 // We can only decode 'whole byte' bit rotates as shuffles.
6985 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6986 if ((RotateVal % 8) != 0)
6987 return false;
6988 Ops.push_back(N.getOperand(0));
6989 int Offset = RotateVal / 8;
6990 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6991 for (int i = 0; i != (int)NumElts; ++i) {
6992 int BaseIdx = i * NumBytesPerElt;
6993 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6994 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6995 }
6996 }
6997 return true;
6998 }
6999 case X86ISD::VBROADCAST: {
7000 SDValue Src = N.getOperand(0);
// Scalar broadcasts only decode when the scalar is element 0 of a
// same-scalar-type vector we can use as the shuffle input.
7001 if (!Src.getSimpleValueType().isVector()) {
7002 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7003 !isNullConstant(Src.getOperand(1)) ||
7004 Src.getOperand(0).getValueType().getScalarType() !=
7005 VT.getScalarType())
7006 return false;
7007 Src = Src.getOperand(0);
7008 }
7009 Ops.push_back(Src);
7010 Mask.append(NumElts, 0);
7011 return true;
7012 }
// NOTE(review): line 7013 is missing from this extract; from the
// all-signbits check below this is presumably the sign-extension case label
// (e.g. ISD::SIGN_EXTEND_VECTOR_INREG) - confirm against the full source.
7014 SDValue Src = N.getOperand(0);
7015 EVT SrcVT = Src.getValueType();
7016 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
7017
7018 // Extended source must be a simple vector.
7019 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7020 (NumBitsPerSrcElt % 8) != 0)
7021 return false;
7022
7023 // We can only handle all-signbits extensions.
7024 APInt DemandedSrcElts =
7025 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements())
7026 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
7027 return false;
7028
7029 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
7030 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
7031 for (unsigned I = 0; I != NumElts; ++I)
7032 Mask.append(Scale, I);
7033 Ops.push_back(Src);
7034 return true;
7035 }
7036 case ISD::ZERO_EXTEND:
7037 case ISD::ANY_EXTEND:
// NOTE(review): lines 7038-7039 are missing from this extract; presumably the
// *_EXTEND_VECTOR_INREG case labels referenced in IsAnyExtend below -
// confirm against the full source.
7040 SDValue Src = N.getOperand(0);
7041 EVT SrcVT = Src.getValueType();
7042
7043 // Extended source must be a simple vector.
7044 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7045 (SrcVT.getScalarSizeInBits() % 8) != 0)
7046 return false;
7047
7048 bool IsAnyExtend =
7049 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
7050 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
7051 IsAnyExtend, Mask);
7052 Ops.push_back(Src);
7053 return true;
7054 }
7055 }
7056
7057 return false;
7058}
7059
7060/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
/// Mask indices are interpreted in MaskWidth-sized "lanes": input k owns the
/// index range [k*MaskWidth, (k+1)*MaskWidth). Dropping or merging an input
/// shifts all higher indices down by MaskWidth.
// NOTE(review): the first signature line (7061) is missing from this extract;
// the first parameter is the Inputs vector (assigned at the bottom) -
// confirm against the full source.
7062 SmallVectorImpl<int> &Mask) {
7063 int MaskWidth = Mask.size();
7064 SmallVector<SDValue, 16> UsedInputs;
7065 for (int i = 0, e = Inputs.size(); i < e; ++i) {
7066 int lo = UsedInputs.size() * MaskWidth;
7067 int hi = lo + MaskWidth;
7068
7069 // Strip UNDEF input usage.
7070 if (Inputs[i].isUndef())
7071 for (int &M : Mask)
7072 if ((lo <= M) && (M < hi))
7073 M = SM_SentinelUndef;
7074
7075 // Check for unused inputs.
7076 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
7077 for (int &M : Mask)
7078 if (lo <= M)
7079 M -= MaskWidth;
7080 continue;
7081 }
7082
7083 // Check for repeated inputs.
// If this input already appears earlier, redirect its indices to the first
// occurrence and shift everything above it down.
7084 bool IsRepeat = false;
7085 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
7086 if (UsedInputs[j] != Inputs[i])
7087 continue;
7088 for (int &M : Mask)
7089 if (lo <= M)
7090 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
7091 IsRepeat = true;
7092 break;
7093 }
7094 if (IsRepeat)
7095 continue;
7096
7097 UsedInputs.push_back(Inputs[i]);
7098 }
7099 Inputs = std::move(UsedInputs);
7100}
7101
7102/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
7103/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
7104/// Returns true if the target shuffle mask was decoded.
/// Falls back to getFauxShuffleMask for ops that are not target shuffles but
/// can still be modelled as one.
7105static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
// NOTE(review): parameter lines 7106-7107 are missing from this extract;
// from the body they are the Inputs and Mask output vectors - confirm
// against the full source.
7108 APInt &KnownUndef, APInt &KnownZero,
7109 const SelectionDAG &DAG, unsigned Depth,
7110 bool ResolveKnownElts) {
// NOTE(review): the recursion-depth guard condition (line 7111) is missing
// from this extract - confirm against the full source.
7112 return false; // Limit search depth.
7113
7114 EVT VT = Op.getValueType();
7115 if (!VT.isSimple() || !VT.isVector())
7116 return false;
7117
// Prefer the genuine target-shuffle decode; zeroable info comes back as
// bitmasks and is optionally folded into the mask as sentinels.
7118 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
7119 if (ResolveKnownElts)
7120 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
7121 return true;
7122 }
7123 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
7124 ResolveKnownElts)) {
7125 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
7126 return true;
7127 }
7128 return false;
7129}
7130
// Convenience overload: decode without returning the known undef/zero
// bitmasks (they are computed and discarded).
7131static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
// NOTE(review): parameter lines 7132-7133 are missing from this extract;
// from the forwarding call below they are the Inputs and Mask output
// vectors - confirm against the full source.
7134 const SelectionDAG &DAG, unsigned Depth,
7135 bool ResolveKnownElts) {
7136 APInt KnownUndef, KnownZero;
7137 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
7138 KnownZero, DAG, Depth, ResolveKnownElts);
7139}
7140
// Convenience overload: all elements demanded, default depth 0 and sentinel
// resolution enabled.
// NOTE(review): the signature's leading lines (7141-7142) are missing from
// this extract; from the forwarding call below the visible parameters are
// preceded by (SDValue Op, Inputs, Mask) - confirm against the full source.
7143 const SelectionDAG &DAG, unsigned Depth = 0,
7144 bool ResolveKnownElts = true) {
7145 EVT VT = Op.getValueType();
7146 if (!VT.isSimple() || !VT.isVector())
7147 return false;
7148
7149 unsigned NumElts = Op.getValueType().getVectorNumElements();
7150 APInt DemandedElts = APInt::getAllOnes(NumElts);
7151 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
7152 ResolveKnownElts);
7153}
7154
7155// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
// Builds a VBROADCAST_LOAD / SUBV_BROADCAST_LOAD reading MemVT at byte
// Offset from Mem's base pointer, chaining it to Mem's memory ordering.
// Returns an empty SDValue if the memop is unsuitable.
7156static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
7157 EVT MemVT, MemSDNode *Mem, unsigned Offset,
7158 SelectionDAG &DAG) {
7159 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
7160 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
7161 "Unknown broadcast load type");
7162
7163 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
7164 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
7165 return SDValue();
7166
7167 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
// NOTE(review): line 7168 is missing from this extract; presumably the
// TypeSize/offset argument completing this call - confirm against the full
// source.
7169 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7170 SDValue Ops[] = {Mem->getChain(), Ptr};
7171 SDValue BcstLd = DAG.getMemIntrinsicNode(
7172 Opcode, DL, Tys, Ops, MemVT,
// NOTE(review): line 7173 is missing from this extract; presumably the
// getMachineMemOperand(...) call that these arguments belong to - confirm
// against the full source.
7174 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
// Keep other loads/stores correctly ordered relative to the original memop.
7175 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
7176 return BcstLd;
7177}
7178
7179/// Returns the scalar element that will make up the i'th
7180/// element of the result of the vector shuffle.
/// Recurses through shuffles, subvector ops and bitcasts; returns an empty
/// SDValue when the element cannot be traced to a single scalar.
7181static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
7182 SelectionDAG &DAG, unsigned Depth) {
// NOTE(review): the depth-limit condition (line 7183) is missing from this
// extract - confirm against the full source.
7184 return SDValue(); // Limit search depth.
7185
7186 EVT VT = Op.getValueType();
7187 unsigned Opcode = Op.getOpcode();
7188 unsigned NumElems = VT.getVectorNumElements();
7189
7190 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
7191 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
7192 int Elt = SV->getMaskElt(Index);
7193
7194 if (Elt < 0)
7195 return DAG.getUNDEF(VT.getVectorElementType());
7196
// Mask indices >= NumElems refer to the second shuffle operand.
7197 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
7198 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
7199 }
7200
7201 // Recurse into target specific vector shuffles to find scalars.
7202 if (isTargetShuffle(Opcode)) {
7203 MVT ShufVT = VT.getSimpleVT();
7204 MVT ShufSVT = ShufVT.getVectorElementType();
7205 int NumElems = (int)ShufVT.getVectorNumElements();
7206 SmallVector<int, 16> ShuffleMask;
// NOTE(review): line 7207 is missing from this extract; presumably the
// SmallVector<SDValue, 16> ShuffleOps declaration - confirm against the
// full source.
7208 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
7209 return SDValue();
7210
7211 int Elt = ShuffleMask[Index];
7212 if (Elt == SM_SentinelZero)
7213 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
7214 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
7215 if (Elt == SM_SentinelUndef)
7216 return DAG.getUNDEF(ShufSVT);
7217
7218 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
7219 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
7220 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
7221 }
7222
7223 // Recurse into insert_subvector base/sub vector to find scalars.
7224 if (Opcode == ISD::INSERT_SUBVECTOR) {
7225 SDValue Vec = Op.getOperand(0);
7226 SDValue Sub = Op.getOperand(1);
7227 uint64_t SubIdx = Op.getConstantOperandVal(2);
7228 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
7229
7230 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
7231 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
7232 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
7233 }
7234
7235 // Recurse into concat_vectors sub vector to find scalars.
7236 if (Opcode == ISD::CONCAT_VECTORS) {
7237 EVT SubVT = Op.getOperand(0).getValueType();
7238 unsigned NumSubElts = SubVT.getVectorNumElements();
7239 uint64_t SubIdx = Index / NumSubElts;
7240 uint64_t SubElt = Index % NumSubElts;
7241 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
7242 }
7243
7244 // Recurse into extract_subvector src vector to find scalars.
7245 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
7246 SDValue Src = Op.getOperand(0);
7247 uint64_t SrcIdx = Op.getConstantOperandVal(1);
7248 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
7249 }
7250
7251 // We only peek through bitcasts of the same vector width.
7252 if (Opcode == ISD::BITCAST) {
7253 SDValue Src = Op.getOperand(0);
7254 EVT SrcVT = Src.getValueType();
7255 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
7256 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
7257 return SDValue();
7258 }
7259
7260 // Actual nodes that may contain scalar elements
7261
7262 // For insert_vector_elt - either return the index matching scalar or recurse
7263 // into the base vector.
7264 if (Opcode == ISD::INSERT_VECTOR_ELT &&
7265 isa<ConstantSDNode>(Op.getOperand(2))) {
7266 if (Op.getConstantOperandAPInt(2) == Index)
7267 return Op.getOperand(1);
7268 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
7269 }
7270
7271 if (Opcode == ISD::SCALAR_TO_VECTOR)
7272 return (Index == 0) ? Op.getOperand(0)
7273 : DAG.getUNDEF(VT.getVectorElementType());
7274
7275 if (Opcode == ISD::BUILD_VECTOR)
7276 return Op.getOperand(Index);
7277
7278 return SDValue();
7279}
7280
7281// Use PINSRB/PINSRW/PINSRD to create a build vector.
7283 const APInt &NonZeroMask,
7284 unsigned NumNonZero, unsigned NumZero,
7285 SelectionDAG &DAG,
7286 const X86Subtarget &Subtarget) {
7287 MVT VT = Op.getSimpleValueType();
7288 unsigned NumElts = VT.getVectorNumElements();
7289 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
7290 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
7291 "Illegal vector insertion");
7292
7293 SDValue V;
7294 bool First = true;
7295
7296 for (unsigned i = 0; i < NumElts; ++i) {
7297 bool IsNonZero = NonZeroMask[i];
7298 if (!IsNonZero)
7299 continue;
7300
7301 // If the build vector contains zeros or our first insertion is not the
7302 // first index then insert into zero vector to break any register
7303 // dependency else use SCALAR_TO_VECTOR.
7304 if (First) {
7305 First = false;
7306 if (NumZero || 0 != i)
7307 V = getZeroVector(VT, Subtarget, DAG, DL);
7308 else {
7309 assert(0 == i && "Expected insertion into zero-index");
7310 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
7311 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
7312 V = DAG.getBitcast(VT, V);
7313 continue;
7314 }
7315 }
7316 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
7317 DAG.getVectorIdxConstant(i, DL));
7318 }
7319
7320 return V;
7321}
7322
7323/// Custom lower build_vector of v16i8.
7325 const APInt &NonZeroMask,
7326 unsigned NumNonZero, unsigned NumZero,
7327 SelectionDAG &DAG,
7328 const X86Subtarget &Subtarget) {
7329 if (NumNonZero > 8 && !Subtarget.hasSSE41())
7330 return SDValue();
7331
7332 // SSE4.1 - use PINSRB to insert each byte directly.
7333 if (Subtarget.hasSSE41())
7334 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
7335 DAG, Subtarget);
7336
7337 SDValue V;
7338
7339 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
7340 // If both the lowest 16-bits are non-zero, then convert to MOVD.
7341 if (!NonZeroMask.extractBits(2, 0).isZero() &&
7342 !NonZeroMask.extractBits(2, 2).isZero()) {
7343 for (unsigned I = 0; I != 4; ++I) {
7344 if (!NonZeroMask[I])
7345 continue;
7346 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
7347 if (I != 0)
7348 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
7349 DAG.getConstant(I * 8, DL, MVT::i8));
7350 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
7351 }
7352 assert(V && "Failed to fold v16i8 vector to zero");
7353 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
7354 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
7355 V = DAG.getBitcast(MVT::v8i16, V);
7356 }
7357 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
7358 bool ThisIsNonZero = NonZeroMask[i];
7359 bool NextIsNonZero = NonZeroMask[i + 1];
7360 if (!ThisIsNonZero && !NextIsNonZero)
7361 continue;
7362
7363 SDValue Elt;
7364 if (ThisIsNonZero) {
7365 if (NumZero || NextIsNonZero)
7366 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
7367 else
7368 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
7369 }
7370
7371 if (NextIsNonZero) {
7372 SDValue NextElt = Op.getOperand(i + 1);
7373 if (i == 0 && NumZero)
7374 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
7375 else
7376 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
7377 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
7378 DAG.getConstant(8, DL, MVT::i8));
7379 if (ThisIsNonZero)
7380 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
7381 else
7382 Elt = NextElt;
7383 }
7384
7385 // If our first insertion is not the first index or zeros are needed, then
7386 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
7387 // elements undefined).
7388 if (!V) {
7389 if (i != 0 || NumZero)
7390 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
7391 else {
7392 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
7393 V = DAG.getBitcast(MVT::v8i16, V);
7394 continue;
7395 }
7396 }
7397 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
7398 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
7399 DAG.getVectorIdxConstant(i / 2, DL));
7400 }
7401
7402 return DAG.getBitcast(MVT::v16i8, V);
7403}
7404
7405/// Custom lower build_vector of v8i16.
7407 const APInt &NonZeroMask,
7408 unsigned NumNonZero, unsigned NumZero,
7409 SelectionDAG &DAG,
7410 const X86Subtarget &Subtarget) {
7411 if (NumNonZero > 4 && !Subtarget.hasSSE41())
7412 return SDValue();
7413
7414 // Use PINSRW to insert each byte directly.
7415 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
7416 Subtarget);
7417}
7418
7419/// Custom lower build_vector of v4i32 or v4f32.
7421 SelectionDAG &DAG,
7422 const X86Subtarget &Subtarget) {
7423 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7424 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7425 // Because we're creating a less complicated build vector here, we may enable
7426 // further folding of the MOVDDUP via shuffle transforms.
7427 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7428 Op.getOperand(0) == Op.getOperand(2) &&
7429 Op.getOperand(1) == Op.getOperand(3) &&
7430 Op.getOperand(0) != Op.getOperand(1)) {
7431 MVT VT = Op.getSimpleValueType();
7432 MVT EltVT = VT.getVectorElementType();
7433 // Create a new build vector with the first 2 elements followed by undef
7434 // padding, bitcast to v2f64, duplicate, and bitcast back.
7435 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7436 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7437 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7438 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7439 return DAG.getBitcast(VT, Dup);
7440 }
7441
7442 // Find all zeroable elements.
7443 std::bitset<4> Zeroable, Undefs;
7444 for (int i = 0; i < 4; ++i) {
7445 SDValue Elt = Op.getOperand(i);
7446 Undefs[i] = Elt.isUndef();
7447 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7448 }
7449 assert(Zeroable.size() - Zeroable.count() > 1 &&
7450 "We expect at least two non-zero elements!");
7451
7452 // We only know how to deal with build_vector nodes where elements are either
7453 // zeroable or extract_vector_elt with constant index.
7454 SDValue FirstNonZero;
7455 unsigned FirstNonZeroIdx;
7456 for (unsigned i = 0; i < 4; ++i) {
7457 if (Zeroable[i])
7458 continue;
7459 SDValue Elt = Op.getOperand(i);
7460 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7462 return SDValue();
7463 // Make sure that this node is extracting from a 128-bit vector.
7464 MVT VT = Elt.getOperand(0).getSimpleValueType();
7465 if (!VT.is128BitVector())
7466 return SDValue();
7467 if (!FirstNonZero.getNode()) {
7468 FirstNonZero = Elt;
7469 FirstNonZeroIdx = i;
7470 }
7471 }
7472
7473 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7474 SDValue V1 = FirstNonZero.getOperand(0);
7475 MVT VT = V1.getSimpleValueType();
7476
7477 // See if this build_vector can be lowered as a blend with zero.
7478 SDValue Elt;
7479 unsigned EltMaskIdx, EltIdx;
7480 int Mask[4];
7481 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7482 if (Zeroable[EltIdx]) {
7483 // The zero vector will be on the right hand side.
7484 Mask[EltIdx] = EltIdx+4;
7485 continue;
7486 }
7487
7488 Elt = Op->getOperand(EltIdx);
7489 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7490 EltMaskIdx = Elt.getConstantOperandVal(1);
7491 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7492 break;
7493 Mask[EltIdx] = EltIdx;
7494 }
7495
7496 if (EltIdx == 4) {
7497 // Let the shuffle legalizer deal with blend operations.
7498 SDValue VZeroOrUndef = (Zeroable == Undefs)
7499 ? DAG.getUNDEF(VT)
7500 : getZeroVector(VT, Subtarget, DAG, DL);
7501 if (V1.getSimpleValueType() != VT)
7502 V1 = DAG.getBitcast(VT, V1);
7503 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7504 }
7505
7506 // See if we can lower this build_vector to a INSERTPS.
7507 if (!Subtarget.hasSSE41())
7508 return SDValue();
7509
7510 SDValue V2 = Elt.getOperand(0);
7511 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7512 V1 = SDValue();
7513
7514 bool CanFold = true;
7515 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7516 if (Zeroable[i])
7517 continue;
7518
7519 SDValue Current = Op->getOperand(i);
7520 SDValue SrcVector = Current->getOperand(0);
7521 if (!V1.getNode())
7522 V1 = SrcVector;
7523 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7524 }
7525
7526 if (!CanFold)
7527 return SDValue();
7528
7529 assert(V1.getNode() && "Expected at least two non-zero elements!");
7530 if (V1.getSimpleValueType() != MVT::v4f32)
7531 V1 = DAG.getBitcast(MVT::v4f32, V1);
7532 if (V2.getSimpleValueType() != MVT::v4f32)
7533 V2 = DAG.getBitcast(MVT::v4f32, V2);
7534
7535 // Ok, we can emit an INSERTPS instruction.
7536 unsigned ZMask = Zeroable.to_ulong();
7537
7538 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7539 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7540 SDValue Result =
7541 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7542 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7543 return DAG.getBitcast(VT, Result);
7544}
7545
7546/// Return a vector logical shift node.
7547static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7548 SelectionDAG &DAG, const TargetLowering &TLI,
7549 const SDLoc &dl) {
7550 assert(VT.is128BitVector() && "Unknown type for VShift");
7551 MVT ShVT = MVT::v16i8;
7552 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7553 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7554 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7555 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7556 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7557}
7558
7560 SelectionDAG &DAG) {
7561
7562 // Check if the scalar load can be widened into a vector load. And if
7563 // the address is "base + cst" see if the cst can be "absorbed" into
7564 // the shuffle mask.
7566 SDValue Ptr = LD->getBasePtr();
7567 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7568 return SDValue();
7569 EVT PVT = LD->getValueType(0);
7570 if (PVT != MVT::i32 && PVT != MVT::f32)
7571 return SDValue();
7572
7573 int FI = -1;
7574 int64_t Offset = 0;
7575 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7576 FI = FINode->getIndex();
7577 Offset = 0;
7578 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7580 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7582 Ptr = Ptr.getOperand(0);
7583 } else {
7584 return SDValue();
7585 }
7586
7587 // FIXME: 256-bit vector instructions don't require a strict alignment,
7588 // improve this code to support it better.
7589 Align RequiredAlign(VT.getSizeInBits() / 8);
7590 SDValue Chain = LD->getChain();
7591 // Make sure the stack object alignment is at least 16 or 32.
7593 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7594 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7595 if (MFI.isFixedObjectIndex(FI)) {
7596 // Can't change the alignment. FIXME: It's possible to compute
7597 // the exact stack offset and reference FI + adjust offset instead.
7598 // If someone *really* cares about this. That's the way to implement it.
7599 return SDValue();
7600 } else {
7601 MFI.setObjectAlignment(FI, RequiredAlign);
7602 }
7603 }
7604
7605 // (Offset % 16 or 32) must be multiple of 4. Then address is then
7606 // Ptr + (Offset & ~15).
7607 if (Offset < 0)
7608 return SDValue();
7609 if ((Offset % RequiredAlign.value()) & 3)
7610 return SDValue();
7611 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7612 if (StartOffset) {
7613 SDLoc DL(Ptr);
7614 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7615 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7616 }
7617
7618 int EltNo = (Offset - StartOffset) >> 2;
7619 unsigned NumElems = VT.getVectorNumElements();
7620
7621 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7622 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7623 LD->getPointerInfo().getWithOffset(StartOffset));
7624
7625 SmallVector<int, 8> Mask(NumElems, EltNo);
7626
7627 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7628 }
7629
7630 return SDValue();
7631}
7632
7633// Recurse to find a LoadSDNode source and the accumulated ByteOffest.
7634static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7635 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7636 auto *BaseLd = cast<LoadSDNode>(Elt);
7637 if (!BaseLd->isSimple())
7638 return false;
7639 Ld = BaseLd;
7640 ByteOffset = 0;
7641 return true;
7642 }
7643
7644 switch (Elt.getOpcode()) {
7645 case ISD::BITCAST:
7646 case ISD::TRUNCATE:
7648 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7649 case ISD::SRL:
7650 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7651 uint64_t Amt = AmtC->getZExtValue();
7652 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7653 ByteOffset += Amt / 8;
7654 return true;
7655 }
7656 }
7657 break;
7659 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7660 SDValue Src = Elt.getOperand(0);
7661 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7662 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7663 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7664 findEltLoadSrc(Src, Ld, ByteOffset)) {
7665 uint64_t Idx = IdxC->getZExtValue();
7666 ByteOffset += Idx * (SrcSizeInBits / 8);
7667 return true;
7668 }
7669 }
7670 break;
7671 }
7672
7673 return false;
7674}
7675
7676/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7677/// elements can be replaced by a single large load which has the same value as
7678/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7679///
7680/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7682 const SDLoc &DL, SelectionDAG &DAG,
7683 const X86Subtarget &Subtarget,
7684 bool IsAfterLegalize,
7685 unsigned Depth = 0) {
7687 return SDValue(); // Limit search depth.
7688 if ((VT.getScalarSizeInBits() % 8) != 0)
7689 return SDValue();
7690
7691 unsigned NumElems = Elts.size();
7692
7693 int LastLoadedElt = -1;
7694 APInt LoadMask = APInt::getZero(NumElems);
7695 APInt ZeroMask = APInt::getZero(NumElems);
7696 APInt UndefMask = APInt::getZero(NumElems);
7697
7698 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7699 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7700
7701 // For each element in the initializer, see if we've found a load, zero or an
7702 // undef.
7703 for (unsigned i = 0; i < NumElems; ++i) {
7704 SDValue Elt = peekThroughBitcasts(Elts[i]);
7705 if (!Elt.getNode())
7706 return SDValue();
7707 if (Elt.isUndef()) {
7708 UndefMask.setBit(i);
7709 continue;
7710 }
7712 ZeroMask.setBit(i);
7713 continue;
7714 }
7715
7716 // Each loaded element must be the correct fractional portion of the
7717 // requested vector load.
7718 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7719 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7720 return SDValue();
7721
7722 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7723 return SDValue();
7724 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7725 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7726 return SDValue();
7727
7728 LoadMask.setBit(i);
7729 LastLoadedElt = i;
7730 }
7731 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7732 NumElems &&
7733 "Incomplete element masks");
7734
7735 // Handle Special Cases - all undef or undef/zero.
7736 if (UndefMask.popcount() == NumElems)
7737 return DAG.getUNDEF(VT);
7738 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7739 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7740 : DAG.getConstantFP(0.0, DL, VT);
7741
7742 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7743 int FirstLoadedElt = LoadMask.countr_zero();
7744 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7745 EVT EltBaseVT = EltBase.getValueType();
7746 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7747 "Register/Memory size mismatch");
7748 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7749 assert(LDBase && "Did not find base load for merging consecutive loads");
7750 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7751 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7752 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7753 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7754 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7755
7756 // TODO: Support offsetting the base load.
7757 if (ByteOffsets[FirstLoadedElt] != 0)
7758 return SDValue();
7759
7760 // Check to see if the element's load is consecutive to the base load
7761 // or offset from a previous (already checked) load.
7762 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7763 LoadSDNode *Ld = Loads[EltIdx];
7764 int64_t ByteOffset = ByteOffsets[EltIdx];
7765 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7766 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7767 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7768 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7769 }
7770 int Stride = EltIdx - FirstLoadedElt;
7771 if (DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes, Stride))
7772 return true;
7773 // Try again using the memory load size (we might have broken a large load
7774 // into smaller elements), ensure the stride is the full memory load size
7775 // apart and a whole number of elements fit in each memory load.
7776 unsigned BaseMemSizeInBits = Base->getMemoryVT().getSizeInBits();
7777 if (((Stride * BaseSizeInBits) % BaseMemSizeInBits) == 0 &&
7778 (BaseMemSizeInBits % BaseSizeInBits) == 0) {
7779 unsigned Scale = BaseMemSizeInBits / BaseSizeInBits;
7780 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseMemSizeInBits / 8,
7781 Stride / Scale);
7782 }
7783 return false;
7784 };
7785
7786 // Consecutive loads can contain UNDEFS but not ZERO elements.
7787 // Consecutive loads with UNDEFs and ZEROs elements require a
7788 // an additional shuffle stage to clear the ZERO elements.
7789 bool IsConsecutiveLoad = true;
7790 bool IsConsecutiveLoadWithZeros = true;
7791 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7792 if (LoadMask[i]) {
7793 if (!CheckConsecutiveLoad(LDBase, i)) {
7794 IsConsecutiveLoad = false;
7795 IsConsecutiveLoadWithZeros = false;
7796 break;
7797 }
7798 } else if (ZeroMask[i]) {
7799 IsConsecutiveLoad = false;
7800 }
7801 }
7802
7803 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7804 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7805 assert(LDBase->isSimple() &&
7806 "Cannot merge volatile or atomic loads.");
7807 SDValue NewLd =
7808 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7809 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7810 for (auto *LD : Loads)
7811 if (LD)
7812 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7813 return NewLd;
7814 };
7815
7816 // Check if the base load is entirely dereferenceable.
7817 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7818 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7819
7820 // LOAD - all consecutive load/undefs (must start/end with a load or be
7821 // entirely dereferenceable). If we have found an entire vector of loads and
7822 // undefs, then return a large load of the entire vector width starting at the
7823 // base pointer. If the vector contains zeros, then attempt to shuffle those
7824 // elements.
7825 if (FirstLoadedElt == 0 &&
7826 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7827 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7828 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7829 return SDValue();
7830
7831 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7832 // will lower to regular temporal loads and use the cache.
7833 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7834 VT.is256BitVector() && !Subtarget.hasInt256())
7835 return SDValue();
7836
7837 if (NumElems == 1)
7838 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7839
7840 if (!ZeroMask)
7841 return CreateLoad(VT, LDBase);
7842
7843 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7844 // vector and a zero vector to clear out the zero elements.
7845 if (!IsAfterLegalize && VT.isVector()) {
7846 unsigned NumMaskElts = VT.getVectorNumElements();
7847 if ((NumMaskElts % NumElems) == 0) {
7848 unsigned Scale = NumMaskElts / NumElems;
7849 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7850 for (unsigned i = 0; i < NumElems; ++i) {
7851 if (UndefMask[i])
7852 continue;
7853 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7854 for (unsigned j = 0; j != Scale; ++j)
7855 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7856 }
7857 SDValue V = CreateLoad(VT, LDBase);
7858 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7859 : DAG.getConstantFP(0.0, DL, VT);
7860 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7861 }
7862 }
7863 }
7864
7865 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7866 if (VT.is256BitVector() || VT.is512BitVector()) {
7867 unsigned HalfNumElems = NumElems / 2;
7868 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7869 EVT HalfVT =
7870 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7871 SDValue HalfLD =
7872 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7873 DAG, Subtarget, IsAfterLegalize, Depth + 1);
7874 if (HalfLD)
7875 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7876 HalfLD, DAG.getVectorIdxConstant(0, DL));
7877 }
7878 }
7879
7880 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7881 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7882 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7883 LoadSizeInBits == 64) &&
7884 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7885 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7886 : MVT::getIntegerVT(LoadSizeInBits);
7887 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7888 // Allow v4f32 on SSE1 only targets.
7889 // FIXME: Add more isel patterns so we can just use VT directly.
7890 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7891 VecVT = MVT::v4f32;
7892 if (TLI.isTypeLegal(VecVT)) {
7893 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7894 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7895 SDValue ResNode = DAG.getMemIntrinsicNode(
7896 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7898 for (auto *LD : Loads)
7899 if (LD)
7900 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7901 return DAG.getBitcast(VT, ResNode);
7902 }
7903 }
7904
7905 // BROADCAST - match the smallest possible repetition pattern, load that
7906 // scalar/subvector element and then broadcast to the entire vector.
7907 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7908 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7909 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7910 unsigned RepeatSize = SubElems * BaseSizeInBits;
7911 unsigned ScalarSize = std::min(RepeatSize, 64u);
7912 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7913 continue;
7914
7915 // Don't attempt a 1:N subvector broadcast - it should be caught by
7916 // combineConcatVectorOps, else will cause infinite loops.
7917 if (RepeatSize > ScalarSize && SubElems == 1)
7918 continue;
7919
7920 bool Match = true;
7921 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7922 for (unsigned i = 0; i != NumElems && Match; ++i) {
7923 if (!LoadMask[i])
7924 continue;
7925 SDValue Elt = peekThroughBitcasts(Elts[i]);
7926 if (RepeatedLoads[i % SubElems].isUndef())
7927 RepeatedLoads[i % SubElems] = Elt;
7928 else
7929 Match &= (RepeatedLoads[i % SubElems] == Elt);
7930 }
7931
7932 // We must have loads at both ends of the repetition.
7933 Match &= !RepeatedLoads.front().isUndef();
7934 Match &= !RepeatedLoads.back().isUndef();
7935 if (!Match)
7936 continue;
7937
7938 EVT RepeatVT =
7939 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7940 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7941 : EVT::getFloatingPointVT(ScalarSize);
7942 if (RepeatSize > ScalarSize)
7943 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7944 RepeatSize / ScalarSize);
7945 EVT BroadcastVT =
7946 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7947 VT.getSizeInBits() / ScalarSize);
7948 if (TLI.isTypeLegal(BroadcastVT)) {
7949 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7950 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize,
7951 Depth + 1)) {
7952 SDValue Broadcast = RepeatLoad;
7953 if (RepeatSize > ScalarSize) {
7954 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7955 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7956 } else {
7957 if (!Subtarget.hasAVX2() &&
7959 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7960 Subtarget,
7961 /*AssumeSingleUse=*/true))
7962 return SDValue();
7963 Broadcast =
7964 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7965 }
7966 return DAG.getBitcast(VT, Broadcast);
7967 }
7968 }
7969 }
7970 }
7971
7972 // REVERSE - attempt to match the loads in reverse and then shuffle back.
7973 // TODO: Do this for any permute or mismatching element counts.
7974 if (Depth == 0 && ZeroMask.isZero() && UndefMask.isZero() &&
7975 TLI.isTypeLegal(VT) && VT.isVector() &&
7976 NumElems == VT.getVectorNumElements()) {
7977 SmallVector<SDValue, 16> ReverseElts(Elts.rbegin(), Elts.rend());
7979 VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) {
7980 SmallVector<int, 16> ReverseMask(NumElems);
7981 std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0);
7982 return DAG.getVectorShuffle(VT, DL, RevLd, DAG.getUNDEF(VT), ReverseMask);
7983 }
7984 }
7985
7986 return SDValue();
7987}
7988
7989// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
7990// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7991// are consecutive, non-overlapping, and in the right order.
7993 SelectionDAG &DAG,
7994 const X86Subtarget &Subtarget,
7995 bool IsAfterLegalize) {
7997 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7998 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7999 Elts.push_back(Elt);
8000 continue;
8001 }
8002 return SDValue();
8003 }
8004 assert(Elts.size() == VT.getVectorNumElements());
8005 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
8006 IsAfterLegalize);
8007}
8008
8010 const APInt &Undefs, LLVMContext &C) {
8011 unsigned ScalarSize = VT.getScalarSizeInBits();
8012 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
8013
8014 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
8015 if (VT.isFloatingPoint()) {
8016 if (ScalarSize == 16)
8017 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
8018 if (ScalarSize == 32)
8019 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8020 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8021 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8022 }
8023 return Constant::getIntegerValue(Ty, Val);
8024 };
8025
8026 SmallVector<Constant *, 32> ConstantVec;
8027 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
8028 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
8029 : getConstantScalar(Bits[I]));
8030
8031 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8032}
8033
8034static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
8035 unsigned SplatBitSize, LLVMContext &C) {
8036 unsigned ScalarSize = VT.getScalarSizeInBits();
8037
8038 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
8039 if (VT.isFloatingPoint()) {
8040 if (ScalarSize == 16)
8041 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
8042 if (ScalarSize == 32)
8043 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8044 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8045 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8046 }
8047 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
8048 };
8049
8050 if (ScalarSize == SplatBitSize)
8051 return getConstantScalar(SplatValue);
8052
8053 unsigned NumElm = SplatBitSize / ScalarSize;
8054 SmallVector<Constant *, 32> ConstantVec;
8055 for (unsigned I = 0; I != NumElm; ++I) {
8056 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
8057 ConstantVec.push_back(getConstantScalar(Val));
8058 }
8059 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8060}
8061
8063 for (auto *U : N->users()) {
8064 unsigned Opc = U->getOpcode();
8065 // VPERMV/VPERMV3 shuffles can never fold their index operands.
8066 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
8067 return false;
8068 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
8069 return false;
8070 if (isTargetShuffle(Opc))
8071 return true;
8072 if (Opc == ISD::BITCAST) // Ignore bitcasts
8073 return isFoldableUseOfShuffle(U);
8074 if (N->hasOneUse()) {
8075 // TODO, there may be some general way to know if a SDNode can
8076 // be folded. We now only know whether an MI is foldable.
8077 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
8078 return false;
8079 return true;
8080 }
8081 }
8082 return false;
8083}
8084
8085// If the node has a single use by a VSELECT then AVX512 targets may be able to
8086// fold as a predicated instruction.
8087static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
8088 unsigned SizeInBits = V.getValueSizeInBits();
8089 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
8090 (SizeInBits >= 128 && Subtarget.hasVLX())) {
8091 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
8092 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
8093 return true;
8094 }
8095 }
8096 return false;
8097}
8098
8099/// Attempt to use the vbroadcast instruction to generate a splat value
8100/// from a splat BUILD_VECTOR which uses:
8101/// a. A single scalar load, or a constant.
8102/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
8103///
8104/// The VBROADCAST node is returned when a pattern is found,
8105/// or SDValue() otherwise.
8107 const SDLoc &dl,
8108 const X86Subtarget &Subtarget,
8109 SelectionDAG &DAG) {
8110 // VBROADCAST requires AVX.
8111 // TODO: Splats could be generated for non-AVX CPUs using SSE
8112 // instructions, but there's less potential gain for only 128-bit vectors.
8113 if (!Subtarget.hasAVX())
8114 return SDValue();
8115
8116 MVT VT = BVOp->getSimpleValueType(0);
8117 unsigned NumElts = VT.getVectorNumElements();
8118 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8119 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
8120 "Unsupported vector type for broadcast.");
8121
8122 // See if the build vector is a repeating sequence of scalars (inc. splat).
// Ld is set only for a true splat (repeating sequence of length 1).
8123 SDValue Ld;
8124 BitVector UndefElements;
8125 SmallVector<SDValue, 16> Sequence;
8126 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
8127 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
8128 if (Sequence.size() == 1)
8129 Ld = Sequence[0];
8130 }
8131
8132 // Attempt to use VBROADCASTM
8133 // From this pattern:
8134 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
8135 // b. t1 = (build_vector t0 t0)
8136 //
8137 // Create (VBROADCASTM v2i1 X)
8138 if (!Sequence.empty() && Subtarget.hasCDI()) {
8139 // If not a splat, are the upper sequence values zeroable?
8140 unsigned SeqLen = Sequence.size();
8141 bool UpperZeroOrUndef =
8142 SeqLen == 1 ||
8143 llvm::all_of(ArrayRef(Sequence).drop_front(),
8144 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
8145 SDValue Op0 = Sequence[0];
8146 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
8147 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
8148 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
8149 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
8150 ? Op0.getOperand(0)
8151 : Op0.getOperand(0).getOperand(0);
8152 MVT MaskVT = BOperand.getSimpleValueType();
8153 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
8154 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
8155 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
8156 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
8157 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
// Without VLX, VBROADCASTM only exists at 512 bits: broadcast wide and
// extract the narrow result afterwards.
8158 unsigned Scale = 512 / VT.getSizeInBits();
8159 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
8160 }
8161 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
8162 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
8163 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
8164 return DAG.getBitcast(VT, Bcst);
8165 }
8166 }
8167 }
8168
// No single repeated scalar (or at most one defined element): check whether a
// wider repeated constant pattern can instead be loaded from the constant
// pool and broadcast with a larger element/subvector type.
8169 unsigned NumUndefElts = UndefElements.count();
8170 if (!Ld || (NumElts - NumUndefElts) <= 1) {
8171 APInt SplatValue, Undef;
8172 unsigned SplatBitSize;
8173 bool HasUndef;
8174 // Check if this is a repeated constant pattern suitable for broadcasting.
8175 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
8176 SplatBitSize > VT.getScalarSizeInBits() &&
8177 SplatBitSize < VT.getSizeInBits()) {
8178 // Avoid replacing with broadcast when it's a use of a shuffle
8179 // instruction to preserve the present custom lowering of shuffles.
8180 if (isFoldableUseOfShuffle(BVOp))
8181 return SDValue();
8182 // replace BUILD_VECTOR with broadcast of the repeated constants.
8183 LLVMContext *Ctx = DAG.getContext();
8184 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
8185 if (SplatBitSize == 32 || SplatBitSize == 64 ||
8186 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
8187 // Load the constant scalar/subvector and broadcast it.
8188 MVT CVT = MVT::getIntegerVT(SplatBitSize);
8189 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
8190 SDValue CP = DAG.getConstantPool(C, PVT);
8191 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
8192
8193 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
8194 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
8195 SDValue Ops[] = {DAG.getEntryNode(), CP};
8196 MachinePointerInfo MPI =
8198 SDValue Brdcst =
8199 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
8200 MPI, Alignment, MachineMemOperand::MOLoad);
8201 return DAG.getBitcast(VT, Brdcst);
8202 }
8203 if (SplatBitSize > 64) {
8204 // Load the vector of constants and broadcast it.
8205 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
8206 SDValue VCP = DAG.getConstantPool(VecC, PVT);
8207 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
8208 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
8209 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
8210 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8211 SDValue Ops[] = {DAG.getEntryNode(), VCP};
8212 MachinePointerInfo MPI =
8214 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
8215 Ops, VVT, MPI, Alignment,
8217 }
8218 }
8219
8220 // If we are moving a scalar into a vector (Ld must be set and all elements
8221 // but 1 are undef) and that operation is not obviously supported by
8222 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
8223 // That's better than general shuffling and may eliminate a load to GPR and
8224 // move from scalar to vector register.
8225 if (!Ld || NumElts - NumUndefElts != 1)
8226 return SDValue();
8227 unsigned ScalarSize = Ld.getValueSizeInBits();
// Bail out when element 0 is the defined one and the scalar is 32/64 bits:
// vmovd/vmovq/vmovss/vmovsd already handle that scalar-to-vector move well.
8228 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
8229 return SDValue();
8230 }
8231
// From here on 'Ld' is the single repeated scalar we try to broadcast.
8232 bool ConstSplatVal =
8233 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
8234 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
8235
8236 // TODO: Handle broadcasts of non-constant sequences.
8237
8238 // Make sure that all of the users of a non-constant load are from the
8239 // BUILD_VECTOR node.
8240 // FIXME: Is the use count needed for non-constant, non-load case?
8241 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
8242 return SDValue();
8243
8244 unsigned ScalarSize = Ld.getValueSizeInBits();
8245 bool IsGE256 = (VT.getSizeInBits() >= 256);
8246
8247 // When optimizing for size, generate up to 5 extra bytes for a broadcast
8248 // instruction to save 8 or more bytes of constant pool data.
8249 // TODO: If multiple splats are generated to load the same constant,
8250 // it may be detrimental to overall size. There needs to be a way to detect
8251 // that condition to know if this is truly a size win.
8252 bool OptForSize = DAG.shouldOptForSize();
8253
8254 // Handle broadcasting a single constant scalar from the constant pool
8255 // into a vector.
8256 // On Sandybridge (no AVX2), it is still better to load a constant vector
8257 // from the constant pool and not to broadcast it from a scalar.
8258 // But override that restriction when optimizing for size.
8259 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
8260 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
8261 EVT CVT = Ld.getValueType();
8262 assert(!CVT.isVector() && "Must not broadcast a vector type");
8263
8264 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
8265 // For size optimization, also splat v2f64 and v2i64, and for size opt
8266 // with AVX2, also splat i8 and i16.
8267 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
8268 if (ScalarSize == 32 ||
8269 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
8270 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
8271 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
8272 const Constant *C = nullptr;
8274 C = CI->getConstantIntValue();
8276 C = CF->getConstantFPValue();
8277
8278 assert(C && "Invalid constant type");
8279
8280 SDValue CP =
8282 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
8283
8284 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8285 SDValue Ops[] = {DAG.getEntryNode(), CP};
8286 MachinePointerInfo MPI =
8288 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
8289 MPI, Alignment, MachineMemOperand::MOLoad);
8290 }
8291 }
8292
8293 // Handle AVX2 in-register broadcasts.
8294 if (!IsLoad && Subtarget.hasInt256() &&
8295 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
8296 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8297
8298 // The scalar source must be a normal load.
8299 if (!IsLoad)
8300 return SDValue();
8301
8302 // Make sure the non-chain result is only used by this build vector.
8303 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
8304 return SDValue();
8305
8306 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8307 (Subtarget.hasVLX() && ScalarSize == 64)) {
// Fold the load directly into a VBROADCAST_LOAD and reroute the old load's
// chain users to the new node.
8308 auto *LN = cast<LoadSDNode>(Ld);
8309 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8310 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8311 SDValue BCast =
8312 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
8313 LN->getMemoryVT(), LN->getMemOperand());
8314 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
8315 return BCast;
8316 }
8317
8318 // The integer check is needed for the 64-bit into 128-bit so it doesn't match
8319 // double since there is no vbroadcastsd xmm
8320 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
8321 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
8322 auto *LN = cast<LoadSDNode>(Ld);
8323 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8324 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8325 SDValue BCast =
8326 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
8327 LN->getMemoryVT(), LN->getMemOperand());
8328 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
8329 return BCast;
8330 }
8331
// In-register broadcast of a 16-bit FP scalar needs FP16 and a >=256-bit
// destination (per the guard below).
8332 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
8333 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8334
8335 // Unsupported broadcast.
8336 return SDValue();
8337}
8338
8339/// For an EXTRACT_VECTOR_ELT with a constant index return the real
8340/// underlying vector and index.
8341///
8342/// Modifies \p ExtractedFromVec to the real vector and returns the real
8343/// index.
8344static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
8345 SDValue ExtIdx) {
8346 int Idx = ExtIdx->getAsZExtVal();
8347 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
8348 return Idx;
8349
8350 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
8351 // lowered this:
8352 // (extract_vector_elt (v8f32 %1), Constant<6>)
8353 // to:
8354 // (extract_vector_elt (vector_shuffle<2,u,u,u>
8355 // (extract_subvector (v8f32 %0), Constant<4>),
8356 // undef)
8357 // Constant<0>)
8358 // In this case the vector is the extract_subvector expression and the index
8359 // is 2, as specified by the shuffle.
8360 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
8361 SDValue ShuffleVec = SVOp->getOperand(0);
8362 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
8363 assert(ShuffleVecVT.getVectorElementType() ==
8364 ExtractedFromVec.getSimpleValueType().getVectorElementType());
8365
8366 int ShuffleIdx = SVOp->getMaskElt(Idx);
8367 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
8368 ExtractedFromVec = ShuffleVec;
8369 return ShuffleIdx;
8370 }
8371 return Idx;
8372}
8373
// Lower a BUILD_VECTOR whose elements are mostly extract_vector_elts from at
// most two same-typed source vectors into a vector_shuffle, followed by at
// most one INSERT_VECTOR_ELT for a single non-extract element.
8375 SelectionDAG &DAG) {
8376 MVT VT = Op.getSimpleValueType();
8377
8378 // Skip if insert_vec_elt is not supported.
8379 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8381 return SDValue();
8382
8383 unsigned NumElems = Op.getNumOperands();
8384 SDValue VecIn1;
8385 SDValue VecIn2;
8386 SmallVector<unsigned, 4> InsertIndices;
8387 SmallVector<int, 8> Mask(NumElems, -1);
8388
// Classify each element: undef/poison (ignored), a non-extract scalar (queued
// for insertion), or an extract feeding the shuffle mask.
8389 for (unsigned i = 0; i != NumElems; ++i) {
8390 unsigned Opc = Op.getOperand(i).getOpcode();
8391
8392 if (Opc == ISD::POISON || Opc == ISD::UNDEF)
8393 continue;
8394
8396 // Quit if more than 1 elements need inserting.
8397 if (InsertIndices.size() > 1)
8398 return SDValue();
8399
8400 InsertIndices.push_back(i);
8401 continue;
8402 }
8403
8404 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
8405 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
8406
8407 // Quit if non-constant index.
8408 if (!isa<ConstantSDNode>(ExtIdx))
8409 return SDValue();
8410 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
8411
8412 // Quit if extracted from vector of different type.
8413 if (ExtractedFromVec.getValueType() != VT)
8414 return SDValue();
8415
8416 if (!VecIn1.getNode())
8417 VecIn1 = ExtractedFromVec;
8418 else if (VecIn1 != ExtractedFromVec) {
8419 if (!VecIn2.getNode())
8420 VecIn2 = ExtractedFromVec;
8421 else if (VecIn2 != ExtractedFromVec)
8422 // Quit if more than 2 vectors to shuffle
8423 return SDValue();
8424 }
8425
8426 if (ExtractedFromVec == VecIn1)
8427 Mask[i] = Idx;
8428 else if (ExtractedFromVec == VecIn2)
8429 Mask[i] = Idx + NumElems;
8430 }
8431
8432 if (!VecIn1.getNode())
8433 return SDValue();
8434
// Use a poison second operand if only one source vector was found.
8435 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getPOISON(VT);
8436 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
8437
8438 for (unsigned Idx : InsertIndices)
8439 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
8440 DAG.getVectorIdxConstant(Idx, DL));
8441
8442 return NV;
8444
8445 // Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
// Strategy: bitcast each bf16 scalar to f16 (with FP16 support) or i16,
// build the vector with that element type, then bitcast the result back.
8447 const X86Subtarget &Subtarget) {
8448 MVT VT = Op.getSimpleValueType();
8449 MVT SVT = Subtarget.hasFP16() ? MVT::f16 : MVT::i16;
8450 MVT IVT = VT.changeVectorElementType(SVT);
// Collect the bitcast of every operand into the replacement operand list.
8452 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
8453 NewOps.push_back(DAG.getBitcast(SVT, Op.getOperand(I)));
8454 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
8455 return DAG.getBitcast(VT, Res);
8456}
8457
8458 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
8460 SelectionDAG &DAG,
8461 const X86Subtarget &Subtarget) {
8462
8463 MVT VT = Op.getSimpleValueType();
8464 assert((VT.getVectorElementType() == MVT::i1) &&
8465 "Unexpected type in LowerBUILD_VECTORvXi1!");
8466 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8467 ISD::isBuildVectorAllOnes(Op.getNode()))
8468 return Op;
8469
// Gather the constant bits into an immediate bitmask, record the indices of
// the non-constant elements, and track whether every element is identical.
8470 uint64_t Immediate = 0;
8471 SmallVector<unsigned, 16> NonConstIdx;
8472 bool IsSplat = true;
8473 bool HasConstElts = false;
8474 int SplatIdx = -1;
8475 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8476 SDValue In = Op.getOperand(idx);
8477 if (In.isUndef())
8478 continue;
8479 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8480 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8481 HasConstElts = true;
8482 } else {
8483 NonConstIdx.push_back(idx);
8484 }
8485 if (SplatIdx < 0)
8486 SplatIdx = idx;
8487 else if (In != Op.getOperand(SplatIdx))
8488 IsSplat = false;
8489 }
8490
8491 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
8492 if (IsSplat) {
8493 // The build_vector allows the scalar element to be larger than the vector
8494 // element type. We need to mask it to use as a condition unless we know
8495 // the upper bits are zero.
8496 // FIXME: Use computeKnownBits instead of checking specific opcode?
8497 SDValue Cond = Op.getOperand(SplatIdx);
8498 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8499 if (Cond.getOpcode() != ISD::SETCC)
8500 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8501 DAG.getConstant(1, dl, MVT::i8));
8502
8503 // Perform the select in the scalar domain so we can use cmov.
8504 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
// On 32-bit targets a 64-bit mask is built as two 32-bit halves and
// concatenated.
8505 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8506 DAG.getAllOnesConstant(dl, MVT::i32),
8507 DAG.getConstant(0, dl, MVT::i32));
8508 Select = DAG.getBitcast(MVT::v32i1, Select);
8509 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8510 } else {
8511 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8512 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8513 DAG.getAllOnesConstant(dl, ImmVT),
8514 DAG.getConstant(0, dl, ImmVT));
// Sub-byte mask types are built as v8i1 and the prefix extracted.
8515 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8516 Select = DAG.getBitcast(VecVT, Select);
8517 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8518 DAG.getVectorIdxConstant(0, dl));
8519 }
8520 }
8521
8522 // insert elements one by one
8523 SDValue DstVec;
8524 if (HasConstElts) {
8525 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8526 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8527 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8528 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8529 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8530 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8531 } else {
8532 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8533 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8534 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8535 DstVec = DAG.getBitcast(VecVT, Imm);
8536 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8537 DAG.getVectorIdxConstant(0, dl));
8538 }
8539 } else
8540 DstVec = DAG.getUNDEF(VT);
8541
// Patch the remaining non-constant elements into the constant base vector.
8542 for (unsigned InsertIdx : NonConstIdx) {
8543 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8544 Op.getOperand(InsertIdx),
8545 DAG.getVectorIdxConstant(InsertIdx, dl));
8546 }
8547 return DstVec;
8548}
8549
8550[[maybe_unused]] static bool isHorizOp(unsigned Opcode) {
8551 switch (Opcode) {
8552 case X86ISD::PACKSS:
8553 case X86ISD::PACKUS:
8554 case X86ISD::FHADD:
8555 case X86ISD::FHSUB:
8556 case X86ISD::HADD:
8557 case X86ISD::HSUB:
8558 case X86ISD::HADDS:
8559 case X86ISD::HSUBS:
8560 return true;
8561 }
8562 return false;
8563}
8564
8565/// This is a helper function of LowerToHorizontalOp().
8566/// This function checks that the build_vector \p N in input implements a
8567/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8568/// may not match the layout of an x86 256-bit horizontal instruction.
8569/// In other words, if this returns true, then some extraction/insertion will
8570/// be required to produce a valid horizontal instruction.
8571///
8572/// Parameter \p Opcode defines the kind of horizontal operation to match.
8573/// For example, if \p Opcode is equal to ISD::ADD, then this function
8574/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8575/// is equal to ISD::SUB, then this function checks if this is a horizontal
8576/// arithmetic sub.
8577///
8578/// This function only analyzes elements of \p N whose indices are
8579/// in range [BaseIdx, LastIdx).
8580///
8581/// TODO: This function was originally used to match both real and fake partial
8582/// horizontal operations, but the index-matching logic is incorrect for that.
8583/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8584/// code because it is only used for partial h-op matching now?
8585static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8586 const SDLoc &DL, SelectionDAG &DAG,
8587 unsigned BaseIdx, unsigned LastIdx,
8588 SDValue &V0, SDValue &V1) {
8589 EVT VT = N->getValueType(0);
8590 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8591 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8592 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8593 "Invalid Vector in input!");
8594
8595 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8596 bool CanFold = true;
// ExpectedVExtractIdx tracks the extract index the next element should use;
// it resets at the halfway point because the second half of the result draws
// from a (possibly different) source vector V1.
8597 unsigned ExpectedVExtractIdx = BaseIdx;
8598 unsigned NumElts = LastIdx - BaseIdx;
8599 V0 = DAG.getUNDEF(VT);
8600 V1 = DAG.getUNDEF(VT);
8601
8602 // Check if N implements a horizontal binop.
8603 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8604 SDValue Op = N->getOperand(i + BaseIdx);
8605
8606 // Skip UNDEFs.
8607 if (Op->isUndef()) {
8608 // Update the expected vector extract index.
8609 if (i * 2 == NumElts)
8610 ExpectedVExtractIdx = BaseIdx;
8611 ExpectedVExtractIdx += 2;
8612 continue;
8613 }
8614
8615 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8616
8617 if (!CanFold)
8618 break;
8619
8620 SDValue Op0 = Op.getOperand(0);
8621 SDValue Op1 = Op.getOperand(1);
8622
8623 // Try to match the following pattern:
8624 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8625 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8627 Op0.getOperand(0) == Op1.getOperand(0) &&
8630 if (!CanFold)
8631 break;
8632
8633 unsigned I0 = Op0.getConstantOperandVal(1);
8634 unsigned I1 = Op1.getConstantOperandVal(1);
8635
// The first half of the result selects/records V0, the second half V1.
8636 if (i * 2 < NumElts) {
8637 if (V0.isUndef()) {
8638 V0 = Op0.getOperand(0);
8639 if (V0.getValueType() != VT)
8640 return false;
8641 }
8642 } else {
8643 if (V1.isUndef()) {
8644 V1 = Op0.getOperand(0);
8645 if (V1.getValueType() != VT)
8646 return false;
8647 }
8648 if (i * 2 == NumElts)
8649 ExpectedVExtractIdx = BaseIdx;
8650 }
8651
8652 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8653 if (I0 == ExpectedVExtractIdx)
8654 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8655 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8656 // Try to match the following dag sequence:
8657 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8658 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8659 } else
8660 CanFold = false;
8661
8662 ExpectedVExtractIdx += 2;
8663 }
8664
8665 return CanFold;
8666}
8667
8668/// Emit a sequence of two 128-bit horizontal add/sub followed by
8669/// a concat_vector.
8670///
8671/// This is a helper function of LowerToHorizontalOp().
8672/// This function expects two 256-bit vectors called V0 and V1.
8673/// At first, each vector is split into two separate 128-bit vectors.
8674/// Then, the resulting 128-bit vectors are used to implement two
8675/// horizontal binary operations.
8676///
8677/// The kind of horizontal binary operation is defined by \p X86Opcode.
8678///
8679/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8680/// the two new horizontal binop.
8681/// When Mode is set, the first horizontal binop dag node would take as input
8682/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8683/// horizontal binop dag node would take as input the lower 128-bit of V1
8684/// and the upper 128-bit of V1.
8685/// Example:
8686/// HADD V0_LO, V0_HI
8687/// HADD V1_LO, V1_HI
8688///
8689/// Otherwise, the first horizontal binop dag node takes as input the lower
8690/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8691/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8692/// Example:
8693/// HADD V0_LO, V1_LO
8694/// HADD V0_HI, V1_HI
8695///
8696/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8697/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8698/// the upper 128-bits of the result.
8699static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8700 const SDLoc &DL, SelectionDAG &DAG,
8701 unsigned X86Opcode, bool Mode,
8702 bool isUndefLO, bool isUndefHI) {
8703 MVT VT = V0.getSimpleValueType();
8704 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8705 "Invalid nodes in input!");
8706
8707 unsigned NumElts = VT.getVectorNumElements();
8708 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8709 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8710 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8711 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8712 MVT NewVT = V0_LO.getSimpleValueType();
8713
8714 SDValue LO = DAG.getUNDEF(NewVT);
8715 SDValue HI = DAG.getUNDEF(NewVT);
8716
8717 if (Mode) {
8718 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8719 if (!isUndefLO && !V0->isUndef())
8720 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8721 if (!isUndefHI && !V1->isUndef())
8722 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8723 } else {
8724 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8725 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8726 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8727
8728 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8729 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8730 }
8731
8732 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8733}
8734
8735/// Returns true iff \p BV builds a vector with the result equivalent to
8736/// the result of ADDSUB/SUBADD operation.
8737/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8738/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8739/// \p Opnd0 and \p Opnd1.
8741 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8742 SDValue &Opnd0, SDValue &Opnd1,
8743 unsigned &NumExtracts, bool &IsSubAdd,
8744 bool &HasAllowContract) {
8745 using namespace SDPatternMatch;
8746
8747 MVT VT = BV->getSimpleValueType(0);
8748 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8749 return false;
8750
8751 unsigned NumElts = VT.getVectorNumElements();
8752 SDValue InVec0 = DAG.getUNDEF(VT);
8753 SDValue InVec1 = DAG.getUNDEF(VT);
8754
8755 NumExtracts = 0;
// Start optimistically (true for any non-empty vector); a single element
// lacking the 'contract' fast-math flag clears it below.
8756 HasAllowContract = NumElts != 0;
8757
8758 // Odd-numbered elements in the input build vector are obtained from
8759 // adding/subtracting two integer/float elements.
8760 // Even-numbered elements in the input build vector are obtained from
8761 // subtracting/adding two integer/float elements.
// Opc[0]/Opc[1] record the opcode observed at even/odd element positions.
8762 unsigned Opc[2] = {0, 0};
8763 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8764 SDValue Op = BV->getOperand(i);
8765
8766 // Skip 'undef' values.
8767 unsigned Opcode = Op.getOpcode();
8768 if (Opcode == ISD::UNDEF)
8769 continue;
8770
8771 // Early exit if we found an unexpected opcode.
8772 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8773 return false;
8774
8775 SDValue Op0 = Op.getOperand(0);
8776 SDValue Op1 = Op.getOperand(1);
8777
8778 // Try to match the following pattern:
8779 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8780 // Early exit if we cannot match that sequence.
8781 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8782 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8783 return false;
8784
8785 // We found a valid add/sub node, make sure its the same opcode as previous
8786 // elements for this parity.
8787 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8788 return false;
8789 Opc[i % 2] = Opcode;
8790
8791 // Update InVec0 and InVec1.
8792 if (InVec0.isUndef())
8793 InVec0 = Op0.getOperand(0);
8794 if (InVec1.isUndef())
8795 InVec1 = Op1.getOperand(0);
8796
8797 // Make sure that operands in input to each add/sub node always
8798 // come from a same pair of vectors.
8799 if (InVec0 != Op0.getOperand(0)) {
8800 if (Opcode == ISD::FSUB)
8801 return false;
8802
8803 // FADD is commutable. Try to commute the operands
8804 // and then test again.
8805 std::swap(Op0, Op1);
8806 if (InVec0 != Op0.getOperand(0))
8807 return false;
8808 }
8809
8810 if (InVec1 != Op1.getOperand(0))
8811 return false;
8812
8813 // Increment the number of extractions done.
8814 ++NumExtracts;
8815 HasAllowContract &= Op->getFlags().hasAllowContract();
8816 }
8817
8818 // Ensure we have found an opcode for both parities and that they are
8819 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8820 // inputs are undef.
8821 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8822 InVec0.isUndef() || InVec1.isUndef())
8823 return false;
8824
// SUBADD means the even lanes add: Opc[0] == FADD (and odd lanes subtract).
8825 IsSubAdd = Opc[0] == ISD::FADD;
8826
8827 Opnd0 = InVec0;
8828 Opnd1 = InVec1;
8829 return true;
8830}
8831
8832/// Returns true if is possible to fold MUL and an idiom that has already been
8833/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8834/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8835/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8836///
8837/// Prior to calling this function it should be known that there is some
8838/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8839/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8840/// before replacement of such SDNode with ADDSUB operation. Thus the number
8841/// of \p Opnd0 uses is expected to be equal to 2.
8842/// For example, this function may be called for the following IR:
8843/// %AB = fmul fast <2 x double> %A, %B
8844/// %Sub = fsub fast <2 x double> %AB, %C
8845/// %Add = fadd fast <2 x double> %AB, %C
8846/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8847/// <2 x i32> <i32 0, i32 3>
8848/// There is a def for %Addsub here, which potentially can be replaced by
8849/// X86ISD::ADDSUB operation:
8850/// %Addsub = X86ISD::ADDSUB %AB, %C
8851/// and such ADDSUB can further be replaced with FMADDSUB:
8852/// %Addsub = FMADDSUB %A, %B, %C.
8853///
8854/// The main reason why this method is called before the replacement of the
8855/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8856/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8857/// FMADDSUB is.
8858static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8859 SelectionDAG &DAG, SDValue &Opnd0,
8860 SDValue &Opnd1, SDValue &Opnd2,
8861 unsigned ExpectedUses,
8862 bool AllowSubAddOrAddSubContract) {
8863 if (Opnd0.getOpcode() != ISD::FMUL ||
8864 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8865 return false;
8866
8867 // FIXME: These checks must match the similar ones in
8868 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8869 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8870 // or MUL + ADDSUB to FMADDSUB.
8871 bool AllowFusion =
8872 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8873 if (!AllowFusion)
8874 return false;
8875
8876 Opnd2 = Opnd1;
8877 Opnd1 = Opnd0.getOperand(1);
8878 Opnd0 = Opnd0.getOperand(0);
8879
8880 return true;
8881}
8882
8883/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8884/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8885/// X86ISD::FMSUBADD node.
8887 const SDLoc &DL,
8888 const X86Subtarget &Subtarget,
8889 SelectionDAG &DAG) {
8890 SDValue Opnd0, Opnd1;
8891 unsigned NumExtracts;
8892 bool IsSubAdd;
8893 bool HasAllowContract;
8894 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8895 HasAllowContract))
8896 return SDValue();
8897
8898 MVT VT = BV->getSimpleValueType(0);
8899
8900 // Try to generate X86ISD::FMADDSUB node here.
8901 SDValue Opnd2;
8902 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8903 HasAllowContract)) {
8904 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8905 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8906 }
8907
8908 // We only support ADDSUB.
8909 if (IsSubAdd)
8910 return SDValue();
8911
8912 // There are no known X86 targets with 512-bit ADDSUB instructions!
8913 // Convert to blend(fsub,fadd).
8914 if (VT.is512BitVector()) {
8915 SmallVector<int> Mask;
8916 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
// Even result lanes come from the FSUB node, odd lanes from the FADD node
// (second shuffle operand, hence the +E offset).
8917 Mask.push_back(I);
8918 Mask.push_back(I + E + 1);
8919 }
8920 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8921 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8922 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8923 }
8924
8925 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8926}
8927
/// Match \p BV as a build_vector of horizontal binops: each non-undef element
/// must be a one-use ADD/SUB/FADD/FSUB of two constant-index extracts from a
/// single source vector. On success, \p HOpcode is set to the corresponding
/// X86ISD::HADD/HSUB/FHADD/FHSUB opcode and \p V0/\p V1 to the (up to) two
/// source vectors; a source stays undef if its 64-bit half is never used.
8929                               unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8930   // Initialize outputs to known values.
8931   MVT VT = BV->getSimpleValueType(0);
8932   HOpcode = ISD::DELETED_NODE;
8933   V0 = DAG.getUNDEF(VT);
8934   V1 = DAG.getUNDEF(VT);
8935 
8936   // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8937   // half of the result is calculated independently from the 128-bit halves of
8938   // the inputs, so that makes the index-checking logic below more complicated.
8939   unsigned NumElts = VT.getVectorNumElements();
8940   unsigned GenericOpcode = ISD::DELETED_NODE;
8941   unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8942   unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8943   unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
       // Walk every element of every 128-bit chunk independently.
8944   for (unsigned i = 0; i != Num128BitChunks; ++i) {
8945     for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8946       // Ignore undef elements.
8947       SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8948       if (Op.isUndef())
8949         continue;
8950 
8951       // If there's an opcode mismatch, we're done.
8952       if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8953         return false;
8954 
8955       // Initialize horizontal opcode.
8956       if (HOpcode == ISD::DELETED_NODE) {
8957         GenericOpcode = Op.getOpcode();
8958         switch (GenericOpcode) {
8959         // clang-format off
8960         case ISD::ADD: HOpcode = X86ISD::HADD; break;
8961         case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8962         case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8963         case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8964         default: return false;
8965         // clang-format on
8966         }
8967       }
8968 
       // Each element must be a one-use binop of two constant-index extracts
       // from the same source vector.
8969       SDValue Op0 = Op.getOperand(0);
8970       SDValue Op1 = Op.getOperand(1);
8971       if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8973           Op0.getOperand(0) != Op1.getOperand(0) ||
8975           !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8976         return false;
8977 
8978       // The source vector is chosen based on which 64-bit half of the
8979       // destination vector is being calculated.
8980       if (j < NumEltsIn64Bits) {
8981         if (V0.isUndef())
8982           V0 = Op0.getOperand(0);
8983       } else {
8984         if (V1.isUndef())
8985           V1 = Op0.getOperand(0);
8986       }
8987 
       // All elements feeding the same 64-bit half must read the same source.
8988       SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8989       if (SourceVec != Op0.getOperand(0))
8990         return false;
8991 
8992       // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8993       unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8994       unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8995       unsigned ExpectedIndex = i * NumEltsIn128Bits +
8996                                (j % NumEltsIn64Bits) * 2;
8997       if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8998         continue;
8999 
9000       // If this is not a commutative op, this does not match.
9001       if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
9002         return false;
9003 
9004       // Addition is commutative, so try swapping the extract indexes.
9005       // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
9006       if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9007         continue;
9008 
9009       // Extract indexes do not match horizontal requirement.
9010       return false;
9011     }
9012   }
9013   // We matched. Opcode and operands are returned by reference as arguments.
9014   return true;
9015 }
9016
/// Emit the X86ISD horizontal op \p HOpcode for build vector \p BV with
/// sources \p V0/\p V1. Both sources are first resized to the result width;
/// if only the low half of a 256-bit result is demanded, the op is performed
/// at 128 bits and the result is padded back to 256 bits with undef.
9018                                     const SDLoc &DL, SelectionDAG &DAG,
9019                                     unsigned HOpcode, SDValue V0, SDValue V1) {
9020   // If either input vector is not the same size as the build vector,
9021   // extract/insert the low bits to the correct size.
9022   // This is free (examples: zmm --> xmm, xmm --> ymm).
9023   MVT VT = BV->getSimpleValueType(0);
9024   unsigned Width = VT.getSizeInBits();
9025   if (V0.getValueSizeInBits() > Width)
9026     V0 = extractSubVector(V0, 0, DAG, DL, Width);
9027   else if (V0.getValueSizeInBits() < Width)
9028     V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
9029 
9030   if (V1.getValueSizeInBits() > Width)
9031     V1 = extractSubVector(V1, 0, DAG, DL, Width);
9032   else if (V1.getValueSizeInBits() < Width)
9033     V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
9034 
       // Record which result elements are actually used (non-undef operands).
9035   unsigned NumElts = VT.getVectorNumElements();
9036   APInt DemandedElts = APInt::getAllOnes(NumElts);
9037   for (unsigned i = 0; i != NumElts; ++i)
9038     if (BV->getOperand(i).isUndef())
9039       DemandedElts.clearBit(i);
9040 
9041   // If we don't need the upper xmm, then perform as a xmm hop.
9042   unsigned HalfNumElts = NumElts / 2;
9043   if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
9044     MVT HalfVT = VT.getHalfNumVectorElementsVT();
9045     V0 = extractSubVector(V0, 0, DAG, DL, 128);
9046     V1 = extractSubVector(V1, 0, DAG, DL, 128);
9047     SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
       // Pad the 128-bit result back out to the full 256-bit type with undef.
9048     return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
9049   }
9050 
9051   return DAG.getNode(HOpcode, DL, VT, V0, V1);
9052 }
9053
9054 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
/// First tries a direct match with isHopBuildVector/getHopForBuildVector for
/// the type/feature combinations that have native horizontal instructions;
/// then falls back to matching 256-bit vectors as two 128-bit horizontal ops
/// joined by extract/concat (ExpandHorizontalBinOp).
9056                                    const X86Subtarget &Subtarget,
9057                                    SelectionDAG &DAG) {
9058   // We need at least 2 non-undef elements to make this worthwhile by default.
9059   unsigned NumNonUndefs =
9060       count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
9061   if (NumNonUndefs < 2)
9062     return SDValue();
9063 
9064   // There are 4 sets of horizontal math operations distinguished by type:
9065   // int/FP at 128-bit/256-bit. Each type was introduced with a different
9066   // subtarget feature. Try to match those "native" patterns first.
9067   MVT VT = BV->getSimpleValueType(0);
9068   if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
9069       ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
9070       ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
9071       ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
9072     unsigned HOpcode;
9073     SDValue V0, V1;
9074     if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
9075       return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
9076   }
9077 
9078   // Try harder to match 256-bit ops by using extract/concat.
9079   if (!Subtarget.hasAVX() || !VT.is256BitVector())
9080     return SDValue();
9081 
9082   // Count the number of UNDEF operands in the build_vector in input.
9083   unsigned NumElts = VT.getVectorNumElements();
9084   unsigned Half = NumElts / 2;
9085   unsigned NumUndefsLO = 0;
9086   unsigned NumUndefsHI = 0;
9087   for (unsigned i = 0, e = Half; i != e; ++i)
9088     if (BV->getOperand(i)->isUndef())
9089       NumUndefsLO++;
9090 
9091   for (unsigned i = Half, e = NumElts; i != e; ++i)
9092     if (BV->getOperand(i)->isUndef())
9093       NumUndefsHI++;
9094 
9095   SDValue InVec0, InVec1;
       // Integer 256-bit case: match each 128-bit half as an independent
       // hadd/hsub, then require both halves to agree on the source vectors
       // (modulo undef halves).
9096   if (VT == MVT::v8i32 || VT == MVT::v16i16) {
9097     SDValue InVec2, InVec3;
9098     unsigned X86Opcode;
9099     bool CanFold = true;
9100 
9101     if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
9102         isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
9103                               InVec3) &&
9104         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9105         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9106       X86Opcode = X86ISD::HADD;
9107     else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
9108                                    InVec1) &&
9109              isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
9110                                    InVec3) &&
9111              ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9112              ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9113       X86Opcode = X86ISD::HSUB;
9114     else
9115       CanFold = false;
9116 
9117     if (CanFold) {
9118       // Do not try to expand this build_vector into a pair of horizontal
9119       // add/sub if we can emit a pair of scalar add/sub.
9120       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9121         return SDValue();
9122 
9123       // Convert this build_vector into a pair of horizontal binops followed by
9124       // a concat vector. We must adjust the outputs from the partial horizontal
9125       // matching calls above to account for undefined vector halves.
9126       SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
9127       SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
9128       assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
9129       bool isUndefLO = NumUndefsLO == Half;
9130       bool isUndefHI = NumUndefsHI == Half;
9131       return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
9132                                    isUndefHI);
9133     }
9134   }
9135 
       // Otherwise try to match the whole vector as one horizontal op spanning
       // both halves (integer and FP variants).
9136   if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
9137       VT == MVT::v16i16) {
9138     unsigned X86Opcode;
9139     if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
9140                               InVec1))
9141       X86Opcode = X86ISD::HADD;
9142     else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
9143                                    InVec1))
9144       X86Opcode = X86ISD::HSUB;
9145     else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
9146                                    InVec1))
9147       X86Opcode = X86ISD::FHADD;
9148     else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
9149                                    InVec1))
9150       X86Opcode = X86ISD::FHSUB;
9151     else
9152       return SDValue();
9153 
9154     // Don't try to expand this build_vector into a pair of horizontal add/sub
9155     // if we can simply emit a pair of scalar add/sub.
9156     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9157       return SDValue();
9158 
9159     // Convert this build_vector into two horizontal add/sub followed by
9160     // a concat vector.
9161     bool isUndefLO = NumUndefsLO == Half;
9162     bool isUndefHI = NumUndefsHI == Half;
9163     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
9164                                  isUndefLO, isUndefHI);
9165   }
9166 
9167   return SDValue();
9168 }
9169
9170 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
9171                           SelectionDAG &DAG);
9172 
9173 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
9174 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
9175 /// just apply the bit to the vectors.
9176 /// NOTE: It's not in our interest to start making a general purpose vectorizer
9177 /// from this, but enough scalar bit operations are created from the later
9178 /// legalization + scalarization stages to need basic support.
9180                                        const X86Subtarget &Subtarget,
9181                                        SelectionDAG &DAG) {
9182   MVT VT = Op->getSimpleValueType(0);
9183   unsigned NumElems = VT.getVectorNumElements();
9184   unsigned ElemSize = VT.getScalarSizeInBits();
9185   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9186 
9187   // Check that all elements have the same opcode.
9188   // TODO: Should we allow UNDEFS and if so how many?
9189   unsigned Opcode = Op->getOperand(0).getOpcode();
9190   for (unsigned i = 1; i < NumElems; ++i)
9191     if (Opcode != Op->getOperand(i).getOpcode())
9192       return SDValue();
9193 
9194   // TODO: We may be able to add support for other Ops (e.g. ADD/SUB).
9195   bool IsShift = false;
9196   switch (Opcode) {
9197   default:
9198     return SDValue();
9199   case ISD::SHL:
9200   case ISD::SRL:
9201   case ISD::SRA:
9202     IsShift = true;
9203     break;
9204   case ISD::AND:
9205   case ISD::XOR:
9206   case ISD::OR:
9207     // Don't do this if the buildvector is a splat - we'd replace one
9208     // constant with an entire vector.
9209     if (Op->getSplatValue())
9210       return SDValue();
9211     if (!TLI.isOperationLegalOrPromote(Opcode, VT))
9212       return SDValue();
9213     break;
9214   }
9215 
9216   // Collect elements.
       // RHSAllConst tracks whether every element's second operand is constant.
9217   bool RHSAllConst = true;
9218   SmallVector<SDValue, 4> LHSElts, RHSElts;
9219   for (SDValue Elt : Op->ops()) {
9220     SDValue LHS = Elt.getOperand(0);
9221     SDValue RHS = Elt.getOperand(1);
9222     RHSAllConst &= isa<ConstantSDNode>(RHS);
9223     LHSElts.push_back(LHS);
9224     RHSElts.push_back(RHS);
9225   }
9226 
9227   // Canonicalize shift amounts.
9228   if (IsShift) {
9229     // We expect the canonicalized RHS operand to be the constant.
9230     // TODO: Permit non-constant XOP/AVX2 cases?
9231     if (!RHSAllConst)
9232       return SDValue();
9233 
9234     // Extend shift amounts.
9235     for (SDValue &Op1 : RHSElts)
9236       if (Op1.getValueSizeInBits() != ElemSize)
9237         Op1 = DAG.getZExtOrTrunc(Op1, DL, VT.getScalarType())
9238 
9239     // Limit to shifts by uniform immediates.
9240     // TODO: Only accept vXi8/vXi64 special cases?
9241     // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
9242     if (any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
9243       return SDValue();
9244   }
       // After canonicalization, every collected scalar must be element-sized.
9245   assert(all_of(llvm::concat<SDValue>(LHSElts, RHSElts),
9246                 [ElemSize](SDValue V) {
9247                   return V.getValueSizeInBits() == ElemSize;
9248                 }) &&
9249          "Element size mismatch");
9250 
9251   // To avoid an increase in GPR->FPU instructions, LHS/RHS must be foldable as
9252   // a load or RHS must be constant.
9253   SDValue LHS = EltsFromConsecutiveLoads(VT, LHSElts, DL, DAG, Subtarget,
9254                                          /*IsAfterLegalize=*/true);
9255   SDValue RHS = EltsFromConsecutiveLoads(VT, RHSElts, DL, DAG, Subtarget,
9256                                          /*IsAfterLegalize=*/true);
9257   if (!LHS && !RHS && !RHSAllConst)
9258     return SDValue();
9259 
       // Fall back to plain build_vectors for whichever side didn't fold to a load.
9260   if (!LHS)
9261     LHS = DAG.getBuildVector(VT, DL, LHSElts);
9262   if (!RHS)
9263     RHS = DAG.getBuildVector(VT, DL, RHSElts);
9264   SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
9265 
9266   if (!IsShift)
9267     return Res;
9268 
9269   // Immediately lower the shift to ensure the constant build vector doesn't
9270   // get converted to a constant pool before the shift is lowered.
9271   return LowerShift(Res, Subtarget, DAG);
9272 }
9273
9274 static bool isShuffleFoldableLoad(SDValue);
9275 
9276 /// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
9277 /// representing a blend.
/// Only v4f64 build_vectors with exactly two distinct, non-constant,
/// non-undef scalars are handled; each scalar is splatted and the two splats
/// are blended with a shuffle.
9279                                         X86Subtarget const &Subtarget,
9280                                         SelectionDAG &DAG) {
9281   MVT VT = BVOp->getSimpleValueType(0u);
9282 
       // Only v4f64 is handled currently.
9283   if (VT != MVT::v4f64)
9284     return SDValue();
9285 
9286   // Collect unique operands.
       // Give up on constant or undef elements; gather the distinct scalars.
9287   auto UniqueOps = SmallSet<SDValue, 16u>();
9288   for (SDValue Op : BVOp->ops()) {
9289     if (isIntOrFPConstant(Op) || Op.isUndef())
9290       return SDValue();
9291     UniqueOps.insert(Op);
9292   }
9293 
9294   // Candidate BUILD_VECTOR must have 2 unique operands.
9295   if (UniqueOps.size() != 2u)
9296     return SDValue();
9297 
       // Op0 is element 0's scalar; Op1 is the remaining distinct scalar.
9298   SDValue Op0 = BVOp->getOperand(0u);
9299   UniqueOps.erase(Op0);
9300   SDValue Op1 = *UniqueOps.begin();
9301 
       // Require AVX2, or at least one operand that is a shuffle-foldable load.
9302   if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
9303       isShuffleFoldableLoad(Op1)) {
9304     // Create shuffle mask.
       // Elements equal to Op0 pick from the Op0-splat (indices [0, NumElems)),
       // everything else from the Op1-splat (indices [NumElems, 2*NumElems)).
9305     auto const NumElems = VT.getVectorNumElements();
9306     SmallVector<int, 16u> Mask(NumElems);
9307     for (auto I = 0u; I < NumElems; ++I) {
9308       SDValue Op = BVOp->getOperand(I);
9309       Mask[I] = Op == Op0 ? I : I + NumElems;
9310     }
9311     // Create shuffle of splats.
9312     SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
9313     SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
9314     return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
9315   }
9316 
9317   return SDValue();
9318 }
9319
9320 /// Widen a BUILD_VECTOR if the scalar operands are freely mergeable.
/// Pairs of adjacent i8/i16/i32 elements that were split from one wider
/// source (trunc X, trunc (srl Y, EltBits)) are recombined into a single
/// element of twice the width; the widened build_vector is then bitcast back
/// to the original type.
9322                                  X86Subtarget const &Subtarget,
9323                                  SelectionDAG &DAG) {
9324   using namespace SDPatternMatch;
9325   MVT VT = BVOp->getSimpleValueType(0);
9326   MVT SVT = VT.getScalarType();
9327   unsigned NumElts = VT.getVectorNumElements();
9328   unsigned EltBits = SVT.getSizeInBits();
9329 
       // Only i8/i16/i32 elements are widened (to i16/i32/i64 respectively).
9330   if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
9331     return SDValue();
9332 
9333   unsigned WideBits = 2 * EltBits;
9334   MVT WideSVT = MVT::getIntegerVT(WideBits);
9335   MVT WideVT = MVT::getVectorVT(WideSVT, NumElts / 2);
9336   if (!DAG.getTargetLoweringInfo().isTypeLegal(WideSVT))
9337     return SDValue();
9339 
       // Try to merge each pair of adjacent elements into one wide element;
       // bail out entirely if any pair fails to merge.
9340   for (unsigned I = 0; I != NumElts; I += 2) {
9341     SDValue Op0 = BVOp->getOperand(I + 0);
9342     SDValue Op1 = BVOp->getOperand(I + 1);
9343 
       // A fully-undef pair stays undef at the wider width.
9344     if (Op0.isUndef() && Op1.isUndef()) {
9345       WideOps.push_back(DAG.getUNDEF(WideSVT));
9346       continue;
9347     }
9348 
9349     // TODO: Constant repacking?
9350 
9351     // Merge scalars that have been split from the same source.
9352     SDValue X, Y;
9353     if (sd_match(Op0, m_Trunc(m_Value(X))) &&
9354         sd_match(Op1, m_Trunc(m_Srl(m_Value(Y), m_SpecificInt(EltBits)))) &&
9356         X.getValueType().bitsGE(WideSVT)) {
9357       if (X.getValueType().bitsGT(WideSVT))
9358         X = DAG.getNode(ISD::TRUNCATE, DL, WideSVT, X);
9359       WideOps.push_back(X);
9360       continue;
9361     }
9362 
9363     return SDValue();
9364   }
9365 
9366   assert(WideOps.size() == (NumElts / 2) && "Failed to widen build vector");
9367   return DAG.getBitcast(VT, DAG.getBuildVector(WideVT, DL, WideOps));
9368 }
9369
9370 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
9371 /// functionality to do this, so it's all zeros, all ones, or some derivation
9372 /// that is cheap to calculate.
/// Returns \p Op unchanged when the build_vector is already directly
/// matchable by instruction selection, a replacement node for the cheap
/// all-ones derivations, or SDValue() when no cheap materialization applies.
9374                                              SelectionDAG &DAG,
9375                                              const X86Subtarget &Subtarget) {
9376   MVT VT = Op.getSimpleValueType();
9377 
9378   // Vectors containing all zeros can be matched by pxor and xorps.
9379   if (ISD::isBuildVectorAllZeros(Op.getNode()))
9380     return Op;
9381 
9382   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
9383   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
9384   // vpcmpeqd on 256-bit vectors.
9385   if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
       // vXi32 all-ones is left as-is for direct matching; other types are
       // rebuilt via getOnesVector.
9386     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
9387       return Op;
9388 
9389     return getOnesVector(VT, DAG, DL);
9390   }
9391 
       // Not a special constant we know how to materialize cheaply.
9392   return SDValue();
9393 }
9394
9395 /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
9396 /// from a vector of source values and a vector of extraction indices.
9397 /// The vectors might be manipulated to match the type of the permute op.
/// \param VT result type of the permute.
/// \param SrcVec vector supplying the permuted values (resized to VT below).
/// \param IndicesVec per-element (variable) indices into SrcVec.
/// Returns SDValue() when no suitable permute op exists for VT on this
/// subtarget.
9398 static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
9399                                      const SDLoc &DL, SelectionDAG &DAG,
9400                                      const X86Subtarget &Subtarget) {
9401   MVT ShuffleVT = VT;
9402   EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9403   unsigned NumElts = VT.getVectorNumElements();
9404   unsigned SizeInBits = VT.getSizeInBits();
9405 
9406   // Adjust IndicesVec to match VT size.
9407   assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
9408          "Illegal variable permute mask size");
9409   if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
9410     // Narrow/widen the indices vector to the correct size.
9411     if (IndicesVec.getValueSizeInBits() > SizeInBits)
9412       IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
9413                                     NumElts * VT.getScalarSizeInBits());
9414     else if (IndicesVec.getValueSizeInBits() < SizeInBits)
9415       IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
9416                                   SDLoc(IndicesVec), SizeInBits);
9417     // Zero-extend the index elements within the vector.
9418     if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
9419       IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
9420                                IndicesVT, IndicesVec);
9421   }
9422   IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
9423 
9424   // Handle SrcVec that don't match VT type.
9425   if (SrcVec.getValueSizeInBits() != SizeInBits) {
9426     if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
9427       // Handle larger SrcVec by treating it as a larger permute.
9428       unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
9429       VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
9430       IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9431       IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
9432                                   Subtarget, DAG, SDLoc(IndicesVec));
       // Recurse at the wider type, then take the low bits of the result.
9433       SDValue NewSrcVec =
9434           createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9435       if (NewSrcVec)
9436         return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
9437       return SDValue();
9438     } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
9439       // Widen smaller SrcVec to match VT.
9440       SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
9441     } else
9442       return SDValue();
9443   }
9444 
       // Repeat/scale/offset the indices so an element-granularity permute can
       // be performed at a narrower element type (e.g. v4i32 indices as v16i8).
9445   auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
9446     assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
9447     EVT SrcVT = Idx.getValueType();
9448     unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
9449     uint64_t IndexScale = 0;
9450     uint64_t IndexOffset = 0;
9451 
9452     // If we're scaling a smaller permute op, then we need to repeat the
9453     // indices, scaling and offsetting them as well.
9454     // e.g. v4i32 -> v16i8 (Scale = 4)
9455     // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
9456     // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
9457     for (uint64_t i = 0; i != Scale; ++i) {
9458       IndexScale |= Scale << (i * NumDstBits);
9459       IndexOffset |= i << (i * NumDstBits);
9460     }
9461 
9462     Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
9463                       DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
9464     Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
9465                       DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
9466     return Idx;
9467   };
9468 
       // Select the permute opcode (and the type to perform it in, ShuffleVT)
       // per vector type and subtarget feature set. Cases that need custom
       // expansion return directly instead of setting Opcode.
9469   unsigned Opcode = 0;
9470   switch (VT.SimpleTy) {
9471   default:
9472     break;
9473   case MVT::v16i8:
9474     if (Subtarget.hasSSSE3())
9475       Opcode = X86ISD::PSHUFB;
9476     break;
9477   case MVT::v8i16:
9478     if (Subtarget.hasVLX() && Subtarget.hasBWI())
9479       Opcode = X86ISD::VPERMV;
9480     else if (Subtarget.hasSSSE3()) {
9481       Opcode = X86ISD::PSHUFB;
9482       ShuffleVT = MVT::v16i8;
9483     }
9484     break;
9485   case MVT::v4f32:
9486   case MVT::v4i32:
9487     if (Subtarget.hasAVX()) {
9488       Opcode = X86ISD::VPERMILPV;
9489       ShuffleVT = MVT::v4f32;
9490     } else if (Subtarget.hasSSSE3()) {
9491       Opcode = X86ISD::PSHUFB;
9492       ShuffleVT = MVT::v16i8;
9493     }
9494     break;
9495   case MVT::v2f64:
9496   case MVT::v2i64:
9497     if (Subtarget.hasAVX()) {
9498       // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
9499       IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9500       Opcode = X86ISD::VPERMILPV;
9501       ShuffleVT = MVT::v2f64;
9502     } else if (Subtarget.hasSSE41()) {
9503       // SSE41 can compare v2i64 - select between indices 0 and 1.
9504       return DAG.getSelectCC(
9505           DL, IndicesVec,
9506           getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
9507           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
9508           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
9510     }
9511     break;
9512   case MVT::v32i8:
9513     if (Subtarget.hasVLX() && Subtarget.hasVBMI())
9514       Opcode = X86ISD::VPERMV;
9515     else if (Subtarget.hasXOP()) {
       // XOP VPPERM can permute across two 128-bit sources directly; do each
       // 128-bit half of the result with its own index half.
9516       SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
9517       SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
9518       SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9519       SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9520       return DAG.getNode(
9522           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9523           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9524     } else if (Subtarget.hasAVX()) {
9525       SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9526       SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9527       SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9528       SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9529       auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9531         // Permute Lo and Hi and then select based on index range.
9532         // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9533         // care about the bit[7] as it's just an index vector.
9534         SDValue Idx = Ops[2];
9535         EVT VT = Idx.getValueType();
9536         return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9537                                DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9538                                DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9540       };
9541       SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9542       return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9543                               PSHUFBBuilder);
9544     }
9545     break;
9546   case MVT::v16i16:
9547     if (Subtarget.hasVLX() && Subtarget.hasBWI())
9548       Opcode = X86ISD::VPERMV;
9549     else if (Subtarget.hasAVX()) {
9550       // Scale to v32i8 and perform as v32i8.
9551       IndicesVec = ScaleIndices(IndicesVec, 2);
9552       return DAG.getBitcast(
9554           MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9555           DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9556     }
9557     break;
9558   case MVT::v8f32:
9559   case MVT::v8i32:
9560     if (Subtarget.hasAVX2())
9561       Opcode = X86ISD::VPERMV;
9562     else if (Subtarget.hasAVX()) {
       // No cross-lane variable permute before AVX2: duplicate each 128-bit
       // half across both lanes, permute in-lane, then select per element.
9563       SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9564       SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9565                                           {0, 1, 2, 3, 0, 1, 2, 3});
9566       SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9567                                           {4, 5, 6, 7, 4, 5, 6, 7});
9568       if (Subtarget.hasXOP())
9569         return DAG.getBitcast(
9570             VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9571                             IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9572       // Permute Lo and Hi and then select based on index range.
9573       // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9574       SDValue Res = DAG.getSelectCC(
9575           DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9576           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9577           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9579       return DAG.getBitcast(VT, Res);
9580     }
9581     break;
9582   case MVT::v4i64:
9583   case MVT::v4f64:
9584     if (Subtarget.hasAVX512()) {
       // Without VLX the 256-bit VPERMV form isn't available; widen to 512
       // bits, recurse, and take back the low 256 bits.
9585       if (!Subtarget.hasVLX()) {
9586         MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9587         SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9588                                 SDLoc(SrcVec));
9589         IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9590                                     DAG, SDLoc(IndicesVec));
9591         SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9592                                             DAG, Subtarget);
9593         return extract256BitVector(Res, 0, DAG, DL);
9594       }
9595       Opcode = X86ISD::VPERMV;
9596     } else if (Subtarget.hasAVX()) {
9597       SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9598       SDValue LoLo =
9599           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9600       SDValue HiHi =
9601           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9602       // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9603       IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9604       if (Subtarget.hasXOP())
9605         return DAG.getBitcast(
9606             VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9607                             IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9608       // Permute Lo and Hi and then select based on index range.
9609       // This works as VPERMILPD only uses index bit[1] to permute elements.
9610       SDValue Res = DAG.getSelectCC(
9611           DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9612           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9613           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9615       return DAG.getBitcast(VT, Res);
9616     }
9617     break;
9618   case MVT::v64i8:
9619     if (Subtarget.hasVBMI())
9620       Opcode = X86ISD::VPERMV;
9621     break;
9622   case MVT::v32i16:
9623     if (Subtarget.hasBWI())
9624       Opcode = X86ISD::VPERMV;
9625     break;
9626   case MVT::v16f32:
9627   case MVT::v16i32:
9628   case MVT::v8f64:
9629   case MVT::v8i64:
9630     if (Subtarget.hasAVX512())
9631       Opcode = X86ISD::VPERMV;
9632     break;
9633   }
9634   if (!Opcode)
9635     return SDValue();
9636 
9637   assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9638          (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9639          "Illegal variable permute shuffle type");
9640 
       // If the shuffle runs at a narrower element type, rescale the indices.
9641   uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9642   if (Scale > 1)
9643     IndicesVec = ScaleIndices(IndicesVec, Scale);
9644 
9645   EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9646   IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9647 
       // VPERMV takes (indices, src); PSHUFB/VPERMILPV take (src, indices).
9648   SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9649   SDValue Res = Opcode == X86ISD::VPERMV
9650                     ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9651                     : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9652   return DAG.getBitcast(VT, Res);
9653 }
9654
9655 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9656 // reasoned to be a permutation of a vector by indices in a non-constant vector.
9657 // (build_vector (extract_elt V, (extract_elt I, 0)),
9658 //               (extract_elt V, (extract_elt I, 1)),
9659 //                    ...
9660 // ->
9661 // (vpermv I, V)
9662 //
9663 // TODO: Handle undefs
9664 // TODO: Utilize pshufb and zero mask blending to support more efficient
9665 // construction of vectors with constant-0 elements.
9666 static SDValue
9668                                    SelectionDAG &DAG,
9669                                    const X86Subtarget &Subtarget) {
   // SrcVec/IndicesVec start null and are pinned by the first matching element;
   // all later elements must extract from the same pair of vectors.
9670   SDValue SrcVec, IndicesVec;
9671 
   // A one-use FREEZE wrapper around an element is transparent for matching.
9672   auto PeekThroughFreeze = [](SDValue N) {
9673     if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9674       return N->getOperand(0);
9675     return N;
9676   };
9677   // Check for a match of the permute source vector and permute index elements.
9678   // This is done by checking that the i-th build_vector operand is of the form:
9679   // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9680   for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9681     SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9682     if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9683       return SDValue();
9684 
9685     // If this is the first extract encountered in V, set the source vector,
9686     // otherwise verify the extract is from the previously defined source
9687     // vector.
9688     if (!SrcVec)
9689       SrcVec = Op.getOperand(0);
9690     else if (SrcVec != Op.getOperand(0))
9691       return SDValue();
9692     SDValue ExtractedIndex = Op->getOperand(1);
9693     // Peek through extends.
9694     if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9695         ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9696       ExtractedIndex = ExtractedIndex.getOperand(0);
9697     if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9698       return SDValue();
9699 
9700     // If this is the first extract from the index vector candidate, set the
9701     // indices vector, otherwise verify the extract is from the previously
9702     // defined indices vector.
9703     if (!IndicesVec)
9704       IndicesVec = ExtractedIndex.getOperand(0);
9705     else if (IndicesVec != ExtractedIndex.getOperand(0))
9706       return SDValue();
9707 
       // The index must come from lane Idx of the indices vector (constant).
9708     auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9709     if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9710       return SDValue();
9711   }
9712 
9713   MVT VT = V.getSimpleValueType();
9714   return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9715 }
9716
9717SDValue
9718X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9719 SDLoc dl(Op);
9720
9721 MVT VT = Op.getSimpleValueType();
9722 MVT EltVT = VT.getVectorElementType();
9723 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9724 unsigned NumElems = Op.getNumOperands();
9725
9726 // Generate vectors for predicate vectors.
9727 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9728 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9729
9730 if (VT.getVectorElementType() == MVT::bf16 &&
9731 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9732 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9733
9734 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9735 return VectorCst;
9736
9737 unsigned EVTBits = EltVT.getSizeInBits();
9738 APInt UndefMask = APInt::getZero(NumElems);
9739 APInt FrozenUndefMask = APInt::getZero(NumElems);
9740 APInt ZeroMask = APInt::getZero(NumElems);
9741 APInt NonZeroMask = APInt::getZero(NumElems);
9742 bool IsAllConstants = true;
9743 bool OneUseFrozenUndefs = true;
9744 SmallSet<SDValue, 8> Values;
9745 unsigned NumConstants = NumElems;
9746 for (unsigned i = 0; i < NumElems; ++i) {
9747 SDValue Elt = Op.getOperand(i);
9748 if (Elt.isUndef()) {
9749 UndefMask.setBit(i);
9750 continue;
9751 }
9752 if (ISD::isFreezeUndef(Elt.getNode())) {
9753 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9754 FrozenUndefMask.setBit(i);
9755 continue;
9756 }
9757 Values.insert(Elt);
9758 if (!isIntOrFPConstant(Elt)) {
9759 IsAllConstants = false;
9760 NumConstants--;
9761 }
9762 if (X86::isZeroNode(Elt)) {
9763 ZeroMask.setBit(i);
9764 } else {
9765 NonZeroMask.setBit(i);
9766 }
9767 }
9768
9769 // All undef vector. Return an UNDEF.
9770 if (UndefMask.isAllOnes())
9771 return DAG.getUNDEF(VT);
9772
9773 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9774 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9775 return DAG.getFreeze(DAG.getUNDEF(VT));
9776
9777 // All undef/freeze(undef)/zero vector. Return a zero vector.
9778 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9779 return getZeroVector(VT, Subtarget, DAG, dl);
9780
9781 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9782 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9783 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9784 // and blend the FREEZE-UNDEF operands back in.
9785 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9786 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9787 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9788 SmallVector<int, 16> BlendMask(NumElems, -1);
9789 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9790 for (unsigned i = 0; i < NumElems; ++i) {
9791 if (UndefMask[i]) {
9792 BlendMask[i] = -1;
9793 continue;
9794 }
9795 BlendMask[i] = i;
9796 if (!FrozenUndefMask[i])
9797 Elts[i] = Op.getOperand(i);
9798 else
9799 BlendMask[i] += NumElems;
9800 }
9801 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9802 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9803 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9804 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9805 }
9806
9807 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9808
9809 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9810 // be better off lowering to a smaller build vector and padding with
9811 // undef/zero.
9812 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9814 unsigned UpperElems = NumElems / 2;
9815 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9816 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9817 if (NumUpperUndefsOrZeros >= UpperElems) {
9818 if (VT.is512BitVector() &&
9819 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9820 UpperElems = NumElems - (NumElems / 4);
9821 // If freeze(undef) is in any upper elements, force to zero.
9822 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9823 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9824 SDValue NewBV =
9825 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9826 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9827 }
9828 }
9829
9830 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9831 return AddSub;
9832 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9833 return HorizontalOp;
9834 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9835 return Broadcast;
9836 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9837 return BitOp;
9838 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9839 return Blend;
9840 if (SDValue WideBV = widenBuildVector(BV, dl, Subtarget, DAG))
9841 return WideBV;
9842
9843 unsigned NumZero = ZeroMask.popcount();
9844 unsigned NumNonZero = NonZeroMask.popcount();
9845
9846 // If we are inserting one variable into a vector of non-zero constants, try
9847 // to avoid loading each constant element as a scalar. Load the constants as a
9848 // vector and then insert the variable scalar element. If insertion is not
9849 // supported, fall back to a shuffle to get the scalar blended with the
9850 // constants. Insertion into a zero vector is handled as a special-case
9851 // somewhere below here.
9852 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9853 FrozenUndefMask.isZero() &&
9856 // Create an all-constant vector. The variable element in the old
9857 // build vector is replaced by undef in the constant vector. Save the
9858 // variable scalar element and its index for use in the insertelement.
9859 LLVMContext &Context = *DAG.getContext();
9860 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9861 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9862 SDValue VarElt;
9863 SDValue InsIndex;
9864 for (unsigned i = 0; i != NumElems; ++i) {
9865 SDValue Elt = Op.getOperand(i);
9866 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9867 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9868 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9869 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9870 else if (!Elt.isUndef()) {
9871 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9872 "Expected one variable element in this vector");
9873 VarElt = Elt;
9874 InsIndex = DAG.getVectorIdxConstant(i, dl);
9875 }
9876 }
9877 Constant *CV = ConstantVector::get(ConstVecOps);
9878 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9879
9880 // The constants we just created may not be legal (eg, floating point). We
9881 // must lower the vector right here because we can not guarantee that we'll
9882 // legalize it before loading it. This is also why we could not just create
9883 // a new build vector here. If the build vector contains illegal constants,
9884 // it could get split back up into a series of insert elements.
9885 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9886 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9887 MachineFunction &MF = DAG.getMachineFunction();
9888 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9889 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9890 unsigned InsertC = InsIndex->getAsZExtVal();
9891 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9892 if (InsertC < NumEltsInLow128Bits)
9893 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9894
9895 // There's no good way to insert into the high elements of a >128-bit
9896 // vector, so use shuffles to avoid an extract/insert sequence.
9897 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9898 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9899 SmallVector<int, 8> ShuffleMask;
9900 unsigned NumElts = VT.getVectorNumElements();
9901 for (unsigned i = 0; i != NumElts; ++i)
9902 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9903 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9904 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9905 }
9906
9907 // Special case for single non-zero, non-undef, element.
9908 if (NumNonZero == 1) {
9909 unsigned Idx = NonZeroMask.countr_zero();
9910 SDValue Item = Op.getOperand(Idx);
9911
9912 // If we have a constant or non-constant insertion into the low element of
9913 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9914 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9915 // depending on what the source datatype is.
9916 if (Idx == 0) {
9917 if (NumZero == 0)
9918 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9919
9920 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9921 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9922 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9923 assert((VT.is128BitVector() || VT.is256BitVector() ||
9924 VT.is512BitVector()) &&
9925 "Expected an SSE value type!");
9926 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9927 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9928 // zero vector.
9929 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9930 }
9931
9932 // We can't directly insert an i8 or i16 into a vector, so zero extend
9933 // it to i32 first.
9934 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9935 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9936 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9937 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9938 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9939 return DAG.getBitcast(VT, Item);
9940 }
9941 }
9942
9943 // Is it a vector logical left shift?
9944 if (NumElems == 2 && Idx == 1 &&
9945 X86::isZeroNode(Op.getOperand(0)) &&
9946 !X86::isZeroNode(Op.getOperand(1))) {
9947 unsigned NumBits = VT.getSizeInBits();
9948 return getVShift(true, VT,
9950 VT, Op.getOperand(1)),
9951 NumBits/2, DAG, *this, dl);
9952 }
9953
9954 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9955 return SDValue();
9956
9957 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9958 // is a non-constant being inserted into an element other than the low one,
9959 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9960 // movd/movss) to move this into the low element, then shuffle it into
9961 // place.
9962 if (EVTBits == 32) {
9963 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9964 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9965 }
9966 }
9967
9968 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9969 if (Values.size() == 1) {
9970 if (EVTBits == 32) {
9971 // Instead of a shuffle like this:
9972 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9973 // Check if it's possible to issue this instead.
9974 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
9975 unsigned Idx = NonZeroMask.countr_zero();
9976 SDValue Item = Op.getOperand(Idx);
9977 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9978 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9979 }
9980 return SDValue();
9981 }
9982
9983 // A vector full of immediates; various special cases are already
9984 // handled, so this is best done with a single constant-pool load.
9985 if (IsAllConstants)
9986 return SDValue();
9987
9988 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9989 return V;
9990
9991 // See if we can use a vector load to get all of the elements.
9992 {
9993 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9994 if (SDValue LD =
9995 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9996 return LD;
9997 }
9998
9999 // If this is a splat of pairs of 32-bit elements, we can use a narrower
10000 // build_vector and broadcast it.
10001 // TODO: We could probably generalize this more.
10002 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
10003 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
10004 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
10005 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
10006 // Make sure all the even/odd operands match.
10007 for (unsigned i = 2; i != NumElems; ++i)
10008 if (Ops[i % 2] != Op.getOperand(i))
10009 return false;
10010 return true;
10011 };
10012 if (CanSplat(Op, NumElems, Ops)) {
10013 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
10014 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
10015 // Create a new build vector and cast to v2i64/v2f64.
10016 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
10017 DAG.getBuildVector(NarrowVT, dl, Ops));
10018 // Broadcast from v2i64/v2f64 and cast to final VT.
10019 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
10020 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
10021 NewBV));
10022 }
10023 }
10024
10025 // For AVX-length vectors, build the individual 128-bit pieces and use
10026 // shuffles to put them in place.
10027 if (VT.getSizeInBits() > 128) {
10028 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
10029
10030 // Build both the lower and upper subvector.
10031 SDValue Lower =
10032 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
10034 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
10035
10036 // Recreate the wider vector with the lower and upper part.
10037 return concatSubVectors(Lower, Upper, DAG, dl);
10038 }
10039
10040 // Let legalizer expand 2-wide build_vectors.
10041 if (EVTBits == 64) {
10042 if (NumNonZero == 1) {
10043 // One half is zero or undef.
10044 unsigned Idx = NonZeroMask.countr_zero();
10045 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
10046 Op.getOperand(Idx));
10047 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
10048 }
10049 return SDValue();
10050 }
10051
10052 // If element VT is < 32 bits, convert it to inserts into a zero vector.
10053 if (EVTBits == 8 && NumElems == 16)
10054 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
10055 NumZero, DAG, Subtarget))
10056 return V;
10057
10058 if (EltVT == MVT::i16 && NumElems == 8)
10059 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
10060 NumZero, DAG, Subtarget))
10061 return V;
10062
10063 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
10064 if (EVTBits == 32 && NumElems == 4)
10065 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
10066 return V;
10067
10068 // If element VT is == 32 bits, turn it into a number of shuffles.
10069 if (NumElems == 4 && NumZero > 0) {
10070 SmallVector<SDValue, 8> Ops(NumElems);
10071 for (unsigned i = 0; i < 4; ++i) {
10072 bool isZero = !NonZeroMask[i];
10073 if (isZero)
10074 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
10075 else
10076 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10077 }
10078
10079 for (unsigned i = 0; i < 2; ++i) {
10080 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
10081 default: llvm_unreachable("Unexpected NonZero count");
10082 case 0:
10083 Ops[i] = Ops[i*2]; // Must be a zero vector.
10084 break;
10085 case 1:
10086 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
10087 break;
10088 case 2:
10089 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10090 break;
10091 case 3:
10092 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10093 break;
10094 }
10095 }
10096
10097 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
10098 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
10099 int MaskVec[] = {
10100 Reverse1 ? 1 : 0,
10101 Reverse1 ? 0 : 1,
10102 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
10103 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
10104 };
10105 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
10106 }
10107
10108 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
10109
10110 // Check for a build vector from mostly shuffle plus few inserting.
10111 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
10112 return Sh;
10113
10114 // For SSE 4.1, use insertps to put the high elements into the low element.
10115 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
10117 if (!Op.getOperand(0).isUndef())
10118 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
10119 else
10120 Result = DAG.getUNDEF(VT);
10121
10122 for (unsigned i = 1; i < NumElems; ++i) {
10123 if (Op.getOperand(i).isUndef()) continue;
10124 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
10125 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
10126 }
10127 return Result;
10128 }
10129
10130 // Otherwise, expand into a number of unpckl*, start by extending each of
10131 // our (non-undef) elements to the full vector width with the element in the
10132 // bottom slot of the vector (which generates no code for SSE).
10133 SmallVector<SDValue, 8> Ops(NumElems);
10134 for (unsigned i = 0; i < NumElems; ++i) {
10135 if (!Op.getOperand(i).isUndef())
10136 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10137 else
10138 Ops[i] = DAG.getUNDEF(VT);
10139 }
10140
10141 // Next, we iteratively mix elements, e.g. for v4f32:
10142 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
10143 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
10144 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
10145 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10146 // Generate scaled UNPCKL shuffle mask.
10147 SmallVector<int, 16> Mask;
10148 for(unsigned i = 0; i != Scale; ++i)
10149 Mask.push_back(i);
10150 for (unsigned i = 0; i != Scale; ++i)
10151 Mask.push_back(NumElems+i);
10152 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
10153
10154 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
10155 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
10156 }
10157 return Ops[0];
10158}
10159
10160// 256-bit AVX can use the vinsertf128 instruction
10161// to create 256-bit vectors from two other 128-bit ones.
10162// TODO: Detect subvector broadcast here instead of DAG combine?
// NOTE(review): the opening line of this definition (doxygen line 10163,
// presumably the "static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, ..."
// signature) was dropped by the source extraction - confirm against the
// original file.
10164 SelectionDAG &DAG,
10165 const X86Subtarget &Subtarget) {
10166 MVT ResVT = Op.getSimpleValueType();
10167 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
10168 "Value type must be 256-/512-bit wide");
10169
// Classify each subvector operand (undef / freeze(undef) / all-zero /
// non-zero) so the cheapest construction strategy can be chosen below.
// NonZeros records, one bit per operand index, which slots must actually
// be inserted.
10170 unsigned NumOperands = Op.getNumOperands();
10171 unsigned NumFreezeUndef = 0;
10172 unsigned NumZero = 0;
10173 unsigned NumNonZero = 0;
10174 unsigned NonZeros = 0;
10175 SmallSet<SDValue, 4> Undefs;
10176 for (unsigned i = 0; i != NumOperands; ++i) {
10177 SDValue SubVec = Op.getOperand(i);
10178 if (SubVec.isUndef())
10179 continue;
10180 if (ISD::isFreezeUndef(SubVec.getNode())) {
10181 // If the freeze(undef) has multiple uses then we must fold to zero.
10182 if (SubVec.hasOneUse()) {
10183 ++NumFreezeUndef;
10184 } else {
// Multi-use freeze(undef): count it as zero and remember it so all of
// its uses can be replaced with a zero vector below.
10185 ++NumZero;
10186 Undefs.insert(SubVec);
10187 }
10188 }
10189 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10190 ++NumZero;
10191 else {
10192 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10193 NonZeros |= 1 << i;
10194 ++NumNonZero;
10195 }
10196 }
10197
10198 // If we have more than 2 non-zeros, build each half separately.
10199 if (NumNonZero > 2) {
10200 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10201 ArrayRef<SDUse> Ops = Op->ops();
10202 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10203 Ops.slice(0, NumOperands/2));
10204 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10205 Ops.slice(NumOperands/2));
10206 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10207 }
10208
10209 // Otherwise, build it up through insert_subvectors.
// Base vector: zero if any operand must be zero, otherwise a (frozen)
// undef into which the non-zero subvectors are inserted.
10210 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
10211 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
10212 : DAG.getUNDEF(ResVT));
10213
10214 // Replace Undef operands with ZeroVector.
10215 for (SDValue U : Undefs)
// NOTE(review): doxygen line 10216 (the call performing the replacement,
// presumably DAG.ReplaceAllUsesOfValueWith(...) taking the two arguments
// on the next line) was dropped by the source extraction - confirm
// against the original file.
10217 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
10218
// Insert each non-zero subvector at its element offset.
10219 MVT SubVT = Op.getOperand(0).getSimpleValueType();
10220 unsigned NumSubElems = SubVT.getVectorNumElements();
10221 for (unsigned i = 0; i != NumOperands; ++i) {
10222 if ((NonZeros & (1 << i)) == 0)
10223 continue;
10224
10225 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
10226 DAG.getVectorIdxConstant(i * NumSubElems, dl));
10227 }
10228
10229 return Vec;
10230}
10231
10232// Returns true if the given node is a type promotion (by concatenating i1
10233// zeros) of the result of a node that already zeros all upper bits of
10234// k-register.
10235// TODO: Merge this with LowerAVXCONCAT_VECTORS?
// NOTE(review): the opening line of this definition (doxygen line 10236,
// presumably the "static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, ..."
// signature) was dropped by the source extraction - confirm against the
// original file.
10237 const X86Subtarget &Subtarget,
10238 SelectionDAG & DAG) {
10239 MVT ResVT = Op.getSimpleValueType();
10240 unsigned NumOperands = Op.getNumOperands();
10241 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
10242 "Unexpected number of operands in CONCAT_VECTORS");
10243
// Record, one bit per operand index, which operands are known all-zero
// and which are non-zero. Undef operands set neither bit.
10244 uint64_t Zeros = 0;
10245 uint64_t NonZeros = 0;
10246 for (unsigned i = 0; i != NumOperands; ++i) {
10247 SDValue SubVec = Op.getOperand(i);
10248 if (SubVec.isUndef())
10249 continue;
10250 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10251 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10252 Zeros |= (uint64_t)1 << i;
10253 else
10254 NonZeros |= (uint64_t)1 << i;
10255 }
10256
10257 unsigned NumElems = ResVT.getVectorNumElements();
10258
10259 // If we are inserting non-zero vector and there are zeros in LSBs and undef
10260 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
10261 // insert_subvector will give us two kshifts.
10262 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
10263 Log2_64(NonZeros) != NumOperands - 1) {
// Exactly one non-zero operand, above all the zero operands but not in
// the topmost slot: widen it, shift it into place with one KSHIFTL, then
// extract the result type.
10264 unsigned Idx = Log2_64(NonZeros);
10265 SDValue SubVec = Op.getOperand(Idx);
10266 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10267 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
10268 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
10269 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
10270 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
10271 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
10272 DAG.getVectorIdxConstant(0, dl));
10273 }
10274
10275 // If there are zero or one non-zeros we can handle this very simply.
10276 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
10277 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
10278 if (!NonZeros)
10279 return Vec;
10280 unsigned Idx = Log2_64(NonZeros);
10281 SDValue SubVec = Op.getOperand(Idx);
10282 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10283 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
10284 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
10285 }
10286
// More than two operands with multiple non-zeros: recurse on each half
// and concatenate the two halves.
10287 if (NumOperands > 2) {
10288 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10289 ArrayRef<SDUse> Ops = Op->ops();
10290 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10291 Ops.slice(0, NumOperands / 2));
10292 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10293 Ops.slice(NumOperands / 2));
10294 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10295 }
10296
10297 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
10298
10299 if (ResVT.getVectorNumElements() >= 16)
10300 return Op; // The operation is legal with KUNPCK
10301
// Narrow mask vector (< 16 elements) with two non-zero halves: build it
// with two insert_subvectors.
10302 SDValue Vec =
10303 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
10304 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
10305 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
10306 DAG.getVectorIdxConstant(NumElems / 2, dl));
10307}
10308
// NOTE(review): the opening line of this definition (doxygen line 10309,
// presumably the "static SDValue LowerCONCAT_VECTORS(SDValue Op," signature)
// was dropped by the source extraction - confirm against the original file.
10310 const X86Subtarget &Subtarget,
10311 SelectionDAG &DAG) {
10312 SDLoc DL(Op);
10313 MVT VT = Op.getSimpleValueType();
// Mask vectors (vXi1) take their own lowering path.
10314 if (VT.getVectorElementType() == MVT::i1)
10315 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
10316
10317 // AVX can use the vinsertf128 instruction to create 256-bit vectors
10318 // from two other 128-bit ones.
10319 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
10320 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
10321 (VT.is512BitVector() &&
10322 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
10323 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
10324}
10325
10326//===----------------------------------------------------------------------===//
10327// Vector shuffle lowering
10328//
10329// This is an experimental code path for lowering vector shuffles on x86. It is
10330// designed to handle arbitrary vector shuffles and blends, gracefully
10331// degrading performance as necessary. It works hard to recognize idiomatic
10332// shuffles and lower them to optimal instruction patterns without leaving
10333// a framework that allows reasonably efficient handling of all vector shuffle
10334// patterns.
10335//===----------------------------------------------------------------------===//
10336
10337/// Checks whether the vector elements referenced by two shuffle masks are
10338/// equivalent.
10339static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
10340 int Idx, int ExpectedIdx) {
10341 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
10342 ExpectedIdx < MaskSize && "Out of range element index");
// Both operands must exist and share an opcode before any structural
// comparison makes sense.
10343 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
10344 return false;
10345
10346 EVT VT = Op.getValueType();
10347 EVT ExpectedVT = ExpectedOp.getValueType();
10348
10349 // Sources must be vectors and match the mask's element count.
10350 if (!VT.isVector() || !ExpectedVT.isVector() ||
10351 (int)VT.getVectorNumElements() != MaskSize ||
10352 (int)ExpectedVT.getVectorNumElements() != MaskSize)
10353 return false;
10354
10355 // Exact match.
10356 if (Idx == ExpectedIdx && Op == ExpectedOp)
10357 return true;
10358
10359 switch (Op.getOpcode()) {
10360 case ISD::BUILD_VECTOR:
10361 // If the values are build vectors, we can look through them to find
10362 // equivalent inputs that make the shuffles equivalent.
10363 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
10364 case ISD::BITCAST: {
// NOTE(review): doxygen line 10365 (presumably
// "SDValue Src = Op.getOperand(0);") was dropped by the source
// extraction - confirm against the original file.
10366 EVT SrcVT = Src.getValueType();
10367 if (Op == ExpectedOp && SrcVT.isVector()) {
// Source elements wider than ours: the two indices refer to the same
// source element iff they agree modulo the width ratio and the shared
// source element is itself equivalent.
10368 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
10369 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
10370 return (Idx % Scale) == (ExpectedIdx % Scale) &&
10371 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
10372 Idx / Scale, ExpectedIdx / Scale);
10373 }
// Source elements narrower than ours: every covered source sub-element
// must be pairwise equivalent.
10374 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
10375 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
10376 for (unsigned I = 0; I != Scale; ++I)
10377 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
10378 (Idx * Scale) + I,
10379 (ExpectedIdx * Scale) + I))
10380 return false;
10381 return true;
10382 }
10383 }
10384 break;
10385 }
10386 case ISD::VECTOR_SHUFFLE: {
// Same shuffle node: equivalent iff both positions read the same mask
// element.
10387 auto *SVN = cast<ShuffleVectorSDNode>(Op);
10388 return Op == ExpectedOp &&
10389 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
10390 }
10391 case X86ISD::VBROADCAST:
10392 case X86ISD::VBROADCAST_LOAD:
// A broadcast repeats a single scalar, so any two elements of the same
// node are equivalent.
10393 return Op == ExpectedOp;
10394 case X86ISD::SUBV_BROADCAST_LOAD:
// A subvector broadcast repeats with the period of the loaded vector.
10395 if (Op == ExpectedOp) {
10396 auto *MemOp = cast<MemSDNode>(Op);
10397 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
10398 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
10399 }
10400 break;
10401 case X86ISD::VPERMI: {
// Decode the immediate permute mask and compare the referenced source
// elements instead.
10402 if (Op == ExpectedOp) {
// NOTE(review): doxygen line 10403 (presumably the declaration of the
// decoded "Mask" SmallVector) was dropped by the source extraction -
// confirm against the original file.
10404 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
10405 SDValue Src = Op.getOperand(0);
10406 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
10407 Mask[ExpectedIdx]);
10408 }
10409 break;
10410 }
10411 case X86ISD::HADD:
10412 case X86ISD::HSUB:
10413 case X86ISD::FHADD:
10414 case X86ISD::FHSUB:
10415 case X86ISD::PACKSS:
10416 case X86ISD::PACKUS:
10417 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
10418 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
10419 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
10420 int NumElts = VT.getVectorNumElements();
10421 int NumLanes = VT.getSizeInBits() / 128;
10422 int NumEltsPerLane = NumElts / NumLanes;
10423 int NumHalfEltsPerLane = NumEltsPerLane / 2;
10424 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
10425 bool SameElt =
10426 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
10427 return SameLane && SameElt;
10428 }
10429 break;
10430 }
10431
10432 return false;
10433}
10434
10435/// Tiny helper function to identify a no-op mask.
10436///
10437/// This is a somewhat boring predicate function. It checks whether the mask
10438/// array input, which is assumed to be a single-input shuffle mask of the kind
10439/// used by the X86 shuffle instructions (not a fully general
10440/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
10441/// in-place shuffle are 'no-op's.
// NOTE(review): the opening line of this definition (doxygen line 10442,
// presumably "static bool isNoopShuffleMask(ArrayRef<int> Mask) {") was
// dropped by the source extraction - confirm against the original file.
10443 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10444 assert(Mask[i] >= -1 && "Out of bound mask element!");
// Any entry that is neither undef (-1) nor an identity index means data
// actually moves.
10445 if (Mask[i] >= 0 && Mask[i] != i)
10446 return false;
10447 }
10448 return true;
10449}
10450
10451/// Test whether there are elements crossing LaneSizeInBits lanes in this
10452/// shuffle mask.
10453///
10454/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10455/// and we routinely test for these.
10456static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
10457 unsigned ScalarSizeInBits,
10458 ArrayRef<int> Mask) {
10459 assert(LaneSizeInBits && ScalarSizeInBits &&
10460 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10461 "Illegal shuffle lane size");
10462 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10463 int Size = Mask.size();
10464 for (int i = 0; i < Size; ++i)
10465 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10466 return true;
10467 return false;
10468}
10469
10470/// Test whether there are elements crossing 128-bit lanes in this
10471/// shuffle mask.
// NOTE(review): the opening line of this definition (doxygen line 10472,
// presumably the "static bool is128BitLaneCrossingShuffleMask(MVT VT, ..."
// signature) was dropped by the source extraction - confirm against the
// original file.
// Convenience wrapper: fixed 128-bit lanes, element width taken from VT.
10473 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
10474}
10475
10476/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
10477/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
10478/// better support 'repeated mask + lane permute' style shuffles.
10479static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
10480 unsigned ScalarSizeInBits,
10481 ArrayRef<int> Mask) {
10482 assert(LaneSizeInBits && ScalarSizeInBits &&
10483 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10484 "Illegal shuffle lane size");
10485 int NumElts = Mask.size();
10486 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
10487 int NumLanes = NumElts / NumEltsPerLane;
10488 if (NumLanes > 1) {
10489 for (int i = 0; i != NumLanes; ++i) {
10490 int SrcLane = -1;
10491 for (int j = 0; j != NumEltsPerLane; ++j) {
10492 int M = Mask[(i * NumEltsPerLane) + j];
10493 if (M < 0)
10494 continue;
10495 int Lane = (M % NumElts) / NumEltsPerLane;
10496 if (SrcLane >= 0 && SrcLane != Lane)
10497 return true;
10498 SrcLane = Lane;
10499 }
10500 }
10501 }
10502 return false;
10503}
10504
10505/// Test whether a shuffle mask is equivalent within each sub-lane.
10506///
10507/// This checks a shuffle mask to see if it is performing the same
10508/// lane-relative shuffle in each sub-lane. This trivially implies
10509/// that it is also not lane-crossing. It may however involve a blend from the
10510/// same lane of a second vector.
10511///
10512/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
10513/// non-trivial to compute in the face of undef lanes. The representation is
10514/// suitable for use with existing 128-bit shuffles as entries from the second
10515/// vector have been remapped to [LaneSize, 2*LaneSize).
10516static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10517 ArrayRef<int> Mask,
10518 SmallVectorImpl<int> &RepeatedMask) {
10519 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10520 RepeatedMask.assign(LaneSize, -1);
10521 int Size = Mask.size();
10522 for (int i = 0; i < Size; ++i) {
10523 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10524 if (Mask[i] < 0)
10525 continue;
10526 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10527 // This entry crosses lanes, so there is no way to model this shuffle.
10528 return false;
10529
10530 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10531 // Adjust second vector indices to start at LaneSize instead of Size.
10532 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10533 : Mask[i] % LaneSize + LaneSize;
10534 if (RepeatedMask[i % LaneSize] < 0)
10535 // This is the first non-undef entry in this slot of a 128-bit lane.
10536 RepeatedMask[i % LaneSize] = LocalM;
10537 else if (RepeatedMask[i % LaneSize] != LocalM)
10538 // Found a mismatch with the repeated mask.
10539 return false;
10540 }
10541 return true;
10542}
10543
10544/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10545static bool
// NOTE(review): doxygen line 10546 (the function name line, presumably
// "is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,") was
// dropped by the source extraction - confirm against the original file.
10547 SmallVectorImpl<int> &RepeatedMask) {
// Forwards to isRepeatedShuffleMask with a fixed 128-bit lane width.
10548 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10549}
10550
// Predicate-only variant: tests whether the mask repeats across 128-bit
// lanes and discards the computed per-lane mask.
10551static bool
// NOTE(review): doxygen line 10552 (the function name line of this
// 128-bit repeated-mask predicate overload) was dropped by the source
// extraction - confirm against the original file.
10553 SmallVector<int, 32> RepeatedMask;
10554 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10555}
10556
10557/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10558static bool
// NOTE(review): doxygen line 10559 (the function name line, presumably
// "is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,") was
// dropped by the source extraction - confirm against the original file.
10560 SmallVectorImpl<int> &RepeatedMask) {
// Forwards to isRepeatedShuffleMask with a fixed 256-bit lane width.
10561 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10562}
10563
10564/// Test whether a target shuffle mask is equivalent within each sub-lane.
10565/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10566static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10567 unsigned EltSizeInBits,
10568 ArrayRef<int> Mask,
10569 SmallVectorImpl<int> &RepeatedMask) {
10570 int LaneSize = LaneSizeInBits / EltSizeInBits;
10571 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10572 int Size = Mask.size();
10573 for (int i = 0; i < Size; ++i) {
10574 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10575 if (Mask[i] == SM_SentinelUndef)
10576 continue;
10577 if (Mask[i] == SM_SentinelZero) {
10578 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10579 return false;
10580 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10581 continue;
10582 }
10583 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10584 // This entry crosses lanes, so there is no way to model this shuffle.
10585 return false;
10586
10587 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10588 // later vector indices to start at multiples of LaneSize instead of Size.
10589 int LaneM = Mask[i] / Size;
10590 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10591 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10592 // This is the first non-undef entry in this slot of a 128-bit lane.
10593 RepeatedMask[i % LaneSize] = LocalM;
10594 else if (RepeatedMask[i % LaneSize] != LocalM)
10595 // Found a mismatch with the repeated mask.
10596 return false;
10597 }
10598 return true;
10599}
10600
10601/// Test whether a target shuffle mask is equivalent within each sub-lane.
10602/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10603static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10604 ArrayRef<int> Mask,
10605 SmallVectorImpl<int> &RepeatedMask) {
10606 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10607 Mask, RepeatedMask);
10608}
10609
10610/// Checks whether a shuffle mask is equivalent to an explicit list of
10611/// arguments.
10612///
10613/// This is a fast way to test a shuffle mask against a fixed pattern:
10614///
10615/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
10616///
10617/// It returns true if the mask is exactly as wide as the argument list, and
10618/// each element of the mask is either -1 (signifying undef) or the value given
10619/// in the argument.
10620static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10621 SDValue V1 = SDValue(),
10622 SDValue V2 = SDValue()) {
10623 int Size = Mask.size();
10624 if (Size != (int)ExpectedMask.size())
10625 return false;
10626
10627 for (int i = 0; i < Size; ++i) {
10628 assert(Mask[i] >= -1 && "Out of bound mask element!");
10629 int MaskIdx = Mask[i];
10630 int ExpectedIdx = ExpectedMask[i];
10631 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10632 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10633 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10634 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10635 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10636 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10637 return false;
10638 }
10639 }
10640 return true;
10641}
10642
10643/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10644///
10645/// The masks must be exactly the same width.
10646///
10647/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10648/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10649///
10650/// SM_SentinelZero is accepted as a valid negative index but must match in
10651/// both, or via a known bits test.
10653 ArrayRef<int> ExpectedMask,
10654 const SelectionDAG &DAG,
10655 SDValue V1 = SDValue(),
10656 SDValue V2 = SDValue()) {
10657 int Size = Mask.size();
10658 if (Size != (int)ExpectedMask.size())
10659 return false;
10660 assert(llvm::all_of(ExpectedMask,
10661 [Size](int M) {
10662 return M == SM_SentinelZero ||
10663 isInRange(M, 0, 2 * Size);
10664 }) &&
10665 "Illegal target shuffle mask");
10666
10667 // Check for out-of-range target shuffle mask indices.
10668 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10669 return false;
10670
10671 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10672 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10673 !V1.getValueType().isVector()))
10674 V1 = SDValue();
10675 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10676 !V2.getValueType().isVector()))
10677 V2 = SDValue();
10678
10679 APInt ZeroV1 = APInt::getZero(Size);
10680 APInt ZeroV2 = APInt::getZero(Size);
10681
10682 for (int i = 0; i < Size; ++i) {
10683 int MaskIdx = Mask[i];
10684 int ExpectedIdx = ExpectedMask[i];
10685 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10686 continue;
10687 // If we failed to match an expected SM_SentinelZero then early out.
10688 if (ExpectedIdx < 0)
10689 return false;
10690 if (MaskIdx == SM_SentinelZero) {
10691 // If we need this expected index to be a zero element, then update the
10692 // relevant zero mask and perform the known bits at the end to minimize
10693 // repeated computes.
10694 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10695 if (ExpectedV &&
10696 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10697 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10698 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10699 ZeroMask.setBit(BitIdx);
10700 continue;
10701 }
10702 }
10703 if (MaskIdx >= 0) {
10704 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10705 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10706 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10707 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10708 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10709 continue;
10710 }
10711 return false;
10712 }
10713 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10714 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10715}
10716
10717// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10718// instructions.
10720 const SelectionDAG &DAG) {
10721 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10722 return false;
10723
10724 SmallVector<int, 8> Unpcklwd;
10725 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10726 /* Unary = */ false);
10727 SmallVector<int, 8> Unpckhwd;
10728 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10729 /* Unary = */ false);
10730 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10731 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10732 return IsUnpackwdMask;
10733}
10734
10736 const SelectionDAG &DAG) {
10737 // Create 128-bit vector type based on mask size.
10738 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10739 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10740
10741 // We can't assume a canonical shuffle mask, so try the commuted version too.
10742 SmallVector<int, 4> CommutedMask(Mask);
10744
10745 // Match any of unary/binary or low/high.
10746 for (unsigned i = 0; i != 4; ++i) {
10747 SmallVector<int, 16> UnpackMask;
10748 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10749 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10750 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10751 return true;
10752 }
10753 return false;
10754}
10755
10756/// Return true if a shuffle mask chooses elements identically in its top and
10757/// bottom halves. For example, any splat mask has the same top and bottom
10758/// halves. If an element is undefined in only one half of the mask, the halves
10759/// are not considered identical.
10761 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10762 unsigned HalfSize = Mask.size() / 2;
10763 for (unsigned i = 0; i != HalfSize; ++i) {
10764 if (Mask[i] != Mask[i + HalfSize])
10765 return false;
10766 }
10767 return true;
10768}
10769
10770/// Get a 4-lane 8-bit shuffle immediate for a mask.
10771///
10772/// This helper function produces an 8-bit shuffle immediate corresponding to
10773/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10774/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10775/// example.
10776///
10777/// NB: We rely heavily on "undef" masks preserving the input lane.
10778static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10779 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10780 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10781 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10782 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10783 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10784
10785 // If the mask only uses one non-undef element, then fully 'splat' it to
10786 // improve later broadcast matching.
10787 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10788 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10789
10790 int FirstElt = Mask[FirstIndex];
10791 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10792 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10793
10794 unsigned Imm = 0;
10795 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10796 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10797 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10798 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10799 return Imm;
10800}
10801
10803 SelectionDAG &DAG) {
10804 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10805}
10806
10807// Canonicalize SHUFPD mask to improve chances of further folding.
10808// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10809static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10810 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10811 "Unexpected SHUFPD mask size");
10812 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10813 "Unexpected SHUFPD mask elements");
10814
10815 // If the mask only uses one non-undef element, then fully 'splat' it to
10816 // improve later broadcast matching.
10817 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10818 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10819 "All undef shuffle mask");
10820
10821 int FirstElt = Mask[FirstIndex];
10822 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10823 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10824 unsigned Imm = 0;
10825 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10826 Imm |= FirstElt << I;
10827 return Imm;
10828 }
10829
10830 // Attempt to keep any undef elements in place to improve chances of the
10831 // shuffle becoming a (commutative) blend.
10832 unsigned Imm = 0;
10833 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10834 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10835
10836 return Imm;
10837}
10838
10840 SelectionDAG &DAG) {
10841 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10842}
10843
10844// The Shuffle result is as follow:
10845// 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.
10846// Each Zeroable's element correspond to a particular Mask's element.
10847// As described in computeZeroableShuffleElements function.
10848//
10849// The function looks for a sub-mask that the nonzero elements are in
10850// increasing order. If such sub-mask exist. The function returns true.
10851static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10852 ArrayRef<int> Mask, const EVT &VectorType,
10853 bool &IsZeroSideLeft) {
10854 int NextElement = -1;
10855 // Check if the Mask's nonzero elements are in increasing order.
10856 for (int i = 0, e = Mask.size(); i < e; i++) {
10857 // Checks if the mask's zeros elements are built from only zeros.
10858 assert(Mask[i] >= -1 && "Out of bound mask element!");
10859 if (Mask[i] < 0)
10860 return false;
10861 if (Zeroable[i])
10862 continue;
10863 // Find the lowest non zero element
10864 if (NextElement < 0) {
10865 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10866 IsZeroSideLeft = NextElement != 0;
10867 }
10868 // Exit if the mask's non zero elements are not in increasing order.
10869 if (NextElement != Mask[i])
10870 return false;
10871 NextElement++;
10872 }
10873 return true;
10874}
10875
10876static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10878 const X86Subtarget &Subtarget,
10879 unsigned Depth = 0);
10880
10881/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10883 ArrayRef<int> Mask, SDValue V1,
10884 SDValue V2, const APInt &Zeroable,
10885 const X86Subtarget &Subtarget,
10886 SelectionDAG &DAG) {
10887 int Size = Mask.size();
10888 int LaneSize = 128 / VT.getScalarSizeInBits();
10889 const int NumBytes = VT.getSizeInBits() / 8;
10890 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10891
10892 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10893 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10894 (Subtarget.hasBWI() && VT.is512BitVector()));
10895
10896 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10897 // Sign bit set in i8 mask means zero element.
10898 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10899
10900 SDValue V;
10901 for (int i = 0; i < NumBytes; ++i) {
10902 int M = Mask[i / NumEltBytes];
10903 if (M < 0) {
10904 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10905 continue;
10906 }
10907 if (Zeroable[i / NumEltBytes]) {
10908 PSHUFBMask[i] = ZeroMask;
10909 continue;
10910 }
10911
10912 // We can only use a single input of V1 or V2.
10913 SDValue SrcV = (M >= Size ? V2 : V1);
10914 if (V && V != SrcV)
10915 return SDValue();
10916 V = SrcV;
10917 M %= Size;
10918
10919 // PSHUFB can't cross lanes, ensure this doesn't happen.
10920 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10921 return SDValue();
10922
10923 M = M % LaneSize;
10924 M = M * NumEltBytes + (i % NumEltBytes);
10925 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10926 }
10927 assert(V && "Failed to find a source input");
10928
10929 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10930 return DAG.getBitcast(
10931 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10932 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10933}
10934
10935static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10936 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10937 const SDLoc &dl);
10938
10939// X86 has dedicated shuffle that can be lowered to VEXPAND
10941 SDValue V2, ArrayRef<int> Mask,
10942 const APInt &Zeroable,
10943 const X86Subtarget &Subtarget,
10944 SelectionDAG &DAG) {
10945 bool IsLeftZeroSide = true;
10946 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10947 IsLeftZeroSide))
10948 return SDValue();
10949 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10951 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10952 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10953 unsigned NumElts = VT.getVectorNumElements();
10954 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10955 "Unexpected number of vector elements");
10956 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10957 Subtarget, DAG, DL);
10958 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10959 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10960 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10961}
10962
10963static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10964 unsigned &UnpackOpcode, bool IsUnary,
10965 ArrayRef<int> TargetMask, const SDLoc &DL,
10966 SelectionDAG &DAG,
10967 const X86Subtarget &Subtarget) {
10968 int NumElts = VT.getVectorNumElements();
10969
10970 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10971 for (int i = 0; i != NumElts; i += 2) {
10972 int M1 = TargetMask[i + 0];
10973 int M2 = TargetMask[i + 1];
10974 Undef1 &= (SM_SentinelUndef == M1);
10975 Undef2 &= (SM_SentinelUndef == M2);
10976 Zero1 &= isUndefOrZero(M1);
10977 Zero2 &= isUndefOrZero(M2);
10978 }
10979 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10980 "Zeroable shuffle detected");
10981
10982 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10983 SmallVector<int, 64> Unpckl, Unpckh;
10984 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10985 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10986 (IsUnary ? V1 : V2))) {
10987 UnpackOpcode = X86ISD::UNPCKL;
10988 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10989 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10990 return true;
10991 }
10992
10993 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10994 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10995 (IsUnary ? V1 : V2))) {
10996 UnpackOpcode = X86ISD::UNPCKH;
10997 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10998 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10999 return true;
11000 }
11001
11002 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
11003 if (IsUnary && (Zero1 || Zero2)) {
11004 // Don't bother if we can blend instead.
11005 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11006 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11007 return false;
11008
11009 bool MatchLo = true, MatchHi = true;
11010 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11011 int M = TargetMask[i];
11012
11013 // Ignore if the input is known to be zero or the index is undef.
11014 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11015 (M == SM_SentinelUndef))
11016 continue;
11017
11018 MatchLo &= (M == Unpckl[i]);
11019 MatchHi &= (M == Unpckh[i]);
11020 }
11021
11022 if (MatchLo || MatchHi) {
11023 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11024 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11025 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11026 return true;
11027 }
11028 }
11029
11030 // If a binary shuffle, commute and try again.
11031 if (!IsUnary) {
11033 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
11034 UnpackOpcode = X86ISD::UNPCKL;
11035 std::swap(V1, V2);
11036 return true;
11037 }
11038
11040 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
11041 UnpackOpcode = X86ISD::UNPCKH;
11042 std::swap(V1, V2);
11043 return true;
11044 }
11045 }
11046
11047 return false;
11048}
11049
11050// X86 has dedicated unpack instructions that can handle specific blend
11051// operations: UNPCKH and UNPCKL.
11053 SDValue V2, ArrayRef<int> Mask,
11054 SelectionDAG &DAG) {
11055 SmallVector<int, 8> Unpckl;
11056 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
11057 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11058 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
11059
11060 SmallVector<int, 8> Unpckh;
11061 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
11062 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11063 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
11064
11065 // Commute and try again.
11067 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11068 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
11069
11071 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11072 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
11073
11074 return SDValue();
11075}
11076
11077/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
11078/// followed by unpack 256-bit.
11080 SDValue V2, ArrayRef<int> Mask,
11081 SelectionDAG &DAG) {
11082 SmallVector<int, 32> Unpckl, Unpckh;
11083 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
11084 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
11085
11086 unsigned UnpackOpcode;
11087 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11088 UnpackOpcode = X86ISD::UNPCKL;
11089 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11090 UnpackOpcode = X86ISD::UNPCKH;
11091 else
11092 return SDValue();
11093
11094 // This is a "natural" unpack operation (rather than the 128-bit sectored
11095 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
11096 // input in order to use the x86 instruction.
11097 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
11098 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
11099 V1 = DAG.getBitcast(VT, V1);
11100 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
11101}
11102
11103// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
11104// source into the lower elements and zeroing the upper elements.
11105static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
11106 ArrayRef<int> Mask, const APInt &Zeroable,
11107 const X86Subtarget &Subtarget) {
11108 if (!VT.is512BitVector() && !Subtarget.hasVLX())
11109 return false;
11110
11111 unsigned NumElts = Mask.size();
11112 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11113 unsigned MaxScale = 64 / EltSizeInBits;
11114
11115 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11116 unsigned SrcEltBits = EltSizeInBits * Scale;
11117 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11118 continue;
11119 unsigned NumSrcElts = NumElts / Scale;
11120 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
11121 continue;
11122 unsigned UpperElts = NumElts - NumSrcElts;
11123 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
11124 continue;
11125 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
11126 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
11127 DstVT = MVT::getIntegerVT(EltSizeInBits);
11128 if ((NumSrcElts * EltSizeInBits) >= 128) {
11129 // ISD::TRUNCATE
11130 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
11131 } else {
11132 // X86ISD::VTRUNC
11133 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
11134 }
11135 return true;
11136 }
11137
11138 return false;
11139}
11140
11141// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
11142// element padding to the final DstVT.
11143static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
11144 const X86Subtarget &Subtarget,
11145 SelectionDAG &DAG, bool ZeroUppers) {
11146 MVT SrcVT = Src.getSimpleValueType();
11147 MVT DstSVT = DstVT.getScalarType();
11148 unsigned NumDstElts = DstVT.getVectorNumElements();
11149 unsigned NumSrcElts = SrcVT.getVectorNumElements();
11150 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
11151
11152 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
11153 return SDValue();
11154
11155 // Perform a direct ISD::TRUNCATE if possible.
11156 if (NumSrcElts == NumDstElts)
11157 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
11158
11159 if (NumSrcElts > NumDstElts) {
11160 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11161 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11162 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
11163 }
11164
11165 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
11166 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11167 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11168 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11169 DstVT.getSizeInBits());
11170 }
11171
11172 // Non-VLX targets must truncate from a 512-bit type, so we need to
11173 // widen, truncate and then possibly extract the original subvector.
11174 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
11175 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
11176 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
11177 }
11178
11179 // Fallback to a X86ISD::VTRUNC, padding if necessary.
11180 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
11181 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
11182 if (DstVT != TruncVT)
11183 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11184 DstVT.getSizeInBits());
11185 return Trunc;
11186}
11187
11188// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
11189//
11190// An example is the following:
11191//
11192// t0: ch = EntryToken
11193// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
11194// t25: v4i32 = truncate t2
11195// t41: v8i16 = bitcast t25
11196// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
11197// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
11198// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
11199// t18: v2i64 = bitcast t51
11200//
11201// One can just use a single vpmovdw instruction, without avx512vl we need to
11202// use the zmm variant and extract the lower subvector, padding with zeroes.
11203// TODO: Merge with lowerShuffleAsVTRUNC.
11205 SDValue V2, ArrayRef<int> Mask,
11206 const APInt &Zeroable,
11207 const X86Subtarget &Subtarget,
11208 SelectionDAG &DAG) {
11209 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
11210 if (!Subtarget.hasAVX512())
11211 return SDValue();
11212
11213 unsigned NumElts = VT.getVectorNumElements();
11214 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11215 unsigned MaxScale = 64 / EltSizeInBits;
11216 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11217 unsigned SrcEltBits = EltSizeInBits * Scale;
11218 unsigned NumSrcElts = NumElts / Scale;
11219 unsigned UpperElts = NumElts - NumSrcElts;
11220 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11221 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
11222 continue;
11223
11224 // Attempt to find a matching source truncation, but as a fall back VLX
11225 // cases can use the VPMOV directly.
11226 SDValue Src = peekThroughBitcasts(V1);
11227 if (Src.getOpcode() == ISD::TRUNCATE &&
11228 Src.getScalarValueSizeInBits() == SrcEltBits) {
11229 Src = Src.getOperand(0);
11230 } else if (Subtarget.hasVLX()) {
11231 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11232 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11233 Src = DAG.getBitcast(SrcVT, Src);
11234 // Don't do this if PACKSS/PACKUS could perform it cheaper.
11235 if (Scale == 2 &&
11236 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
11237 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
11238 return SDValue();
11239 } else
11240 return SDValue();
11241
11242 // VPMOVWB is only available with avx512bw.
11243 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
11244 return SDValue();
11245
11246 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
11247 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11248 }
11249
11250 return SDValue();
11251}
11252
11253// Attempt to match binary shuffle patterns as a truncate.
11255 SDValue V2, ArrayRef<int> Mask,
11256 const APInt &Zeroable,
11257 const X86Subtarget &Subtarget,
11258 SelectionDAG &DAG) {
11259 assert((VT.is128BitVector() || VT.is256BitVector()) &&
11260 "Unexpected VTRUNC type");
11261 if (!Subtarget.hasAVX512() ||
11262 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
11263 return SDValue();
11264
11265 unsigned NumElts = VT.getVectorNumElements();
11266 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11267 unsigned MaxScale = 64 / EltSizeInBits;
11268 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11269 // TODO: Support non-BWI VPMOVWB truncations?
11270 unsigned SrcEltBits = EltSizeInBits * Scale;
11271 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11272 continue;
11273
11274 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
11275 // Bail if the V2 elements are undef.
11276 unsigned NumHalfSrcElts = NumElts / Scale;
11277 unsigned NumSrcElts = 2 * NumHalfSrcElts;
11278 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
11279 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
11280 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
11281 continue;
11282
11283 // The elements beyond the truncation must be undef/zero.
11284 unsigned UpperElts = NumElts - NumSrcElts;
11285 if (UpperElts > 0 &&
11286 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
11287 continue;
11288 bool UndefUppers =
11289 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
11290
11291 // As we're using both sources then we need to concat them together
11292 // and truncate from the double-sized src.
11293 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
11294
11295 // For offset truncations, ensure that the concat is cheap.
11296 SDValue Src =
11297 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
11298 if (!Src) {
11299 if (Offset)
11300 continue;
11301 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
11302 }
11303
11304 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11305 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11306 Src = DAG.getBitcast(SrcVT, Src);
11307
11308 // Shift the offset'd elements into place for the truncation.
11309 // TODO: Use getTargetVShiftByConstNode.
11310 if (Offset)
11311 Src = DAG.getNode(
11312 X86ISD::VSRLI, DL, SrcVT, Src,
11313 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
11314
11315 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11316 }
11317 }
11318
11319 return SDValue();
11320}
11321
11322/// Check whether a compaction lowering can be done by dropping even/odd
11323/// elements and compute how many times even/odd elements must be dropped.
11324///
11325/// This handles shuffles which take every Nth element where N is a power of
11326/// two. Example shuffle masks:
11327///
11328/// (even)
11329/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11330/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11331/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11332/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11333/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11334/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11335///
11336/// (odd)
11337/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
11338/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
11339///
11340/// Any of these lanes can of course be undef.
11341///
11342/// This routine only supports N <= 3.
11343/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11344/// for larger N.
11345///
11346/// \returns N above, or the number of times even/odd elements must be dropped
11347/// if there is such a number. Otherwise returns zero.
11348static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
11349 bool IsSingleInput) {
11350 // The modulus for the shuffle vector entries is based on whether this is
11351 // a single input or not.
11352 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11353 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11354 "We should only be called with masks with a power-of-2 size!");
11355
11356 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11357 int Offset = MatchEven ? 0 : 1;
11358
11359 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11360 // and 2^3 simultaneously. This is because we may have ambiguity with
11361 // partially undef inputs.
11362 bool ViableForN[3] = {true, true, true};
11363
11364 for (int i = 0, e = Mask.size(); i < e; ++i) {
11365 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11366 // want.
11367 if (Mask[i] < 0)
11368 continue;
11369
11370 bool IsAnyViable = false;
11371 for (unsigned j = 0; j != std::size(ViableForN); ++j)
11372 if (ViableForN[j]) {
11373 uint64_t N = j + 1;
11374
11375 // The shuffle mask must be equal to (i * 2^N) % M.
11376 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
11377 IsAnyViable = true;
11378 else
11379 ViableForN[j] = false;
11380 }
11381 // Early exit if we exhaust the possible powers of two.
11382 if (!IsAnyViable)
11383 break;
11384 }
11385
11386 for (unsigned j = 0; j != std::size(ViableForN); ++j)
11387 if (ViableForN[j])
11388 return j + 1;
11389
11390 // Return 0 as there is no viable power of two.
11391 return 0;
11392}
11393
11394// X86 has dedicated pack instructions that can handle specific truncation
11395// operations: PACKSS and PACKUS.
11396// Checks for compaction shuffle masks if MaxStages > 1.
11397// TODO: Add support for matching multiple PACKSS/PACKUS stages.
11398static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
11399 unsigned &PackOpcode, ArrayRef<int> TargetMask,
11400 const SelectionDAG &DAG,
11401 const X86Subtarget &Subtarget,
11402 unsigned MaxStages = 1) {
11403 unsigned NumElts = VT.getVectorNumElements();
11404 unsigned BitSize = VT.getScalarSizeInBits();
11405 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
11406 "Illegal maximum compaction");
11407
11408 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
11409 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
11410 unsigned NumPackedBits = NumSrcBits - BitSize;
11411 N1 = peekThroughBitcasts(N1);
11412 N2 = peekThroughBitcasts(N2);
11413 unsigned NumBits1 = N1.getScalarValueSizeInBits();
11414 unsigned NumBits2 = N2.getScalarValueSizeInBits();
11415 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
11416 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
11417 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
11418 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
11419 return false;
11420 if (Subtarget.hasSSE41() || BitSize == 8) {
11421 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
11422 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
11423 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
11424 V1 = N1;
11425 V2 = N2;
11426 SrcVT = PackVT;
11427 PackOpcode = X86ISD::PACKUS;
11428 return true;
11429 }
11430 }
11431 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
11432 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
11433 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
11434 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
11435 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
11436 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
11437 V1 = N1;
11438 V2 = N2;
11439 SrcVT = PackVT;
11440 PackOpcode = X86ISD::PACKSS;
11441 return true;
11442 }
11443 return false;
11444 };
11445
11446 // Attempt to match against wider and wider compaction patterns.
11447 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
11448 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
11449 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
11450
11451 // Try binary shuffle.
11452 SmallVector<int, 32> BinaryMask;
11453 createPackShuffleMask(VT, BinaryMask, false, NumStages);
11454 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
11455 if (MatchPACK(V1, V2, PackVT))
11456 return true;
11457
11458 // Try unary shuffle.
11459 SmallVector<int, 32> UnaryMask;
11460 createPackShuffleMask(VT, UnaryMask, true, NumStages);
11461 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
11462 if (MatchPACK(V1, V1, PackVT))
11463 return true;
11464 }
11465
11466 return false;
11467}
11468
11470 SDValue V2, ArrayRef<int> Mask,
11471 const X86Subtarget &Subtarget,
11472 SelectionDAG &DAG) {
11473 MVT PackVT;
11474 unsigned PackOpcode;
11475 unsigned SizeBits = VT.getSizeInBits();
11476 unsigned EltBits = VT.getScalarSizeInBits();
11477 unsigned MaxStages = Log2_32(64 / EltBits);
11478 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
11479 Subtarget, MaxStages))
11480 return SDValue();
11481
11482 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
11483 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
11484
11485 // Don't lower multi-stage packs on AVX512, truncation is better.
11486 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
11487 return SDValue();
11488
11489 // Pack to the largest type possible:
11490 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
11491 unsigned MaxPackBits = 16;
11492 if (CurrentEltBits > 16 &&
11493 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
11494 MaxPackBits = 32;
11495
11496 // Repeatedly pack down to the target size.
11497 SDValue Res;
11498 for (unsigned i = 0; i != NumStages; ++i) {
11499 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
11500 unsigned NumSrcElts = SizeBits / SrcEltBits;
11501 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11502 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
11503 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11504 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
11505 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
11506 DAG.getBitcast(SrcVT, V2));
11507 V1 = V2 = Res;
11508 CurrentEltBits /= 2;
11509 }
11510 assert(Res && Res.getValueType() == VT &&
11511 "Failed to lower compaction shuffle");
11512 return Res;
11513}
11514
11515/// Try to emit a bitmask instruction for a shuffle.
11516///
11517/// This handles cases where we can model a blend exactly as a bitmask due to
11518/// one of the inputs being zeroable.
11520 SDValue V2, ArrayRef<int> Mask,
11521 const APInt &Zeroable,
11522 const X86Subtarget &Subtarget,
11523 SelectionDAG &DAG) {
11524 MVT MaskVT = VT;
11525 MVT EltVT = VT.getVectorElementType();
11526 SDValue Zero, AllOnes;
11527 // Use f64 if i64 isn't legal.
11528 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11529 EltVT = MVT::f64;
11530 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11531 }
11532
11533 MVT LogicVT = VT;
11534 if (EltVT.isFloatingPoint()) {
11535 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11536 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11537 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11538 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11539 } else {
11540 Zero = DAG.getConstant(0, DL, EltVT);
11541 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11542 }
11543
11544 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11545 SDValue V;
11546 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11547 if (Zeroable[i])
11548 continue;
11549 if (Mask[i] % Size != i)
11550 return SDValue(); // Not a blend.
11551 if (!V)
11552 V = Mask[i] < Size ? V1 : V2;
11553 else if (V != (Mask[i] < Size ? V1 : V2))
11554 return SDValue(); // Can only let one input through the mask.
11555
11556 VMaskOps[i] = AllOnes;
11557 }
11558 if (!V)
11559 return SDValue(); // No non-zeroable elements!
11560
11561 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11562 VMask = DAG.getBitcast(LogicVT, VMask);
11563 V = DAG.getBitcast(LogicVT, V);
11564 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11565 return DAG.getBitcast(VT, And);
11566}
11567
11568/// Try to emit a blend instruction for a shuffle using bit math.
11569///
11570/// This is used as a fallback approach when first class blend instructions are
11571/// unavailable. Currently it is only suitable for integer vectors, but could
11572/// be generalized for floating point vectors if desirable.
11574 SDValue V2, ArrayRef<int> Mask,
11575 SelectionDAG &DAG) {
11576 assert(VT.isInteger() && "Only supports integer vector types!");
11577 MVT EltVT = VT.getVectorElementType();
11578 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11579 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11581 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11582 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11583 return SDValue(); // Shuffled input!
11584 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11585 }
11586
11587 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11588 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11589}
11590
11592 SDValue PreservedSrc,
11593 const X86Subtarget &Subtarget,
11594 SelectionDAG &DAG);
11595
11598 const APInt &Zeroable, bool &ForceV1Zero,
11599 bool &ForceV2Zero, uint64_t &BlendMask) {
11600 bool V1IsZeroOrUndef =
11602 bool V2IsZeroOrUndef =
11604
11605 BlendMask = 0;
11606 ForceV1Zero = false, ForceV2Zero = false;
11607 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11608
11609 int NumElts = Mask.size();
11610 int NumLanes = VT.getSizeInBits() / 128;
11611 int NumEltsPerLane = NumElts / NumLanes;
11612 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11613
11614 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11615 // then ensure the blend mask part for that lane just references that input.
11616 bool ForceWholeLaneMasks =
11617 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11618
11619 // Attempt to generate the binary blend mask. If an input is zero then
11620 // we can use any lane.
11621 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11622 // Keep track of the inputs used per lane.
11623 bool LaneV1InUse = false;
11624 bool LaneV2InUse = false;
11625 uint64_t LaneBlendMask = 0;
11626 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11627 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11628 int M = Mask[Elt];
11629 if (M == SM_SentinelUndef)
11630 continue;
11631 if (M == Elt || (0 <= M && M < NumElts &&
11632 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11633 Mask[Elt] = Elt;
11634 LaneV1InUse = true;
11635 continue;
11636 }
11637 if (M == (Elt + NumElts) ||
11638 (NumElts <= M &&
11639 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11640 LaneBlendMask |= 1ull << LaneElt;
11641 Mask[Elt] = Elt + NumElts;
11642 LaneV2InUse = true;
11643 continue;
11644 }
11645 if (Zeroable[Elt]) {
11646 if (V1IsZeroOrUndef) {
11647 ForceV1Zero = true;
11648 Mask[Elt] = Elt;
11649 LaneV1InUse = true;
11650 continue;
11651 }
11652 if (V2IsZeroOrUndef) {
11653 ForceV2Zero = true;
11654 LaneBlendMask |= 1ull << LaneElt;
11655 Mask[Elt] = Elt + NumElts;
11656 LaneV2InUse = true;
11657 continue;
11658 }
11659 }
11660 return false;
11661 }
11662
11663 // If we only used V2 then splat the lane blend mask to avoid any demanded
11664 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11665 // blend mask bit).
11666 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11667 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11668
11669 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11670 }
11671 return true;
11672}
11673
11674/// Try to emit a blend instruction for a shuffle.
11675///
11676/// This doesn't do any checks for the availability of instructions for blending
11677/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11678/// be matched in the backend with the type given. What it does check for is
11679/// that the shuffle mask is a blend, or convertible into a blend with zero.
11681 SDValue V2, ArrayRef<int> Original,
11682 const APInt &Zeroable,
11683 const X86Subtarget &Subtarget,
11684 SelectionDAG &DAG) {
11685 uint64_t BlendMask = 0;
11686 bool ForceV1Zero = false, ForceV2Zero = false;
11687 SmallVector<int, 64> Mask(Original);
11688 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11689 BlendMask))
11690 return SDValue();
11691
11692 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11693 if (ForceV1Zero)
11694 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11695 if (ForceV2Zero)
11696 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11697
11698 unsigned NumElts = VT.getVectorNumElements();
11699
11700 switch (VT.SimpleTy) {
11701 case MVT::v4i64:
11702 case MVT::v8i32:
11703 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11704 [[fallthrough]];
11705 case MVT::v4f64:
11706 case MVT::v8f32:
11707 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11708 [[fallthrough]];
11709 case MVT::v2f64:
11710 case MVT::v2i64:
11711 case MVT::v4f32:
11712 case MVT::v4i32:
11713 case MVT::v8i16:
11714 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11715 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11716 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11717 case MVT::v16i16: {
11718 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11719 SmallVector<int, 8> RepeatedMask;
11720 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11721 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11722 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11723 BlendMask = 0;
11724 for (int i = 0; i < 8; ++i)
11725 if (RepeatedMask[i] >= 8)
11726 BlendMask |= 1ull << i;
11727 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11728 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11729 }
11730 // Use PBLENDW for lower/upper lanes and then blend lanes.
11731 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11732 // merge to VSELECT where useful.
11733 uint64_t LoMask = BlendMask & 0xFF;
11734 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11735 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11736 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11737 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11738 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11739 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11740 return DAG.getVectorShuffle(
11741 MVT::v16i16, DL, Lo, Hi,
11742 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11743 }
11744 [[fallthrough]];
11745 }
11746 case MVT::v32i8:
11747 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11748 [[fallthrough]];
11749 case MVT::v16i8: {
11750 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11751
11752 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11753 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11754 Subtarget, DAG))
11755 return Masked;
11756
11757 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11758 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11759 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11760 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11761 }
11762
11763 // If we have VPTERNLOG, we can use that as a bit blend.
11764 if (Subtarget.hasVLX())
11765 if (SDValue BitBlend =
11766 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11767 return BitBlend;
11768
11769 // Scale the blend by the number of bytes per element.
11770 int Scale = VT.getScalarSizeInBits() / 8;
11771
11772 // This form of blend is always done on bytes. Compute the byte vector
11773 // type.
11774 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11775
11776 // x86 allows load folding with blendvb from the 2nd source operand. But
11777 // we are still using LLVM select here (see comment below), so that's V1.
11778 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11779 // allow that load-folding possibility.
11780 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11782 std::swap(V1, V2);
11783 }
11784
11785 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11786 // mix of LLVM's code generator and the x86 backend. We tell the code
11787 // generator that boolean values in the elements of an x86 vector register
11788 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11789 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11790 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11791 // of the element (the remaining are ignored) and 0 in that high bit would
11792 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11793 // the LLVM model for boolean values in vector elements gets the relevant
11794 // bit set, it is set backwards and over constrained relative to x86's
11795 // actual model.
11796 SmallVector<SDValue, 32> VSELECTMask;
11797 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11798 for (int j = 0; j < Scale; ++j)
11799 VSELECTMask.push_back(
11800 Mask[i] < 0
11801 ? DAG.getUNDEF(MVT::i8)
11802 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11803
11804 V1 = DAG.getBitcast(BlendVT, V1);
11805 V2 = DAG.getBitcast(BlendVT, V2);
11806 return DAG.getBitcast(
11807 VT,
11808 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11809 V1, V2));
11810 }
11811 case MVT::v16f32:
11812 case MVT::v8f64:
11813 case MVT::v8i64:
11814 case MVT::v16i32:
11815 case MVT::v32i16:
11816 case MVT::v64i8: {
11817 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11818 bool OptForSize = DAG.shouldOptForSize();
11819 if (!OptForSize) {
11820 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11821 Subtarget, DAG))
11822 return Masked;
11823 }
11824
11825 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11826 // masked move.
11827 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11828 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11829 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11830 }
11831 default:
11832 llvm_unreachable("Not a supported integer vector type!");
11833 }
11834}
11835
11836/// Try to lower as a blend of elements from two inputs followed by
11837/// a single-input permutation.
11838///
11839/// This matches the pattern where we can blend elements from two inputs and
11840/// then reduce the shuffle to a single-input permutation.
11842 SDValue V1, SDValue V2,
11843 ArrayRef<int> Mask,
11844 SelectionDAG &DAG,
11845 bool ImmBlends = false) {
11846 // We build up the blend mask while checking whether a blend is a viable way
11847 // to reduce the shuffle.
11848 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11849 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11850
11851 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11852 if (Mask[i] < 0)
11853 continue;
11854
11855 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11856
11857 if (BlendMask[Mask[i] % Size] < 0)
11858 BlendMask[Mask[i] % Size] = Mask[i];
11859 else if (BlendMask[Mask[i] % Size] != Mask[i])
11860 return SDValue(); // Can't blend in the needed input!
11861
11862 PermuteMask[i] = Mask[i] % Size;
11863 }
11864
11865 // If only immediate blends, then bail if the blend mask can't be widened to
11866 // i16.
11867 unsigned EltSize = VT.getScalarSizeInBits();
11868 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11869 return SDValue();
11870
11871 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11872 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11873}
11874
11875/// Try to lower as an unpack of elements from two inputs followed by
11876/// a single-input permutation.
11877///
11878/// This matches the pattern where we can unpack elements from two inputs and
11879/// then reduce the shuffle to a single-input (wider) permutation.
11881 SDValue V1, SDValue V2,
11882 ArrayRef<int> Mask,
11883 SelectionDAG &DAG) {
11884 int NumElts = Mask.size();
11885 int NumLanes = VT.getSizeInBits() / 128;
11886 int NumLaneElts = NumElts / NumLanes;
11887 int NumHalfLaneElts = NumLaneElts / 2;
11888
11889 bool MatchLo = true, MatchHi = true;
11890 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11891
11892 // Determine UNPCKL/UNPCKH type and operand order.
11893 for (int Elt = 0; Elt != NumElts; ++Elt) {
11894 int M = Mask[Elt];
11895 if (M < 0)
11896 continue;
11897
11898 // Normalize the mask value depending on whether it's V1 or V2.
11899 int NormM = M;
11900 SDValue &Op = Ops[Elt & 1];
11901 if (M < NumElts && (Op.isUndef() || Op == V1))
11902 Op = V1;
11903 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11904 Op = V2;
11905 NormM -= NumElts;
11906 } else
11907 return SDValue();
11908
11909 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11910 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11911 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11912 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11913 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11914 if (MatchLoAnyLane || MatchHiAnyLane) {
11915 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11916 "Failed to match UNPCKLO/UNPCKHI");
11917 break;
11918 }
11919 }
11920 MatchLo &= MatchLoAnyLane;
11921 MatchHi &= MatchHiAnyLane;
11922 if (!MatchLo && !MatchHi)
11923 return SDValue();
11924 }
11925 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11926
11927 // Element indices have changed after unpacking. Calculate permute mask
11928 // so that they will be put back to the position as dictated by the
11929 // original shuffle mask indices.
11930 SmallVector<int, 32> PermuteMask(NumElts, -1);
11931 for (int Elt = 0; Elt != NumElts; ++Elt) {
11932 int M = Mask[Elt];
11933 if (M < 0)
11934 continue;
11935 int NormM = M;
11936 if (NumElts <= M)
11937 NormM -= NumElts;
11938 bool IsFirstOp = M < NumElts;
11939 int BaseMaskElt =
11940 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11941 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11942 PermuteMask[Elt] = BaseMaskElt;
11943 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11944 PermuteMask[Elt] = BaseMaskElt + 1;
11945 assert(PermuteMask[Elt] != -1 &&
11946 "Input mask element is defined but failed to assign permute mask");
11947 }
11948
11949 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11950 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11951 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11952}
11953
11954/// Try to lower a shuffle as a permute of the inputs followed by an
11955/// UNPCK instruction.
11956///
11957/// This specifically targets cases where we end up with alternating between
11958/// the two inputs, and so can permute them into something that feeds a single
11959/// UNPCK instruction. Note that this routine only targets integer vectors
11960/// because for floating point vectors we have a generalized SHUFPS lowering
11961/// strategy that handles everything that doesn't *exactly* match an unpack,
11962/// making this clever lowering unnecessary.
11964 SDValue V1, SDValue V2,
11965 ArrayRef<int> Mask,
11966 const X86Subtarget &Subtarget,
11967 SelectionDAG &DAG) {
11968 int Size = Mask.size();
11969 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11970
11971 // This routine only supports 128-bit integer dual input vectors.
11972 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11973 return SDValue();
11974
11975 int NumLoInputs =
11976 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11977 int NumHiInputs =
11978 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11979
11980 bool UnpackLo = NumLoInputs >= NumHiInputs;
11981
11982 auto TryUnpack = [&](int ScalarSize, int Scale) {
11983 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11984 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11985
11986 for (int i = 0; i < Size; ++i) {
11987 if (Mask[i] < 0)
11988 continue;
11989
11990 // Each element of the unpack contains Scale elements from this mask.
11991 int UnpackIdx = i / Scale;
11992
11993 // We only handle the case where V1 feeds the first slots of the unpack.
11994 // We rely on canonicalization to ensure this is the case.
11995 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11996 return SDValue();
11997
11998 // Setup the mask for this input. The indexing is tricky as we have to
11999 // handle the unpack stride.
12000 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
12001 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
12002 Mask[i] % Size;
12003 }
12004
12005 // If we will have to shuffle both inputs to use the unpack, check whether
12006 // we can just unpack first and shuffle the result. If so, skip this unpack.
12007 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
12008 !isNoopShuffleMask(V2Mask))
12009 return SDValue();
12010
12011 // Shuffle the inputs into place.
12012 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12013 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12014
12015 // Cast the inputs to the type we will use to unpack them.
12016 MVT UnpackVT =
12017 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
12018 V1 = DAG.getBitcast(UnpackVT, V1);
12019 V2 = DAG.getBitcast(UnpackVT, V2);
12020
12021 // Unpack the inputs and cast the result back to the desired type.
12022 return DAG.getBitcast(
12023 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
12024 UnpackVT, V1, V2));
12025 };
12026
12027 // We try each unpack from the largest to the smallest to try and find one
12028 // that fits this mask.
12029 int OrigScalarSize = VT.getScalarSizeInBits();
12030 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
12031 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
12032 return Unpack;
12033
12034 // If we're shuffling with a zero vector then we're better off not doing
12035 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
12038 return SDValue();
12039
12040 // If none of the unpack-rooted lowerings worked (or were profitable) try an
12041 // initial unpack.
12042 if (NumLoInputs == 0 || NumHiInputs == 0) {
12043 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
12044 "We have to have *some* inputs!");
12045 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
12046
12047 // FIXME: We could consider the total complexity of the permute of each
12048 // possible unpacking. Or at the least we should consider how many
12049 // half-crossings are created.
12050 // FIXME: We could consider commuting the unpacks.
12051
12052 SmallVector<int, 32> PermMask((unsigned)Size, -1);
12053 for (int i = 0; i < Size; ++i) {
12054 if (Mask[i] < 0)
12055 continue;
12056
12057 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
12058
12059 PermMask[i] =
12060 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
12061 }
12062 return DAG.getVectorShuffle(
12063 VT, DL,
12064 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
12065 V1, V2),
12066 DAG.getUNDEF(VT), PermMask);
12067 }
12068
12069 return SDValue();
12070}
12071
12072/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12073/// permuting the elements of the result in place.
12075 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12076 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12077 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12078 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12079 (VT.is512BitVector() && !Subtarget.hasBWI()))
12080 return SDValue();
12081
12082 // We don't currently support lane crossing permutes.
12083 if (is128BitLaneCrossingShuffleMask(VT, Mask))
12084 return SDValue();
12085
12086 int Scale = VT.getScalarSizeInBits() / 8;
12087 int NumLanes = VT.getSizeInBits() / 128;
12088 int NumElts = VT.getVectorNumElements();
12089 int NumEltsPerLane = NumElts / NumLanes;
12090
12091 // Determine range of mask elts.
12092 bool Blend1 = true;
12093 bool Blend2 = true;
12094 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12095 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12096 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12097 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12098 int M = Mask[Lane + Elt];
12099 if (M < 0)
12100 continue;
12101 if (M < NumElts) {
12102 Blend1 &= (M == (Lane + Elt));
12103 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12104 M = M % NumEltsPerLane;
12105 Range1.first = std::min(Range1.first, M);
12106 Range1.second = std::max(Range1.second, M);
12107 } else {
12108 M -= NumElts;
12109 Blend2 &= (M == (Lane + Elt));
12110 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12111 M = M % NumEltsPerLane;
12112 Range2.first = std::min(Range2.first, M);
12113 Range2.second = std::max(Range2.second, M);
12114 }
12115 }
12116 }
12117
12118 // Bail if we don't need both elements.
12119 // TODO - it might be worth doing this for unary shuffles if the permute
12120 // can be widened.
12121 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12122 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12123 return SDValue();
12124
12125 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12126 return SDValue();
12127
12128 // Rotate the 2 ops so we can access both ranges, then permute the result.
12129 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12130 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12131 SDValue Rotate = DAG.getBitcast(
12132 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12133 DAG.getBitcast(ByteVT, Lo),
12134 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12135 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12136 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12137 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12138 int M = Mask[Lane + Elt];
12139 if (M < 0)
12140 continue;
12141 if (M < NumElts)
12142 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12143 else
12144 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12145 }
12146 }
12147 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12148 };
12149
12150 // Check if the ranges are small enough to rotate from either direction.
12151 if (Range2.second < Range1.first)
12152 return RotateAndPermute(V1, V2, Range1.first, 0);
12153 if (Range1.second < Range2.first)
12154 return RotateAndPermute(V2, V1, Range2.first, NumElts);
12155 return SDValue();
12156}
12157
12159 return isUndefOrEqual(Mask, 0);
12160}
12161
12163 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
12164}
12165
12166/// Check if the Mask consists of the same element repeated multiple times.
12168 size_t NumUndefs = 0;
12169 std::optional<int> UniqueElt;
12170 for (int Elt : Mask) {
12171 if (Elt == SM_SentinelUndef) {
12172 NumUndefs++;
12173 continue;
12174 }
12175 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
12176 return false;
12177 UniqueElt = Elt;
12178 }
12179 // Make sure the element is repeated enough times by checking the number of
12180 // undefs is small.
12181 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
12182}
12183
12184/// Generic routine to decompose a shuffle and blend into independent
12185/// blends and permutes.
12186///
12187/// This matches the extremely common pattern for handling combined
12188/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12189/// operations. It will try to pick the best arrangement of shuffles and
12190/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
// NOTE(review): the declaration line (12191) was dropped by extraction —
// presumably `static SDValue lowerShuffleAsDecomposedShuffleMerge(`; verify
// against upstream before building.
12192 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12193 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12194 int NumElts = Mask.size();
12195 int NumLanes = VT.getSizeInBits() / 128;
12196 int NumEltsPerLane = NumElts / NumLanes;
12197
12198 // Shuffle the input elements into the desired positions in V1 and V2 and
12199 // unpack/blend them together.
// Classify each output element by source: build a per-input shuffle mask
// (V1Mask/V2Mask) plus the final 2-input blend mask. V1Zero/V2Zero track
// whether every element demanded from that input is actually zeroable, and
// IsAlternating records whether V1 feeds only even lanes and V2 only odd.
12200 bool IsAlternating = true;
12201 bool V1Zero = true, V2Zero = true;
12202 SmallVector<int, 32> V1Mask(NumElts, -1);
12203 SmallVector<int, 32> V2Mask(NumElts, -1);
12204 SmallVector<int, 32> FinalMask(NumElts, -1);
12205 for (int i = 0; i < NumElts; ++i) {
12206 int M = Mask[i];
12207 if (M >= 0 && M < NumElts) {
12208 V1Mask[i] = M;
12209 FinalMask[i] = i;
12210 V1Zero &= Zeroable[i];
12211 IsAlternating &= (i & 1) == 0;
12212 } else if (M >= NumElts) {
12213 V2Mask[i] = M - NumElts;
12214 FinalMask[i] = i + NumElts;
12215 V2Zero &= Zeroable[i];
12216 IsAlternating &= (i & 1) == 1;
12217 }
12218 }
12219
12220 // If we effectively only demand the 0'th element of \p Input, and not only
12221 // as 0'th element, then broadcast said input,
12222 // and change \p InputMask to be a no-op (identity) mask.
12223 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
12224 &DAG](SDValue &Input,
12225 MutableArrayRef<int> InputMask) {
12226 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
// Without AVX2 a broadcast is only profitable for 32-bit+ elements that can
// fold a load; bail out otherwise.
12227 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
12228 !X86::mayFoldLoad(Input, Subtarget)))
12229 return;
12230 if (isNoopShuffleMask(InputMask))
12231 return;
12232 assert(isBroadcastShuffleMask(InputMask) &&
12233 "Expected to demand only the 0'th element.");
12234 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
// After broadcasting, every demanded element already holds the right value,
// so rewrite each defined mask entry to the identity index.
12235 for (auto I : enumerate(InputMask)) {
12236 int &InputMaskElt = I.value();
12237 if (InputMaskElt >= 0)
12238 InputMaskElt = I.index();
12239 }
12240 };
12241
12242 // Currently, we may need to produce one shuffle per input, and blend results.
12243 // It is possible that the shuffle for one of the inputs is already a no-op.
12244 // See if we can simplify non-no-op shuffles into broadcasts,
12245 // which we consider to be strictly better than an arbitrary shuffle.
12246 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
// NOTE(review): extraction dropped line 12247 here (the V2Mask half of this
// condition) — verify against upstream.
12248 canonicalizeBroadcastableInput(V1, V1Mask);
12249 canonicalizeBroadcastableInput(V2, V2Mask);
12250 }
12251
12252 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
12253 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
12254 // the shuffle may be able to fold with a load or other benefit. However, when
12255 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
12256 // pre-shuffle first is a better strategy.
12257 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
12258 // If we don't have blends, see if we can create a cheap unpack.
12259 if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
12260 (is128BitUnpackShuffleMask(V1Mask, DAG) ||
12261 is128BitUnpackShuffleMask(V2Mask, DAG)))
12262 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
12263 DL, VT, V1, V2, Mask, Subtarget, DAG))
12264 return PermUnpack;
12265
12266 // Only prefer immediate blends to unpack/rotate.
12267 if (SDValue BlendPerm =
12268 lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
12269 return BlendPerm;
12270
12271 // If either input vector provides only a single element which is repeated
12272 // multiple times, unpacking from both input vectors would generate worse
12273 // code. e.g. for
12274 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
12275 // it is better to process t4 first to create a vector of t4[0], then unpack
12276 // that vector with t2.
12277 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
// NOTE(review): extraction dropped line 12278 here (presumably the matching
// !isSingleElementRepeatedMask(V2Mask) condition) — verify against upstream.
12279 if (SDValue UnpackPerm =
12280 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
12281 return UnpackPerm;
12282
// NOTE(review): extraction dropped line 12283 here (the opening of the
// RotatePerm `if` whose call continues below) — verify against upstream.
12284 DL, VT, V1, V2, Mask, Subtarget, DAG))
12285 return RotatePerm;
12286
12287 // Unpack/rotate failed - try again with variable blends.
12288 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12289 DAG))
12290 return BlendPerm;
12291
12292 if (VT.getScalarSizeInBits() >= 32)
12293 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
12294 DL, VT, V1, V2, Mask, Subtarget, DAG))
12295 return PermUnpack;
12296 }
12297
12298 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
12299 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
12300 // TODO: It doesn't have to be alternating - but each lane mustn't have more
12301 // than half the elements coming from each source.
12302 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
// Rebuild the masks: compress each input's elements into the low half of
// each 128-bit lane, then merge with an unpack-style final mask.
12303 V1Mask.assign(NumElts, -1);
12304 V2Mask.assign(NumElts, -1);
12305 FinalMask.assign(NumElts, -1);
12306 for (int i = 0; i != NumElts; i += NumEltsPerLane)
12307 for (int j = 0; j != NumEltsPerLane; ++j) {
12308 int M = Mask[i + j];
12309 if (M >= 0 && M < NumElts) {
12310 V1Mask[i + (j / 2)] = M;
12311 FinalMask[i + j] = i + (j / 2);
12312 } else if (M >= NumElts) {
12313 V2Mask[i + (j / 2)] = M - NumElts;
12314 FinalMask[i + j] = i + (j / 2) + NumElts;
12315 }
12316 }
12317 }
12318
// Emit the two single-input pre-shuffles and the final 2-input merge.
12319 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12320 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12321 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
12322}
12323
12324static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
12325 const X86Subtarget &Subtarget,
12326 ArrayRef<int> Mask) {
12327 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12328 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
12329
12330 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
12331 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
12332 int MaxSubElts = 64 / EltSizeInBits;
12333 unsigned RotateAmt, NumSubElts;
12334 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
12335 MaxSubElts, NumSubElts, RotateAmt))
12336 return -1;
12337 unsigned NumElts = Mask.size();
12338 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
12339 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
12340 return RotateAmt;
12341}
12342
12343/// Lower shuffle using X86ISD::VROTLI rotations.
// NOTE(review): the declaration line (12344) was dropped by extraction —
// presumably `static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT,
// SDValue V1,`; verify against upstream before building.
12345 ArrayRef<int> Mask,
12346 const X86Subtarget &Subtarget,
12347 SelectionDAG &DAG) {
12348 // Only XOP + AVX512 targets have bit rotation instructions.
12349 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
12350 bool IsLegal =
12351 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
12352 if (!IsLegal && Subtarget.hasSSE3())
12353 return SDValue();
12354
12355 MVT RotateVT;
12356 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
12357 Subtarget, Mask);
12358 if (RotateAmt < 0)
12359 return SDValue();
12360
12361 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
12362 // expanded to OR(SRL,SHL), will be more efficient, but if they can
12363 // widen to vXi16 or more then existing lowering should will be better.
12364 if (!IsLegal) {
// A multiple-of-16 rotate amount means the mask could be widened to vXi16+,
// which other lowerings handle better than a shift pair.
12365 if ((RotateAmt % 16) == 0)
12366 return SDValue();
12367 // TODO: Use getTargetVShiftByConstNode.
// Emulate the rotate as OR(SHL, SRL) on the widened type.
12368 unsigned ShlAmt = RotateAmt;
12369 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
12370 V1 = DAG.getBitcast(RotateVT, V1);
12371 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
12372 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
12373 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
12374 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
12375 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
12376 return DAG.getBitcast(VT, Rot);
12377 }
12378
// Legal target: emit a single immediate bit-rotate on the widened type.
12379 SDValue Rot =
12380 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
12381 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
12382 return DAG.getBitcast(VT, Rot);
12383}
12384
12385/// Try to match a vector shuffle as an element rotation.
12386///
12387/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
// Returns the rotation amount in elements (-1 on failure). On success the
// by-reference V1/V2 are rewritten to the Lo/Hi inputs of the rotate.
// NOTE(review): the declaration line (12388) was dropped by extraction —
// presumably `static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,`;
// verify against upstream before building.
12389 ArrayRef<int> Mask) {
12390 int NumElts = Mask.size();
12391
12392 // We need to detect various ways of spelling a rotation:
12393 // [11, 12, 13, 14, 15, 0, 1, 2]
12394 // [-1, 12, 13, 14, -1, -1, 1, -1]
12395 // [-1, -1, -1, -1, -1, -1, 1, 2]
12396 // [ 3, 4, 5, 6, 7, 8, 9, 10]
12397 // [-1, 4, 5, 6, -1, -1, 9, -1]
12398 // [-1, 4, 5, 6, -1, -1, -1, -1]
12399 int Rotation = 0;
12400 SDValue Lo, Hi;
12401 for (int i = 0; i < NumElts; ++i) {
12402 int M = Mask[i];
12403 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
12404 "Unexpected mask index.");
12405 if (M < 0)
12406 continue;
12407
12408 // Determine where a rotated vector would have started.
12409 int StartIdx = i - (M % NumElts);
12410 if (StartIdx == 0)
12411 // The identity rotation isn't interesting, stop.
12412 return -1;
12413
12414 // If we found the tail of a vector the rotation must be the missing
12415 // front. If we found the head of a vector, it must be how much of the
12416 // head.
12417 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
12418
// Every defined mask element must imply the same rotation amount.
12419 if (Rotation == 0)
12420 Rotation = CandidateRotation;
12421 else if (Rotation != CandidateRotation)
12422 // The rotations don't match, so we can't match this mask.
12423 return -1;
12424
12425 // Compute which value this mask is pointing at.
12426 SDValue MaskV = M < NumElts ? V1 : V2;
12427
12428 // Compute which of the two target values this index should be assigned
12429 // to. This reflects whether the high elements are remaining or the low
12430 // elements are remaining.
12431 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
12432
12433 // Either set up this value if we've not encountered it before, or check
12434 // that it remains consistent.
12435 if (!TargetV)
12436 TargetV = MaskV;
12437 else if (TargetV != MaskV)
12438 // This may be a rotation, but it pulls from the inputs in some
12439 // unsupported interleaving.
12440 return -1;
12441 }
12442
12443 // Check that we successfully analyzed the mask, and normalize the results.
12444 assert(Rotation != 0 && "Failed to locate a viable rotation!");
12445 assert((Lo || Hi) && "Failed to find a rotated input vector!");
// If only one side was demanded, duplicate it so callers get two valid inputs.
12446 if (!Lo)
12447 Lo = Hi;
12448 else if (!Hi)
12449 Hi = Lo;
12450
12451 V1 = Lo;
12452 V2 = Hi;
12453
12454 return Rotation;
12455}
12456
12457/// Try to lower a vector shuffle as a byte rotation.
12458///
12459/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
12460/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
12461/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
12462/// try to generically lower a vector shuffle through such an pattern. It
12463/// does not check for the profitability of lowering either as PALIGNR or
12464/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
12465/// This matches shuffle vectors that look like:
12466///
12467/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
12468///
12469/// Essentially it concatenates V1 and V2, shifts right by some number of
12470/// elements, and takes the low elements as the result. Note that while this is
12471/// specified as a *right shift* because x86 is little-endian, it is a *left
12472/// rotate* of the vector lanes.
// Returns the byte-rotation amount, or -1 if the mask is not a rotation.
// NOTE(review): the declaration line (12473) was dropped by extraction —
// presumably `static int matchShuffleAsByteRotate(MVT VT, SDValue &V1,
// SDValue &V2,`; verify against upstream before building.
12474 ArrayRef<int> Mask) {
12475 // Don't accept any shuffles with zero elements.
12476 if (isAnyZero(Mask))
12477 return -1;
12478
12479 // PALIGNR works on 128-bit lanes.
12480 SmallVector<int, 16> RepeatedMask;
12481 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
12482 return -1;
12483
12484 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
12485 if (Rotation <= 0)
12486 return -1;
12487
12488 // PALIGNR rotates bytes, so we need to scale the
12489 // rotation based on how many bytes are in the vector lane.
12490 int NumElts = RepeatedMask.size();
12491 int Scale = 16 / NumElts;
12492 return Rotation * Scale;
12493}
12494
// Lower a shuffle as a byte rotation: PALIGNR on SSSE3+, otherwise a
// PSLLDQ/PSRLDQ/POR pair on plain SSE2 (128-bit only).
// NOTE(review): the declaration line (12495) was dropped by extraction —
// presumably `static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT,
// SDValue V1,`; verify against upstream before building.
12496 SDValue V2, ArrayRef<int> Mask,
12497 const X86Subtarget &Subtarget,
12498 SelectionDAG &DAG) {
12499 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12500
12501 SDValue Lo = V1, Hi = V2;
12502 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
12503 if (ByteRotation <= 0)
12504 return SDValue();
12505
12506 // Cast the inputs to i8 vector of correct length to match PALIGNR or
12507 // PSLLDQ/PSRLDQ.
12508 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12509 Lo = DAG.getBitcast(ByteVT, Lo);
12510 Hi = DAG.getBitcast(ByteVT, Hi);
12511
12512 // SSSE3 targets can use the palignr instruction.
12513 if (Subtarget.hasSSSE3()) {
12514 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
12515 "512-bit PALIGNR requires BWI instructions");
12516 return DAG.getBitcast(
12517 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
12518 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
12519 }
12520
12521 assert(VT.is128BitVector() &&
12522 "Rotate-based lowering only supports 128-bit lowering!");
12523 assert(Mask.size() <= 16 &&
12524 "Can shuffle at most 16 bytes in a 128-bit vector!");
12525 assert(ByteVT == MVT::v16i8 &&
12526 "SSE2 rotate lowering only needed for v16i8!");
12527
12528 // Default SSE2 implementation
// Emulate the rotate with complementary whole-vector byte shifts OR'd
// together: Lo shifted up makes room for Hi shifted down.
12529 int LoByteShift = 16 - ByteRotation;
12530 int HiByteShift = ByteRotation;
12531
12532 SDValue LoShift =
12533 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12534 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12535 SDValue HiShift =
12536 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12537 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12538 return DAG.getBitcast(VT,
12539 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12540}
12541
12542/// Try to lower a vector shuffle as a dword/qword rotation.
12543///
12544/// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
12545/// rotation of the concatenation of two vectors; This routine will
12546/// try to generically lower a vector shuffle through such an pattern.
12547///
12548/// Essentially it concatenates V1 and V2, shifts right by some number of
12549/// elements, and takes the low elements as the result. Note that while this is
12550/// specified as a *right shift* because x86 is little-endian, it is a *left
12551/// rotate* of the vector lanes.
// NOTE(review): the declaration line (12552) was dropped by extraction —
// presumably `static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT,
// SDValue V1,`; verify against upstream before building.
12553 SDValue V2, ArrayRef<int> Mask,
12554 const APInt &Zeroable,
12555 const X86Subtarget &Subtarget,
12556 SelectionDAG &DAG) {
12557 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12558 "Only 32-bit and 64-bit elements are supported!");
12559
12560 // 128/256-bit vectors are only supported with VLX.
12561 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12562 && "VLX required for 128/256-bit vectors");
12563
// First try a plain two-input element rotation.
12564 SDValue Lo = V1, Hi = V2;
12565 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12566 if (0 < Rotation)
12567 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12568 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12569
12570 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12571 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12572 // TODO: We can probably make this more aggressive and use shift-pairs like
12573 // lowerShuffleAsByteShiftMask.
12574 unsigned NumElts = Mask.size();
12575 unsigned ZeroLo = Zeroable.countr_one();
12576 unsigned ZeroHi = Zeroable.countl_one();
12577 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12578 if (!ZeroLo && !ZeroHi)
12579 return SDValue();
12580
// Zeros at the bottom: rotate the source against a zero vector so the
// sequential run lands above ZeroLo zero elements.
12581 if (ZeroLo) {
12582 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12583 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12584 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12585 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12586 getZeroVector(VT, Subtarget, DAG, DL),
12587 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12588 }
12589
// Zeros at the top: rotate a zero vector against the source instead.
12590 if (ZeroHi) {
12591 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12592 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12593 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12594 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12595 getZeroVector(VT, Subtarget, DAG, DL), Src,
12596 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12597 }
12598
12599 return SDValue();
12600}
12601
12602/// Try to lower a vector shuffle as a byte shift sequence.
// Uses pairs (or a triple) of whole-vector byte shifts (VSHLDQ/VSRLDQ) to
// zero the ends of the vector while keeping an inner sequential run.
// NOTE(review): the declaration line (12603) was dropped by extraction —
// presumably `static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL,
// MVT VT, SDValue V1,`; verify against upstream before building.
12604 SDValue V2, ArrayRef<int> Mask,
12605 const APInt &Zeroable,
12606 const X86Subtarget &Subtarget,
12607 SelectionDAG &DAG) {
12608 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12609 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12610
12611 // We need a shuffle that has zeros at one/both ends and a sequential
12612 // shuffle from one source within.
12613 unsigned ZeroLo = Zeroable.countr_one();
12614 unsigned ZeroHi = Zeroable.countl_one();
12615 if (!ZeroLo && !ZeroHi)
12616 return SDValue();
12617
12618 unsigned NumElts = Mask.size();
12619 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12620 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12621 return SDValue();
12622
// The inner run must come entirely from a single source (V1 or V2).
12623 unsigned Scale = VT.getScalarSizeInBits() / 8;
12624 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12625 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12626 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12627 return SDValue();
12628
12629 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12630 Res = DAG.getBitcast(MVT::v16i8, Res);
12631
12632 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12633 // inner sequential set of elements, possibly offset:
12634 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12635 // 01234567 --> 4567zzzz --> zzzzz456
12636 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12637 if (ZeroLo == 0) {
12638 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12639 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12640 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12641 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12642 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12643 } else if (ZeroHi == 0) {
12644 unsigned Shift = Mask[ZeroLo] % NumElts;
12645 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12646 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12647 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12648 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12649 } else if (!Subtarget.hasSSSE3()) {
12650 // If we don't have PSHUFB then its worth avoiding an AND constant mask
12651 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12652 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12653 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12654 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12655 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12656 Shift += Mask[ZeroLo] % NumElts;
12657 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12658 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12659 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12660 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12661 } else
12662 return SDValue();
12663
12664 return DAG.getBitcast(VT, Res);
12665}
12666
12667/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12668///
12669/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12670/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12671/// matches elements from one of the input vectors shuffled to the left or
12672/// right with zeroable elements 'shifted in'. It handles both the strictly
12673/// bit-wise element shifts and the byte shift across an entire 128-bit double
12674/// quad word lane.
12675///
12676/// PSHL : (little-endian) left bit shift.
12677/// [ zz, 0, zz, 2 ]
12678/// [ -1, 4, zz, -1 ]
12679/// PSRL : (little-endian) right bit shift.
12680/// [ 1, zz, 3, zz]
12681/// [ -1, -1, 7, zz]
12682/// PSLLDQ : (little-endian) left byte shift
12683/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12684/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12685/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12686/// PSRLDQ : (little-endian) right byte shift
12687/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12688/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12689/// [ 1, 2, -1, -1, -1, -1, zz, zz]
// Returns the shift amount (> 0) and sets ShiftVT/Opcode on success, or -1 on
// failure. MaskOffset selects which input's indices to match (0 for V1,
// Mask.size() for V2).
12690static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12691 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12692 int MaskOffset, const APInt &Zeroable,
12693 const X86Subtarget &Subtarget) {
12694 int Size = Mask.size();
12695 unsigned SizeInBits = Size * ScalarSizeInBits;
12696
// For a candidate (Shift, Scale, Left), check the elements that would be
// shifted in are all zeroable in every Scale-sized group.
12697 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12698 for (int i = 0; i < Size; i += Scale)
12699 for (int j = 0; j < Shift; ++j)
12700 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12701 return false;
12702
12703 return true;
12704 };
12705
// Check the surviving elements of each group form the expected sequential
// run; on success set Opcode/ShiftVT and return the shift amount.
12706 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12707 for (int i = 0; i != Size; i += Scale) {
12708 unsigned Pos = Left ? i + Shift : i;
12709 unsigned Low = Left ? i : i + Shift;
12710 unsigned Len = Scale - Shift;
12711 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12712 return -1;
12713 }
12714
// Groups wider than 64 bits require the whole-lane byte-shift forms.
12715 int ShiftEltBits = ScalarSizeInBits * Scale;
12716 bool ByteShift = ShiftEltBits > 64;
12717 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12718 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12719 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12720
12721 // Normalize the scale for byte shifts to still produce an i64 element
12722 // type.
12723 Scale = ByteShift ? Scale / 2 : Scale;
12724
12725 // We need to round trip through the appropriate type for the shift.
12726 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12727 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12728 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12729 return ShiftAmt;
12730 };
12731
12732 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12733 // keep doubling the size of the integer elements up to that. We can
12734 // then shift the elements of the integer vector by whole multiples of
12735 // their width within the elements of the larger integer vector. Test each
12736 // multiple to see if we can find a match with the moved element indices
12737 // and that the shifted in elements are all zeroable.
12738 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12739 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12740 for (int Shift = 1; Shift != Scale; ++Shift)
12741 for (bool Left : {true, false})
12742 if (CheckZeros(Shift, Scale, Left)) {
12743 int ShiftAmt = MatchShift(Shift, Scale, Left);
12744 if (0 < ShiftAmt)
12745 return ShiftAmt;
12746 }
12747
12748 // no match
12749 return -1;
12750}
12751
// Lower a shuffle as a logical bit/byte shift of one input with zeros
// shifted in. BitwiseOnly rejects the whole-lane byte-shift forms.
// NOTE(review): the declaration line (12752) was dropped by extraction —
// presumably `static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT,
// SDValue V1,`; verify against upstream before building.
12753 SDValue V2, ArrayRef<int> Mask,
12754 const APInt &Zeroable,
12755 const X86Subtarget &Subtarget,
12756 SelectionDAG &DAG, bool BitwiseOnly) {
12757 int Size = Mask.size();
12758 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12759
12760 MVT ShiftVT;
12761 SDValue V = V1;
12762 unsigned Opcode;
12763
12764 // Try to match shuffle against V1 shift.
12765 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12766 Mask, 0, Zeroable, Subtarget);
12767
12768 // If V1 failed, try to match shuffle against V2 shift.
12769 if (ShiftAmt < 0) {
12770 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12771 Mask, Size, Zeroable, Subtarget);
12772 V = V2;
12773 }
12774
12775 if (ShiftAmt < 0)
12776 return SDValue();
12777
12778 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12779 return SDValue();
12780
// Round-trip through the matched shift type and emit the immediate shift.
12781 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12782 "Illegal integer vector type");
12783 V = DAG.getBitcast(ShiftVT, V);
12784 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12785 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12786 return DAG.getBitcast(VT, V);
12787}
12788
12789// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12790// Remainder of lower half result is zero and upper half is all undef.
12791static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12792 ArrayRef<int> Mask, uint64_t &BitLen,
12793 uint64_t &BitIdx, const APInt &Zeroable) {
12794 int Size = Mask.size();
12795 int HalfSize = Size / 2;
12796 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12797 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12798
12799 // Upper half must be undefined.
12800 if (!isUndefUpperHalf(Mask))
12801 return false;
12802
12803 // Determine the extraction length from the part of the
12804 // lower half that isn't zeroable.
12805 int Len = HalfSize;
12806 for (; Len > 0; --Len)
12807 if (!Zeroable[Len - 1])
12808 break;
12809 assert(Len > 0 && "Zeroable shuffle mask");
12810
12811 // Attempt to match first Len sequential elements from the lower half.
12812 SDValue Src;
12813 int Idx = -1;
12814 for (int i = 0; i != Len; ++i) {
12815 int M = Mask[i];
12816 if (M == SM_SentinelUndef)
12817 continue;
12818 SDValue &V = (M < Size ? V1 : V2);
12819 M = M % Size;
12820
12821 // The extracted elements must start at a valid index and all mask
12822 // elements must be in the lower half.
12823 if (i > M || M >= HalfSize)
12824 return false;
12825
12826 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12827 Src = V;
12828 Idx = M - i;
12829 continue;
12830 }
12831 return false;
12832 }
12833
12834 if (!Src || Idx < 0)
12835 return false;
12836
12837 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12838 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12839 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12840 V1 = Src;
12841 return true;
12842}
12843
12844// INSERTQ: Extract lowest Len elements from lower half of second source and
12845// insert over first source, starting at Idx.
12846// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
// On success, BitLen/BitIdx hold the (6-bit wrapped) immediates and V1/V2 are
// rewritten to the matched base and inserted sources respectively.
12847static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12848 ArrayRef<int> Mask, uint64_t &BitLen,
12849 uint64_t &BitIdx) {
12850 int Size = Mask.size();
12851 int HalfSize = Size / 2;
12852 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12853
12854 // Upper half must be undefined.
12855 if (!isUndefUpperHalf(Mask))
12856 return false;
12857
// Try each candidate insertion point; Base accumulates the source that
// supplies the untouched elements as each match stage succeeds.
12858 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12859 SDValue Base;
12860
12861 // Attempt to match first source from mask before insertion point.
12862 if (isUndefInRange(Mask, 0, Idx)) {
12863 /* EMPTY */
12864 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12865 Base = V1;
12866 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12867 Base = V2;
12868 } else {
12869 continue;
12870 }
12871
12872 // Extend the extraction length looking to match both the insertion of
12873 // the second source and the remaining elements of the first.
12874 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12875 SDValue Insert;
12876 int Len = Hi - Idx;
12877
12878 // Match insertion.
12879 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12880 Insert = V1;
12881 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12882 Insert = V2;
12883 } else {
12884 continue;
12885 }
12886
12887 // Match the remaining elements of the lower half.
// The tail source must be consistent with whichever Base (if any) matched
// the prefix before the insertion point.
12888 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12889 /* EMPTY */
12890 } else if ((!Base || (Base == V1)) &&
12891 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12892 Base = V1;
12893 } else if ((!Base || (Base == V2)) &&
12894 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12895 Size + Hi)) {
12896 Base = V2;
12897 } else {
12898 continue;
12899 }
12900
12901 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12902 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12903 V1 = Base;
12904 V2 = Insert;
12905 return true;
12906 }
12907 }
12908
12909 return false;
12910}
12911
12912/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
// NOTE(review): the declaration line (12913) was dropped by extraction —
// presumably `static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT,
// SDValue V1,`; verify against upstream before building.
12914 SDValue V2, ArrayRef<int> Mask,
12915 const APInt &Zeroable, SelectionDAG &DAG) {
12916 uint64_t BitLen, BitIdx;
// Prefer EXTRQ (single-source extract) before trying INSERTQ; the matchers
// rewrite V1/V2 to the sources the emitted node should consume.
12917 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12918 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12919 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12920 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12921
12922 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12923 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12924 V2 ? V2 : DAG.getUNDEF(VT),
12925 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12926 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12927
12928 return SDValue();
12929}
12930
12931/// Lower a vector shuffle as an any/signed/zero extension.
12932///
12933/// Given a specific number of elements, element bit width, and extension
12934/// stride, produce either an extension based on the available
12935/// features of the subtarget. The extended elements are consecutive and
12936/// begin and can start from an offsetted element index in the input; to
12937/// avoid excess shuffling the offset must either being in the bottom lane
12938/// or at the start of a higher lane. All extended elements must be from
12939/// the same lane.
12941 int Scale, int Offset,
12942 unsigned ExtOpc, SDValue InputV,
12943 ArrayRef<int> Mask,
12944 const X86Subtarget &Subtarget,
12945 SelectionDAG &DAG) {
12946 assert(Scale > 1 && "Need a scale to extend.");
12947 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12948 int EltBits = VT.getScalarSizeInBits();
12949 int NumElements = VT.getVectorNumElements();
12950 int NumEltsPerLane = 128 / EltBits;
12951 int OffsetLane = Offset / NumEltsPerLane;
12952 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12953 "Only 8, 16, and 32 bit elements can be extended.");
12954 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12955 assert(0 <= Offset && "Extension offset must be positive.");
12956 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12957 "Extension offset must be in the first lane or start an upper lane.");
12958
12959 // Check that an index is in same lane as the base offset.
12960 auto SafeOffset = [&](int Idx) {
12961 return OffsetLane == (Idx / NumEltsPerLane);
12962 };
12963
12964 // Shift along an input so that the offset base moves to the first element.
12965 auto ShuffleOffset = [&](SDValue V) {
12966 if (!Offset)
12967 return V;
12968
12969 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12970 for (int i = 0; i * Scale < NumElements; ++i) {
12971 int SrcIdx = i + Offset;
12972 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12973 }
12974 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12975 };
12976
12977 // Found a valid a/zext mask! Try various lowering strategies based on the
12978 // input type and available ISA extensions.
12979 if (Subtarget.hasSSE41()) {
12980 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12981 // PUNPCK will catch this in a later shuffle match.
12982 if (Offset && Scale == 2 && VT.is128BitVector())
12983 return SDValue();
12984 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12985 NumElements / Scale);
12986 InputV = DAG.getBitcast(VT, InputV);
12987 InputV = ShuffleOffset(InputV);
12988 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
12989 return DAG.getBitcast(VT, InputV);
12990 }
12991
12992 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12993 InputV = DAG.getBitcast(VT, InputV);
12994 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
12995
12996 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
12997 if (ExtOpc == ISD::SIGN_EXTEND)
12998 return SDValue();
12999
13000 // For any extends we can cheat for larger element sizes and use shuffle
13001 // instructions that can fold with a load and/or copy.
13002 if (AnyExt && EltBits == 32) {
13003 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
13004 -1};
13005 return DAG.getBitcast(
13006 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13007 DAG.getBitcast(MVT::v4i32, InputV),
13008 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13009 }
13010 if (AnyExt && EltBits == 16 && Scale > 2) {
13011 int PSHUFDMask[4] = {Offset / 2, -1,
13012 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
13013 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13014 DAG.getBitcast(MVT::v4i32, InputV),
13015 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13016 int PSHUFWMask[4] = {1, -1, -1, -1};
13017 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13018 return DAG.getBitcast(
13019 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
13020 DAG.getBitcast(MVT::v8i16, InputV),
13021 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
13022 }
13023
13024 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
13025 // to 64-bits.
13026 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13027 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
13028 assert(VT.is128BitVector() && "Unexpected vector width!");
13029
13030 int LoIdx = Offset * EltBits;
13031 SDValue Lo = DAG.getBitcast(
13032 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13033 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13034 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
13035
13036 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
13037 return DAG.getBitcast(VT, Lo);
13038
13039 int HiIdx = (Offset + 1) * EltBits;
13040 SDValue Hi = DAG.getBitcast(
13041 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13042 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13043 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13044 return DAG.getBitcast(VT,
13045 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13046 }
13047
13048 // If this would require more than 2 unpack instructions to expand, use
13049 // pshufb when available. We can only use more than 2 unpack instructions
13050 // when zero extending i8 elements which also makes it easier to use pshufb.
13051 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13052 assert(NumElements == 16 && "Unexpected byte vector width!");
13053 SDValue PSHUFBMask[16];
13054 for (int i = 0; i < 16; ++i) {
13055 int Idx = Offset + (i / Scale);
13056 if ((i % Scale == 0 && SafeOffset(Idx))) {
13057 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13058 continue;
13059 }
13060 PSHUFBMask[i] =
13061 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13062 }
13063 InputV = DAG.getBitcast(MVT::v16i8, InputV);
13064 return DAG.getBitcast(
13065 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13066 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13067 }
13068
13069 // If we are extending from an offset, ensure we start on a boundary that
13070 // we can unpack from.
13071 int AlignToUnpack = Offset % (NumElements / Scale);
13072 if (AlignToUnpack) {
13073 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13074 for (int i = AlignToUnpack; i < NumElements; ++i)
13075 ShMask[i - AlignToUnpack] = i;
13076 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13077 Offset -= AlignToUnpack;
13078 }
13079
13080 // Otherwise emit a sequence of unpacks.
13081 do {
13082 unsigned UnpackLoHi = X86ISD::UNPCKL;
13083 if (Offset >= (NumElements / 2)) {
13084 UnpackLoHi = X86ISD::UNPCKH;
13085 Offset -= (NumElements / 2);
13086 }
13087
13088 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13089 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13090 : getZeroVector(InputVT, Subtarget, DAG, DL);
13091 InputV = DAG.getBitcast(InputVT, InputV);
13092 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13093 Scale /= 2;
13094 EltBits *= 2;
13095 NumElements /= 2;
13096 } while (Scale > 1);
13097 return DAG.getBitcast(VT, InputV);
13098}
13099
13100/// Try to lower a vector shuffle as a zero extension on any microarch.
13101///
13102/// This routine will try to do everything in its power to cleverly lower
13103/// a shuffle which happens to match the pattern of a zero extend. It doesn't
13104/// check for the profitability of this lowering, it tries to aggressively
13105/// match this pattern. It will use all of the micro-architectural details it
13106/// can to emit an efficient lowering. It handles both blends with all-zero
13107/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
13108/// masking out later).
13109///
13110/// The reason we have dedicated lowering for zext-style shuffles is that they
13111/// are both incredibly common and often quite performance sensitive.
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  // Define a helper function to check a particular ext-scale and lower to it if
  // valid.
  auto Lower = [&](int Scale) -> SDValue {
    // InputV tracks the single source vector that all base elements must come
    // from; AnyExt stays true only while no extended lane is required to be
    // explicitly zero (undef lanes keep the any-extend option open).
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements need to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We no longer are in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue();

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      Matches++;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input, we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
    return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
                                           InputV, Mask, Subtarget, DAG);
  };

  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }

  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  // VZEXT_MOVL (MOVQ) implicitly zeroes the upper 64-bits, so this gives a
  // free zero-extension of the low half.
  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}
13228
13229/// Try to get a scalar value for a specific element of a vector.
13230///
13231/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  // Strip bitcasts to inspect the actual producer of the vector value.
  V = peekThroughBitcasts(V);

  // If the bitcasts shift the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  // SCALAR_TO_VECTOR only defines element 0, so it can only satisfy Idx == 0.
  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
    // Ensure the scalar operand is the same size as the destination.
    // FIXME: Add support for scalar truncation where possible.
    SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
  }

  return SDValue();
}
13255
13256/// Helper to test for a load that can be folded with x86 shuffles.
13257///
13258/// This is particularly important because the set of instructions varies
13259/// significantly based on whether the operand is a load or not.
13261 return V.hasOneUse() &&
13263}
13264
13265template<typename T>
13266static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
13267 T EltVT = VT.getScalarType();
13268 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
13269 (EltVT == MVT::f16 && !Subtarget.hasFP16());
13270}
13271
13272/// Try to lower insertion of a single element into a zero vector.
13273///
13274/// This is a common pattern that we have especially efficient patterns to lower
13275/// across all subtarget feature sets.
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltBits = VT.getScalarSizeInBits();

  // Softened f16/bf16 elements can't use these insertion patterns.
  if (isSoftF16(EltVT, Subtarget))
    return SDValue();

  // Locate the (single) mask slot that reads from V2; mask values >=
  // Mask.size() refer to V2 elements.
  int V2Index =
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();
  bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
  // V1 is "zeroable" if every lane not taken from V2 may be zero.
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
      break;
    }

  // Bail if a non-zero V1 isn't used in place.
  if (!IsV1Zeroable) {
    SmallVector<int, 8> V1Mask(Mask);
    V1Mask[V2Index] = -1;
    if (!isNoopShuffleMask(V1Mask))
      return SDValue();
  }

  // Check for a single input from a SCALAR_TO_VECTOR node.
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
  // all the smarts here sunk into that routine. However, the current
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
  // vector shuffle lowering is dead.
  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
                                               DAG);
  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
    // We need to zext the scalar if it is smaller than an i32.
    V2S = DAG.getBitcast(EltVT, V2S);
    if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
      // Using zext to expand a narrow element won't work for non-zero
      // insertions. But we can use a masked constant vector if we're
      // inserting V2 into the bottom of V1.
      if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
        return SDValue();

      // Zero-extend directly to i32.
      ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);

      // If we're inserting into a constant, mask off the inserted index
      // and OR with the zero-extended scalar.
      if (!IsV1Zeroable) {
        SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
        Bits[V2Index] = APInt::getZero(EltBits);
        SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
        V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
        V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
        V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
        return DAG.getNode(ISD::OR, DL, VT, V1, V2);
      }
    }
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
    // Either not inserting from the low element of the input or the input
    // element size is too small to use VZEXT_MOVL to clear the high bits.
    return SDValue();
  }

  if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't support integer vectors or non-zero targets cheaply.
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    if (!VT.isFloatingPoint() || V2Index != 0)
      return SDValue();
    if (!VT.is128BitVector())
      return SDValue();

    // Otherwise, use MOVSD, MOVSS or MOVSH.
    unsigned MovOpc = 0;
    if (EltVT == MVT::f16)
      MovOpc = X86ISD::MOVSH;
    else if (EltVT == MVT::f32)
      MovOpc = X86ISD::MOVSS;
    else if (EltVT == MVT::f64)
      MovOpc = X86ISD::MOVSD;
    else
      llvm_unreachable("Unsupported floating point element type to handle!");
    return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
  }

  // This lowering only works for the low element with floating point vectors.
  if (VT.isFloatingPoint() && V2Index != 0)
    return SDValue();

  // Move the scalar into lane 0 and zero the remaining lanes.
  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
  if (ExtVT != VT)
    V2 = DAG.getBitcast(VT, V2);

  if (V2Index != 0) {
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
    // the desired position. Otherwise it is more efficient to do a vector
    // shift left. We know that we can do a vector shift left because all
    // the inputs are zero.
    if (VT.isFloatingPoint() || NumElts <= 4) {
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
      V2Shuffle[V2Index] = 0;
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
    } else {
      V2 = DAG.getBitcast(MVT::v16i8, V2);
      V2 = DAG.getNode(
          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
          DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
      V2 = DAG.getBitcast(VT, V2);
    }
  }
  return V2;
}
13397
13398/// Try to lower broadcast of a single - truncated - integer element,
13399/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
13400///
13401/// This assumes we have AVX2.
                                            int BroadcastIdx,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  MVT EltVT = VT.getVectorElementType();
  MVT V0VT = V0.getSimpleValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  MVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())
    return SDValue();

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // This is only a truncation if the original element type is larger.
  if (V0EltSize <= EltSize)
    return SDValue();

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  // Each V0 element covers Scale broadcast-sized elements; map the broadcast
  // index back to the V0 element that contains it.
  const unsigned V0Opc = V0.getOpcode();
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)
    return SDValue();

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // If we're extracting non-least-significant bits, shift so we can truncate.
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
                         DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));

  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
13450
13451/// Test whether this can be lowered with a single SHUFPS instruction.
13452///
13453/// This is used to disable more specialized lowerings when the shufps lowering
13454/// will happen to be efficient.
  // This routine only handles 128-bit shufps.
  assert(Mask.size() == 4 && "Unsupported mask size!");
  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

  // To lower with a single SHUFPS we need to have the low half and high half
  // each requiring a single input.
  // I.e. Mask[0..1] must not mix V1 elements (0..3) with V2 elements (4..7),
  // and likewise for Mask[2..3]; undef (-1) lanes are always acceptable.
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;

  return true;
}
13472
13473/// Test whether the specified input (0 or 1) is in-place blended by the
13474/// given mask.
13475///
13476/// This returns true if the elements from a particular input are already in the
13477/// slot required by the given mask and require no permutation.
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = Mask.size();
  // Mask indices in [0, Size) come from input 0 and [Size, 2*Size) from
  // input 1; an element of the chosen input is "in place" only when it lands
  // at its identity position i.
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;

  return true;
}
13487
13488/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
13489/// the given mask.
13490///
                                            int BroadcastableElement = 0) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = Mask.size();
  // Every lane drawn from the chosen input must read the single
  // broadcastable element; lanes from the other input or undef are ignored.
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input &&
        Mask[i] % Size != BroadcastableElement)
      return false;
  return true;
}
13501
13502/// If we are extracting two 128-bit halves of a vector and shuffling the
13503/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
13504/// multi-shuffle lowering.
                                             SDValue N1, ArrayRef<int> Mask,
                                             SelectionDAG &DAG) {
  MVT VT = N0.getSimpleValueType();
  assert((VT.is128BitVector() &&
          (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
         "VPERM* family of shuffles requires 32-bit or 64-bit elements");

  // Check that both sources are extracts of the same source vector.
  if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      N0.getOperand(0) != N1.getOperand(0) ||
      !N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SDValue WideVec = N0.getOperand(0);
  MVT WideVT = WideVec.getSimpleValueType();
  if (!WideVT.is256BitVector())
    return SDValue();

  // Match extracts of each half of the wide source vector. Commute the shuffle
  // if the extract of the low half is N1.
  unsigned NumElts = VT.getVectorNumElements();
  SmallVector<int, 4> NewMask(Mask);
  // The constant extract indices identify which 128-bit half each shuffle
  // operand was taken from (index 0 = low half, NumElts = high half).
  const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
  const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
  if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
  else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
    return SDValue();

  // Final bailout: if the mask is simple, we are better off using an extract
  // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
  // because that avoids a constant load from memory.
  if (NumElts == 4 &&
      (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
    return SDValue();

  // Extend the shuffle mask with undef elements.
  NewMask.append(NumElts, -1);

  // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
  SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
                                      NewMask);
  // This is free: ymm -> xmm.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
                     DAG.getVectorIdxConstant(0, DL));
}
13553
13554/// Try to lower broadcast of a single element.
13555///
13556/// For convenience, this code also bundles all of the subtarget feature set
13557/// filtering. While a little annoying to re-dispatch on type here, there isn't
13558/// a convenient way to factor it out.
                                       SDValue V2, ArrayRef<int> Mask,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT EltVT = VT.getVectorElementType();
  // Bail unless the subtarget/type combination has a usable broadcast form.
  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
        (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
        (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
    return SDValue();

  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
  // we can only broadcast from a register with AVX2.
  unsigned NumEltBits = VT.getScalarSizeInBits();
  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
                        ? X86ISD::MOVDDUP
                        : X86ISD::VBROADCAST;
  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

  // Check that the mask is a broadcast.
  int BroadcastIdx = getSplatIndex(Mask);
  if (BroadcastIdx < 0) {
    // Check for hidden broadcast.
    SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
    if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
      return SDValue();
    BroadcastIdx = 0;
  }
  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                            "a sorted mask where the broadcast "
                                            "comes from V1.");
  // Number of demanded (non-undef) lanes in the mask.
  int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });

  // Go up the chain of (vector) values to find a scalar load that we can
  // combine with the broadcast.
  // TODO: Combine this logic with findEltLoadSrc() used by
  // EltsFromConsecutiveLoads().
  int BitOffset = BroadcastIdx * NumEltBits;
  SDValue V = V1;
  for (;;) {
    switch (V.getOpcode()) {
    case ISD::BITCAST: {
      V = V.getOperand(0);
      continue;
    }
    case ISD::CONCAT_VECTORS: {
      int OpBitWidth = V.getOperand(0).getValueSizeInBits();
      int OpIdx = BitOffset / OpBitWidth;
      V = V.getOperand(OpIdx);
      BitOffset %= OpBitWidth;
      continue;
    }
      // The extraction index adds to the existing offset.
      unsigned EltBitWidth = V.getScalarValueSizeInBits();
      unsigned Idx = V.getConstantOperandVal(1);
      unsigned BeginOffset = Idx * EltBitWidth;
      BitOffset += BeginOffset;
      V = V.getOperand(0);
      continue;
    }
    case ISD::INSERT_SUBVECTOR: {
      // Step into the inserted subvector only if the broadcast bit range
      // falls entirely inside it; otherwise keep walking the outer vector.
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
      int EltBitWidth = VOuter.getScalarValueSizeInBits();
      int Idx = (int)V.getConstantOperandVal(2);
      int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
      int BeginOffset = Idx * EltBitWidth;
      int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
      if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
        BitOffset -= BeginOffset;
        V = VInner;
      } else {
        V = VOuter;
      }
      continue;
    }
    }
    break;
  }
  // After the walk, BitOffset is relative to the final source value V.
  assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
  BroadcastIdx = BitOffset / NumEltBits;

  // Do we need to bitcast the source to retrieve the original broadcast index?
  bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;

  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  // If the original value has a larger element type than the shuffle, the
  // broadcast element is in essence truncated. Make that explicit to ease
  // folding.
  if (BitCastSrc && VT.isInteger())
    if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
            DL, VT, V, BroadcastIdx, Subtarget, DAG))
      return TruncBroadcast;

  // Also check the simpler case, where we can directly reuse the scalar.
  if (!BitCastSrc &&
      ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
    V = V.getOperand(BroadcastIdx);

    // If we can't broadcast from a register, check that the input is a load.
    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
      return SDValue();
  } else if (ISD::isNormalLoad(V.getNode()) &&
             cast<LoadSDNode>(V)->isSimple()) {
    // We do not check for one-use of the vector load because a broadcast load
    // is expected to be a win for code size, register pressure, and possibly
    // uops even if the original vector load is not eliminated.

    // Reduce the vector load and shuffle to a broadcasted scalar load.
    auto *Ld = cast<LoadSDNode>(V);
    SDValue BaseAddr = Ld->getBasePtr();
    MVT SVT = VT.getScalarType();
    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
    assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
    SDValue NewAddr =

    // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
    // than MOVDDUP.
    // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
    if (Opcode == X86ISD::VBROADCAST) {
      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
      SDValue Ops[] = {Ld->getChain(), NewAddr};
      V = DAG.getMemIntrinsicNode(
          X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
              Ld->getMemOperand(), Offset, SVT.getStoreSize()));
      return DAG.getBitcast(VT, V);
    }
    assert(SVT == MVT::f64 && "Unexpected VT!");
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
  } else if (!BroadcastFromReg) {
    // We can't broadcast from a vector register.
    return SDValue();
  } else if (BitOffset != 0) {
    // We can only broadcast from the zero-element of a vector register,
    // but it can be advantageous to broadcast from the zero-element of a
    // subvector.
    if (!VT.is256BitVector() && !VT.is512BitVector())
      return SDValue();

    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();

    // If we are broadcasting an element from the lowest 128-bit subvector, try
    // to move the element in position.
    if (BitOffset < 128 && NumActiveElts > 1 &&
        V.getScalarValueSizeInBits() == NumEltBits) {
      assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
             "Unexpected bit-offset");
      SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
      ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
      V = extractSubVector(V, 0, DAG, DL, 128);
      V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
    } else {
      // Only broadcast the zero-element of a 128-bit subvector.
      if ((BitOffset % 128) != 0)
        return SDValue();

      assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
             "Unexpected bit-offset");
      assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
             "Unexpected vector size");
      unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
      V = extract128BitVector(V, ExtractIdx, DAG, DL);
    }
  }

  // On AVX we can use VBROADCAST directly for scalar sources.
  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
    V = DAG.getBitcast(MVT::f64, V);
    if (Subtarget.hasAVX()) {
      V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
      return DAG.getBitcast(VT, V);
    }
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
  }

  // If this is a scalar, do the broadcast on this type and bitcast.
  if (!V.getValueType().isVector()) {
    assert(V.getScalarValueSizeInBits() == NumEltBits &&
           "Unexpected scalar size");
    MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
    return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
  }

  // We only support broadcasting from 128-bit vectors to minimize the
  // number of patterns we need to deal with in isel. So extract down to
  // 128-bits, removing as many bitcasts as possible.
  if (V.getValueSizeInBits() > 128)

  // Otherwise cast V to a vector with the same element type as VT, but
  // possibly narrower than VT. Then perform the broadcast.
  unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
  MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
  return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
}
13764
13765// Check for whether we can use INSERTPS to perform the shuffle. We only use
13766// INSERTPS when the V1 elements are already in the correct locations
13767// because otherwise we can just always use two SHUFPS instructions which
13768// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13769// perform INSERTPS if a single V1 element is out of place and all V2
13770// elements are zeroable.
                                   unsigned &InsertPSMask,
                                   const APInt &Zeroable,
                                   ArrayRef<int> Mask, SelectionDAG &DAG) {
  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Attempt to match INSERTPS with one element from VA or VB being
  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
  // are updated.
  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
                             ArrayRef<int> CandidateMask) {
    // ZMask accumulates one bit per destination lane that is forced to zero.
    unsigned ZMask = 0;
    int VADstIndex = -1;
    int VBDstIndex = -1;
    bool VAUsedInPlace = false;

    for (int i = 0; i < 4; ++i) {
      // Synthesize a zero mask from the zeroable elements (includes undefs).
      if (Zeroable[i]) {
        ZMask |= 1 << i;
        continue;
      }

      // Flag if we use any VA inputs in place.
      if (i == CandidateMask[i]) {
        VAUsedInPlace = true;
        continue;
      }

      // We can only insert a single non-zeroable element.
      if (VADstIndex >= 0 || VBDstIndex >= 0)
        return false;

      if (CandidateMask[i] < 4) {
        // VA input out of place for insertion.
        VADstIndex = i;
      } else {
        // VB input for insertion.
        VBDstIndex = i;
      }
    }

    // Don't bother if we have no (non-zeroable) element for insertion.
    if (VADstIndex < 0 && VBDstIndex < 0)
      return false;

    // Determine element insertion src/dst indices. The src index is from the
    // start of the inserted vector, not the start of the concatenated vector.
    unsigned VBSrcIndex = 0;
    if (VADstIndex >= 0) {
      // If we have a VA input out of place, we use VA as the V2 element
      // insertion and don't use the original V2 at all.
      VBSrcIndex = CandidateMask[VADstIndex];
      VBDstIndex = VADstIndex;
      VB = VA;
    } else {
      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
    }

    // If no V1 inputs are used in place, then the result is created only from
    // the zero mask and the V2 insertion - so remove V1 dependency.
    if (!VAUsedInPlace)
      VA = DAG.getUNDEF(MVT::v4f32);

    // Update V1, V2 and InsertPSMask accordingly.
    V1 = VA;
    V2 = VB;

    // Insert the V2 element into the desired position.
    // Immediate layout: [7:6] = source element, [5:4] = dest element,
    // [3:0] = zero mask.
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
    return true;
  };

  if (matchAsInsertPS(V1, V2, Mask))
    return true;

  // Commute and try again.
  SmallVector<int, 4> CommutedMask(Mask);
  if (matchAsInsertPS(V2, V1, CommutedMask))
    return true;

  return false;
}
13858
13860 ArrayRef<int> Mask, const APInt &Zeroable,
13861 SelectionDAG &DAG) {
13862 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13863 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13864
13865 // Attempt to match the insertps pattern.
13866 unsigned InsertPSMask = 0;
13867 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13868 return SDValue();
13869
13870 // Insert the V2 element into the desired position.
13871 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13872 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13873}
13874
13875/// Handle lowering of 2-lane 64-bit floating point shuffles.
13876///
13877/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13878/// support for floating point shuffles but not integer shuffles. These
13879/// instructions will incur a domain crossing penalty on some chips though so
13880/// it is better to avoid lowering through this for integer vectors where
13881/// possible.
13883 const APInt &Zeroable, SDValue V1, SDValue V2,
13884 const X86Subtarget &Subtarget,
13885 SelectionDAG &DAG) {
13886 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13887 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13888 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13889
13890 if (V2.isUndef()) {
13891 // Check for being able to broadcast a single element.
13892 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13893 Mask, Subtarget, DAG))
13894 return Broadcast;
13895
13896 // Straight shuffle of a single input vector. Simulate this by using the
13897 // single input as both of the "inputs" to this instruction..
13898 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13899
13900 if (Subtarget.hasAVX()) {
13901 // If we have AVX, we can use VPERMILPS which will allow folding a load
13902 // into the shuffle.
13903 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13904 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13905 }
13906
13907 return DAG.getNode(
13908 X86ISD::SHUFP, DL, MVT::v2f64,
13909 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13910 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13911 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13912 }
13913 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13914 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13915 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13916 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13917
13918 if (Subtarget.hasAVX2())
13919 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13920 return Extract;
13921
13922 // When loading a scalar and then shuffling it into a vector we can often do
13923 // the insertion cheaply.
13925 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13926 return Insertion;
13927 // Try inverting the insertion since for v2 masks it is easy to do and we
13928 // can't reliably sort the mask one way or the other.
13929 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13930 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13932 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13933 return Insertion;
13934
13935 // Try to use one of the special instruction patterns to handle two common
13936 // blend patterns if a zero-blend above didn't work.
13937 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13938 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13939 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13940 // We can either use a special instruction to load over the low double or
13941 // to move just the low double.
13942 return DAG.getNode(
13943 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13944 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13945
13946 if (Subtarget.hasSSE41())
13947 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13948 Zeroable, Subtarget, DAG))
13949 return Blend;
13950
13951 // Use dedicated unpack instructions for masks that match their pattern.
13952 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13953 return V;
13954
13955 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13956 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13957 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13958}
13959
13960/// Handle lowering of 2-lane 64-bit integer shuffles.
13961///
13962/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13963/// the integer unit to minimize domain crossing penalties. However, for blends
13964/// it falls back to the floating point shuffle operation with appropriate bit
13965/// casting.
13967 const APInt &Zeroable, SDValue V1, SDValue V2,
13968 const X86Subtarget &Subtarget,
13969 SelectionDAG &DAG) {
13970 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13971 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13972 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13973
13974 if (V2.isUndef()) {
13975 // Check for being able to broadcast a single element.
13976 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13977 Mask, Subtarget, DAG))
13978 return Broadcast;
13979
13980 // Straight shuffle of a single input vector. For everything from SSE2
13981 // onward this has a single fast instruction with no scary immediates.
13982 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13983 V1 = DAG.getBitcast(MVT::v4i32, V1);
13984 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13985 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13986 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13987 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13988 return DAG.getBitcast(
13989 MVT::v2i64,
13990 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13991 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13992 }
13993 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13994 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13995 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13996 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13997
13998 if (Subtarget.hasAVX2())
13999 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14000 return Extract;
14001
14002 // Try to use shift instructions.
14003 if (SDValue Shift =
14004 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
14005 DAG, /*BitwiseOnly*/ false))
14006 return Shift;
14007
14008 // When loading a scalar and then shuffling it into a vector we can often do
14009 // the insertion cheaply.
14011 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14012 return Insertion;
14013 // Try inverting the insertion since for v2 masks it is easy to do and we
14014 // can't reliably sort the mask one way or the other.
14015 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14017 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14018 return Insertion;
14019
14020 // We have different paths for blend lowering, but they all must use the
14021 // *exact* same predicate.
14022 bool IsBlendSupported = Subtarget.hasSSE41();
14023 if (IsBlendSupported)
14024 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14025 Zeroable, Subtarget, DAG))
14026 return Blend;
14027
14028 // Use dedicated unpack instructions for masks that match their pattern.
14029 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
14030 return V;
14031
14032 // Try to use byte rotation instructions.
14033 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
14034 if (Subtarget.hasSSSE3()) {
14035 if (Subtarget.hasVLX())
14036 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14037 Zeroable, Subtarget, DAG))
14038 return Rotate;
14039
14040 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14041 Subtarget, DAG))
14042 return Rotate;
14043 }
14044
14045 // If we have direct support for blends, we should lower by decomposing into
14046 // a permute. That will be faster than the domain cross.
14047 if (IsBlendSupported)
14048 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14049 Zeroable, Subtarget, DAG);
14050
14051 // We implement this with SHUFPD which is pretty lame because it will likely
14052 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14053 // However, all the alternatives are still more cycles and newer chips don't
14054 // have this problem. It would be really nice if x86 had better shuffles here.
14055 V1 = DAG.getBitcast(MVT::v2f64, V1);
14056 V2 = DAG.getBitcast(MVT::v2f64, V2);
14057 return DAG.getBitcast(MVT::v2i64,
14058 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14059}
14060
14061/// Lower a vector shuffle using the SHUFPS instruction.
14062///
14063/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14064/// It makes no assumptions about whether this is the *best* lowering, it simply
14065/// uses it.
14067 ArrayRef<int> Mask, SDValue V1,
14068 SDValue V2, SelectionDAG &DAG) {
14069 SDValue LowV = V1, HighV = V2;
14070 SmallVector<int, 4> NewMask(Mask);
14071 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14072
14073 if (NumV2Elements == 1) {
14074 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14075
14076 // Compute the index adjacent to V2Index and in the same half by toggling
14077 // the low bit.
14078 int V2AdjIndex = V2Index ^ 1;
14079
14080 if (Mask[V2AdjIndex] < 0) {
14081 // Handles all the cases where we have a single V2 element and an undef.
14082 // This will only ever happen in the high lanes because we commute the
14083 // vector otherwise.
14084 if (V2Index < 2)
14085 std::swap(LowV, HighV);
14086 NewMask[V2Index] -= 4;
14087 } else {
14088 // Handle the case where the V2 element ends up adjacent to a V1 element.
14089 // To make this work, blend them together as the first step.
14090 int V1Index = V2AdjIndex;
14091 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14092 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14093 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14094
14095 // Now proceed to reconstruct the final blend as we have the necessary
14096 // high or low half formed.
14097 if (V2Index < 2) {
14098 LowV = V2;
14099 HighV = V1;
14100 } else {
14101 HighV = V2;
14102 }
14103 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14104 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
14105 }
14106 } else if (NumV2Elements == 2) {
14107 if (Mask[0] < 4 && Mask[1] < 4) {
14108 // Handle the easy case where we have V1 in the low lanes and V2 in the
14109 // high lanes.
14110 NewMask[2] -= 4;
14111 NewMask[3] -= 4;
14112 } else if (Mask[2] < 4 && Mask[3] < 4) {
14113 // We also handle the reversed case because this utility may get called
14114 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14115 // arrange things in the right direction.
14116 NewMask[0] -= 4;
14117 NewMask[1] -= 4;
14118 HighV = V1;
14119 LowV = V2;
14120 } else {
14121 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14122 // trying to place elements directly, just blend them and set up the final
14123 // shuffle to place them.
14124
14125 // The first two blend mask elements are for V1, the second two are for
14126 // V2.
14127 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14128 Mask[2] < 4 ? Mask[2] : Mask[3],
14129 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14130 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14131 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14132 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14133
14134 // Now we do a normal shuffle of V1 by giving V1 as both operands to
14135 // a blend.
14136 LowV = HighV = V1;
14137 NewMask[0] = Mask[0] < 4 ? 0 : 2;
14138 NewMask[1] = Mask[0] < 4 ? 2 : 0;
14139 NewMask[2] = Mask[2] < 4 ? 1 : 3;
14140 NewMask[3] = Mask[2] < 4 ? 3 : 1;
14141 }
14142 } else if (NumV2Elements == 3) {
14143 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14144 // we can get here due to other paths (e.g repeated mask matching) that we
14145 // don't want to do another round of lowerVECTOR_SHUFFLE.
14147 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14148 }
14149 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14150 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14151}
14152
14153/// Lower 4-lane 32-bit floating point shuffles.
14154///
14155/// Uses instructions exclusively from the floating point unit to minimize
14156/// domain crossing penalties, as these are sufficient to implement all v4f32
14157/// shuffles.
14159 const APInt &Zeroable, SDValue V1, SDValue V2,
14160 const X86Subtarget &Subtarget,
14161 SelectionDAG &DAG) {
14162 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14163 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14164 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14165
14166 if (Subtarget.hasSSE41())
14167 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14168 Zeroable, Subtarget, DAG))
14169 return Blend;
14170
14171 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14172
14173 if (NumV2Elements == 0) {
14174 // Check for being able to broadcast a single element.
14175 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14176 Mask, Subtarget, DAG))
14177 return Broadcast;
14178
14179 // Use even/odd duplicate instructions for masks that match their pattern.
14180 if (Subtarget.hasSSE3()) {
14181 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14182 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14183 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14184 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14185 }
14186
14187 if (Subtarget.hasAVX()) {
14188 // If we have AVX, we can use VPERMILPS which will allow folding a load
14189 // into the shuffle.
14190 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14191 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14192 }
14193
14194 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14195 // in SSE1 because otherwise they are widened to v2f64 and never get here.
14196 if (!Subtarget.hasSSE2()) {
14197 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14198 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14199 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14200 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14201 }
14202
14203 // Otherwise, use a straight shuffle of a single input vector. We pass the
14204 // input vector to both operands to simulate this with a SHUFPS.
14205 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14206 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14207 }
14208
14209 if (Subtarget.hasSSE2())
14211 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
14212 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
14213 return ZExt;
14214 }
14215
14216 if (Subtarget.hasAVX2())
14217 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14218 return Extract;
14219
14220 // There are special ways we can lower some single-element blends. However, we
14221 // have custom ways we can lower more complex single-element blends below that
14222 // we defer to if both this and BLENDPS fail to match, so restrict this to
14223 // when the V2 input is targeting element 0 of the mask -- that is the fast
14224 // case here.
14225 if (NumV2Elements == 1 && Mask[0] >= 4)
14226 if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, Mask,
14227 Zeroable, Subtarget, DAG))
14228 return V;
14229
14230 if (Subtarget.hasSSE41() && !isSingleSHUFPSMask(Mask)) {
14231 // Use INSERTPS if we can complete the shuffle efficiently.
14232 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14233 return V;
14234
14235 if (SDValue BlendPerm =
14236 lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, V2, Mask, DAG))
14237 return BlendPerm;
14238 }
14239
14240 // Use low/high mov instructions. These are only valid in SSE1 because
14241 // otherwise they are widened to v2f64 and never get here.
14242 if (!Subtarget.hasSSE2()) {
14243 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
14244 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
14245 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
14246 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
14247 }
14248
14249 // Use dedicated unpack instructions for masks that match their pattern.
14250 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
14251 return V;
14252
14253 // Otherwise fall back to a SHUFPS lowering strategy.
14254 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
14255}
14256
14257/// Lower 4-lane i32 vector shuffles.
14258///
14259/// We try to handle these with integer-domain shuffles where we can, but for
14260/// blends we use the floating point domain blend instructions.
14262 const APInt &Zeroable, SDValue V1, SDValue V2,
14263 const X86Subtarget &Subtarget,
14264 SelectionDAG &DAG) {
14265 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14266 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14267 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14268
14269 // Whenever we can lower this as a zext, that instruction is strictly faster
14270 // than any alternative. It also allows us to fold memory operands into the
14271 // shuffle in many cases.
14272 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
14273 Zeroable, Subtarget, DAG))
14274 return ZExt;
14275
14276 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14277
14278 // Try to use shift instructions if fast.
14279 if (Subtarget.preferLowerShuffleAsShift()) {
14280 if (SDValue Shift =
14281 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
14282 Subtarget, DAG, /*BitwiseOnly*/ true))
14283 return Shift;
14284 if (NumV2Elements == 0)
14285 if (SDValue Rotate =
14286 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
14287 return Rotate;
14288 }
14289
14290 if (NumV2Elements == 0) {
14291 // Try to use broadcast unless the mask only has one non-undef element.
14292 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
14293 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
14294 Mask, Subtarget, DAG))
14295 return Broadcast;
14296 }
14297
14298 // Straight shuffle of a single input vector. For everything from SSE2
14299 // onward this has a single fast instruction with no scary immediates.
14300 // We coerce the shuffle pattern to be compatible with UNPCK instructions
14301 // but we aren't actually going to use the UNPCK instruction because doing
14302 // so prevents folding a load into this instruction or making a copy.
14303 const int UnpackLoMask[] = {0, 0, 1, 1};
14304 const int UnpackHiMask[] = {2, 2, 3, 3};
14305 if (!isSingleElementRepeatedMask(Mask)) {
14306 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
14307 Mask = UnpackLoMask;
14308 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
14309 Mask = UnpackHiMask;
14310 }
14311
14312 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14313 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14314 }
14315
14316 if (Subtarget.hasAVX2())
14317 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14318 return Extract;
14319
14320 // Try to use shift instructions.
14321 if (SDValue Shift =
14322 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
14323 DAG, /*BitwiseOnly*/ false))
14324 return Shift;
14325
14326 // There are special ways we can lower some single-element blends.
14327 if (NumV2Elements == 1)
14329 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14330 return V;
14331
14332 // We have different paths for blend lowering, but they all must use the
14333 // *exact* same predicate.
14334 bool IsBlendSupported = Subtarget.hasSSE41();
14335 if (IsBlendSupported)
14336 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
14337 Zeroable, Subtarget, DAG))
14338 return Blend;
14339
14340 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
14341 Zeroable, Subtarget, DAG))
14342 return Masked;
14343
14344 // Use dedicated unpack instructions for masks that match their pattern.
14345 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
14346 return V;
14347
14348 // Try to use byte rotation instructions.
14349 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
14350 if (Subtarget.hasSSSE3()) {
14351 if (Subtarget.hasVLX())
14352 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14353 Zeroable, Subtarget, DAG))
14354 return Rotate;
14355
14356 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14357 Subtarget, DAG))
14358 return Rotate;
14359 }
14360
14361 // Assume that a single SHUFPS is faster than an alternative sequence of
14362 // multiple instructions (even if the CPU has a domain penalty).
14363 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14364 if (!isSingleSHUFPSMask(Mask)) {
14365 // If we have direct support for blends, we should lower by decomposing into
14366 // a permute. That will be faster than the domain cross.
14367 if (IsBlendSupported)
14368 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
14369 Zeroable, Subtarget, DAG);
14370
14371 // Try to lower by permuting the inputs into an unpack instruction.
14372 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
14373 Mask, Subtarget, DAG))
14374 return Unpack;
14375 }
14376
14377 // We implement this with SHUFPS because it can blend from two vectors.
14378 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
14379 // up the inputs, bypassing domain shift penalties that we would incur if we
14380 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
14381 // relevant.
14382 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
14383 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
14384 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
14385 return DAG.getBitcast(MVT::v4i32, ShufPS);
14386}
14387
14388/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
14389/// shuffle lowering, and the most complex part.
14390///
14391/// The lowering strategy is to try to form pairs of input lanes which are
14392/// targeted at the same half of the final vector, and then use a dword shuffle
14393/// to place them onto the right half, and finally unpack the paired lanes into
14394/// their final position.
14395///
14396/// The exact breakdown of how to form these dword pairs and align them on the
14397/// correct sides is really tricky. See the comments within the function for
14398/// more of the details.
14399///
14400/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
14401/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
14402/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
14403/// vector, form the analogous 128-bit 8-element Mask.
14405 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
14406 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14407 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
14408 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
14409
14410 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
14411 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
14412 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
14413
14414 // Attempt to directly match PSHUFLW or PSHUFHW.
14415 if (isUndefOrInRange(LoMask, 0, 4) &&
14416 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
14417 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14418 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14419 }
14420 if (isUndefOrInRange(HiMask, 4, 8) &&
14421 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
14422 for (int i = 0; i != 4; ++i)
14423 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14424 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14425 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14426 }
14427
14428 SmallVector<int, 4> LoInputs;
14429 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
14430 array_pod_sort(LoInputs.begin(), LoInputs.end());
14431 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14432 SmallVector<int, 4> HiInputs;
14433 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
14434 array_pod_sort(HiInputs.begin(), HiInputs.end());
14435 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14436 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
14437 int NumHToL = LoInputs.size() - NumLToL;
14438 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
14439 int NumHToH = HiInputs.size() - NumLToH;
14440 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
14441 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
14442 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
14443 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
14444
14445 // If we are shuffling values from one half - check how many different DWORD
14446 // pairs we need to create. If only 1 or 2 then we can perform this as a
14447 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
14448 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
14449 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
14450 V = DAG.getNode(ShufWOp, DL, VT, V,
14451 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14452 V = DAG.getBitcast(PSHUFDVT, V);
14453 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
14454 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14455 return DAG.getBitcast(VT, V);
14456 };
14457
14458 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14459 int PSHUFDMask[4] = { -1, -1, -1, -1 };
14460 SmallVector<std::pair<int, int>, 4> DWordPairs;
14461 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14462
14463 // Collect the different DWORD pairs.
14464 for (int DWord = 0; DWord != 4; ++DWord) {
14465 int M0 = Mask[2 * DWord + 0];
14466 int M1 = Mask[2 * DWord + 1];
14467 M0 = (M0 >= 0 ? M0 % 4 : M0);
14468 M1 = (M1 >= 0 ? M1 % 4 : M1);
14469 if (M0 < 0 && M1 < 0)
14470 continue;
14471
14472 bool Match = false;
14473 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
14474 auto &DWordPair = DWordPairs[j];
14475 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
14476 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
14477 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
14478 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
14479 PSHUFDMask[DWord] = DOffset + j;
14480 Match = true;
14481 break;
14482 }
14483 }
14484 if (!Match) {
14485 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
14486 DWordPairs.push_back(std::make_pair(M0, M1));
14487 }
14488 }
14489
14490 if (DWordPairs.size() <= 2) {
14491 DWordPairs.resize(2, std::make_pair(-1, -1));
14492 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14493 DWordPairs[1].first, DWordPairs[1].second};
14494 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
14495 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
14496 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
14497 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
14498 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
14499 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
14500 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
14501 }
14502 if ((NumHToL + NumHToH) == 0)
14503 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14504 if ((NumLToL + NumLToH) == 0)
14505 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
14506 }
14507 }
14508
14509 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
14510 // such inputs we can swap two of the dwords across the half mark and end up
14511 // with <=2 inputs to each half in each half. Once there, we can fall through
14512 // to the generic code below. For example:
14513 //
14514 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14515 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
14516 //
14517 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
14518 // and an existing 2-into-2 on the other half. In this case we may have to
14519 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14520 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14521 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14522 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14523 // half than the one we target for fixing) will be fixed when we re-enter this
14524 // path. We will also combine away any sequence of PSHUFD instructions that
14525 // result into a single instruction. Here is an example of the tricky case:
14526 //
14527 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14528 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14529 //
14530 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14531 //
14532 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14533 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14534 //
14535 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14536 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14537 //
14538 // The result is fine to be handled by the generic logic.
14539 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14540 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14541 int AOffset, int BOffset) {
14542 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14543 "Must call this with A having 3 or 1 inputs from the A half.");
14544 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14545 "Must call this with B having 1 or 3 inputs from the B half.");
14546 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14547 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14548
14549 bool ThreeAInputs = AToAInputs.size() == 3;
14550
14551 // Compute the index of dword with only one word among the three inputs in
14552 // a half by taking the sum of the half with three inputs and subtracting
14553 // the sum of the actual three inputs. The difference is the remaining
14554 // slot.
14555 int ADWord = 0, BDWord = 0;
14556 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14557 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14558 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14559 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14560 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14561 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14562 int TripleNonInputIdx =
14563 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14564 TripleDWord = TripleNonInputIdx / 2;
14565
14566 // We use xor with one to compute the adjacent DWord to whichever one the
14567 // OneInput is in.
14568 OneInputDWord = (OneInput / 2) ^ 1;
14569
14570 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14571 // and BToA inputs. If there is also such a problem with the BToB and AToB
14572 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14573 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14574 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14575 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14576 // Compute how many inputs will be flipped by swapping these DWords. We
14577 // need
14578 // to balance this to ensure we don't form a 3-1 shuffle in the other
14579 // half.
14580 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14581 llvm::count(AToBInputs, 2 * ADWord + 1);
14582 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14583 llvm::count(BToBInputs, 2 * BDWord + 1);
14584 if ((NumFlippedAToBInputs == 1 &&
14585 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14586 (NumFlippedBToBInputs == 1 &&
14587 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14588 // We choose whether to fix the A half or B half based on whether that
14589 // half has zero flipped inputs. At zero, we may not be able to fix it
14590 // with that half. We also bias towards fixing the B half because that
14591 // will more commonly be the high half, and we have to bias one way.
14592 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14593 ArrayRef<int> Inputs) {
14594 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14595 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14596 // Determine whether the free index is in the flipped dword or the
14597 // unflipped dword based on where the pinned index is. We use this bit
14598 // in an xor to conditionally select the adjacent dword.
14599 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14600 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14601 if (IsFixIdxInput == IsFixFreeIdxInput)
14602 FixFreeIdx += 1;
14603 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14604 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14605 "We need to be changing the number of flipped inputs!");
14606 int PSHUFHalfMask[] = {0, 1, 2, 3};
14607 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14608 V = DAG.getNode(
14609 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14610 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14611 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14612
14613 for (int &M : Mask)
14614 if (M >= 0 && M == FixIdx)
14615 M = FixFreeIdx;
14616 else if (M >= 0 && M == FixFreeIdx)
14617 M = FixIdx;
14618 };
14619 if (NumFlippedBToBInputs != 0) {
14620 int BPinnedIdx =
14621 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14622 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14623 } else {
14624 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14625 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14626 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14627 }
14628 }
14629 }
14630
14631 int PSHUFDMask[] = {0, 1, 2, 3};
14632 PSHUFDMask[ADWord] = BDWord;
14633 PSHUFDMask[BDWord] = ADWord;
14634 V = DAG.getBitcast(
14635 VT,
14636 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14637 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14638
14639 // Adjust the mask to match the new locations of A and B.
14640 for (int &M : Mask)
14641 if (M >= 0 && M/2 == ADWord)
14642 M = 2 * BDWord + M % 2;
14643 else if (M >= 0 && M/2 == BDWord)
14644 M = 2 * ADWord + M % 2;
14645
14646 // Recurse back into this routine to re-compute state now that this isn't
14647 // a 3 and 1 problem.
14648 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14649 };
14650 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14651 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14652 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14653 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14654
14655 // At this point there are at most two inputs to the low and high halves from
14656 // each half. That means the inputs can always be grouped into dwords and
14657 // those dwords can then be moved to the correct half with a dword shuffle.
14658 // We use at most one low and one high word shuffle to collect these paired
14659 // inputs into dwords, and finally a dword shuffle to place them.
14660 int PSHUFLMask[4] = {-1, -1, -1, -1};
14661 int PSHUFHMask[4] = {-1, -1, -1, -1};
14662 int PSHUFDMask[4] = {-1, -1, -1, -1};
14663
14664 // First fix the masks for all the inputs that are staying in their
14665 // original halves. This will then dictate the targets of the cross-half
14666 // shuffles.
14667 auto fixInPlaceInputs =
14668 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14669 MutableArrayRef<int> SourceHalfMask,
14670 MutableArrayRef<int> HalfMask, int HalfOffset) {
14671 if (InPlaceInputs.empty())
14672 return;
14673 if (InPlaceInputs.size() == 1) {
14674 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14675 InPlaceInputs[0] - HalfOffset;
14676 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14677 return;
14678 }
14679 if (IncomingInputs.empty()) {
14680 // Just fix all of the in place inputs.
14681 for (int Input : InPlaceInputs) {
14682 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14683 PSHUFDMask[Input / 2] = Input / 2;
14684 }
14685 return;
14686 }
14687
14688 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14689 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14690 InPlaceInputs[0] - HalfOffset;
14691 // Put the second input next to the first so that they are packed into
14692 // a dword. We find the adjacent index by toggling the low bit.
14693 int AdjIndex = InPlaceInputs[0] ^ 1;
14694 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14695 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14696 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14697 };
14698 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14699 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14700
14701 // Now gather the cross-half inputs and place them into a free dword of
14702 // their target half.
14703 // FIXME: This operation could almost certainly be simplified dramatically to
14704 // look more like the 3-1 fixing operation.
14705 auto moveInputsToRightHalf = [&PSHUFDMask](
14706 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14707 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14708 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14709 int DestOffset) {
14710 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14711 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14712 };
14713 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14714 int Word) {
14715 int LowWord = Word & ~1;
14716 int HighWord = Word | 1;
14717 return isWordClobbered(SourceHalfMask, LowWord) ||
14718 isWordClobbered(SourceHalfMask, HighWord);
14719 };
14720
14721 if (IncomingInputs.empty())
14722 return;
14723
14724 if (ExistingInputs.empty()) {
14725 // Map any dwords with inputs from them into the right half.
14726 for (int Input : IncomingInputs) {
14727 // If the source half mask maps over the inputs, turn those into
14728 // swaps and use the swapped lane.
14729 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14730 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14731 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14732 Input - SourceOffset;
14733 // We have to swap the uses in our half mask in one sweep.
14734 for (int &M : HalfMask)
14735 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14736 M = Input;
14737 else if (M == Input)
14738 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14739 } else {
14740 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14741 Input - SourceOffset &&
14742 "Previous placement doesn't match!");
14743 }
14744 // Note that this correctly re-maps both when we do a swap and when
14745 // we observe the other side of the swap above. We rely on that to
14746 // avoid swapping the members of the input list directly.
14747 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14748 }
14749
14750 // Map the input's dword into the correct half.
14751 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14752 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14753 else
14754 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14755 Input / 2 &&
14756 "Previous placement doesn't match!");
14757 }
14758
14759 // And just directly shift any other-half mask elements to be same-half
14760 // as we will have mirrored the dword containing the element into the
14761 // same position within that half.
14762 for (int &M : HalfMask)
14763 if (M >= SourceOffset && M < SourceOffset + 4) {
14764 M = M - SourceOffset + DestOffset;
14765 assert(M >= 0 && "This should never wrap below zero!");
14766 }
14767 return;
14768 }
14769
14770 // Ensure we have the input in a viable dword of its current half. This
14771 // is particularly tricky because the original position may be clobbered
14772 // by inputs being moved and *staying* in that half.
14773 if (IncomingInputs.size() == 1) {
14774 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14775 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14776 SourceOffset;
14777 SourceHalfMask[InputFixed - SourceOffset] =
14778 IncomingInputs[0] - SourceOffset;
14779 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14780 IncomingInputs[0] = InputFixed;
14781 }
14782 } else if (IncomingInputs.size() == 2) {
14783 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14784 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14785 // We have two non-adjacent or clobbered inputs we need to extract from
14786 // the source half. To do this, we need to map them into some adjacent
14787 // dword slot in the source mask.
14788 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14789 IncomingInputs[1] - SourceOffset};
14790
14791 // If there is a free slot in the source half mask adjacent to one of
14792 // the inputs, place the other input in it. We use (Index XOR 1) to
14793 // compute an adjacent index.
14794 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14795 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14796 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14797 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14798 InputsFixed[1] = InputsFixed[0] ^ 1;
14799 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14800 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14801 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14802 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14803 InputsFixed[0] = InputsFixed[1] ^ 1;
14804 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14805 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14806 // The two inputs are in the same DWord but it is clobbered and the
14807 // adjacent DWord isn't used at all. Move both inputs to the free
14808 // slot.
14809 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14810 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14811 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14812 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14813 } else {
14814 // The only way we hit this point is if there is no clobbering
14815 // (because there are no off-half inputs to this half) and there is no
14816 // free slot adjacent to one of the inputs. In this case, we have to
14817 // swap an input with a non-input.
14818 for (int i = 0; i < 4; ++i)
14819 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14820 "We can't handle any clobbers here!");
14821 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14822 "Cannot have adjacent inputs here!");
14823
14824 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14825 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14826
14827 // We also have to update the final source mask in this case because
14828 // it may need to undo the above swap.
14829 for (int &M : FinalSourceHalfMask)
14830 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14831 M = InputsFixed[1] + SourceOffset;
14832 else if (M == InputsFixed[1] + SourceOffset)
14833 M = (InputsFixed[0] ^ 1) + SourceOffset;
14834
14835 InputsFixed[1] = InputsFixed[0] ^ 1;
14836 }
14837
14838 // Point everything at the fixed inputs.
14839 for (int &M : HalfMask)
14840 if (M == IncomingInputs[0])
14841 M = InputsFixed[0] + SourceOffset;
14842 else if (M == IncomingInputs[1])
14843 M = InputsFixed[1] + SourceOffset;
14844
14845 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14846 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14847 }
14848 } else {
14849 llvm_unreachable("Unhandled input size!");
14850 }
14851
14852 // Now hoist the DWord down to the right half.
14853 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14854 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14855 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14856 for (int &M : HalfMask)
14857 for (int Input : IncomingInputs)
14858 if (M == Input)
14859 M = FreeDWord * 2 + Input % 2;
14860 };
14861 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14862 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14863 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14864 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14865
14866 // Now enact all the shuffles we've computed to move the inputs into their
14867 // target half.
14868 if (!isNoopShuffleMask(PSHUFLMask))
14869 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14870 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14871 if (!isNoopShuffleMask(PSHUFHMask))
14872 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14873 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14874 if (!isNoopShuffleMask(PSHUFDMask))
14875 V = DAG.getBitcast(
14876 VT,
14877 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14878 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14879
14880 // At this point, each half should contain all its inputs, and we can then
14881 // just shuffle them into their final position.
14882 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14883 "Failed to lift all the high half inputs to the low mask!");
14884 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14885 "Failed to lift all the low half inputs to the high mask!");
14886
14887 // Do a half shuffle for the low mask.
14888 if (!isNoopShuffleMask(LoMask))
14889 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14890 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14891
14892 // Do a half shuffle with the high mask after shifting its values down.
14893 for (int &M : HiMask)
14894 if (M >= 0)
14895 M -= 4;
14896 if (!isNoopShuffleMask(HiMask))
14897 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14898 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14899
14900 return V;
14901}
14902
14903/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14904/// blend if only one input is used.
14906 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14907 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14909 "Lane crossing shuffle masks not supported");
14910
14911 int NumBytes = VT.getSizeInBits() / 8;
14912 int Size = Mask.size();
14913 int Scale = NumBytes / Size;
14914
14915 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14916 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14917 V1InUse = false;
14918 V2InUse = false;
14919
14920 for (int i = 0; i < NumBytes; ++i) {
14921 int M = Mask[i / Scale];
14922 if (M < 0)
14923 continue;
14924
14925 const int ZeroMask = 0x80;
14926 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14927 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14928 if (Zeroable[i / Scale])
14929 V1Idx = V2Idx = ZeroMask;
14930
14931 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14932 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14933 V1InUse |= (ZeroMask != V1Idx);
14934 V2InUse |= (ZeroMask != V2Idx);
14935 }
14936
14937 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14938 if (V1InUse)
14939 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14940 DAG.getBuildVector(ShufVT, DL, V1Mask));
14941 if (V2InUse)
14942 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14943 DAG.getBuildVector(ShufVT, DL, V2Mask));
14944
14945 // If we need shuffled inputs from both, blend the two.
14946 SDValue V;
14947 if (V1InUse && V2InUse)
14948 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14949 else
14950 V = V1InUse ? V1 : V2;
14951
14952 // Cast the result back to the correct type.
14953 return DAG.getBitcast(VT, V);
14954}
14955
14956/// Generic lowering of 8-lane i16 shuffles.
14957///
14958/// This handles both single-input shuffles and combined shuffle/blends with
14959/// two inputs. The single input shuffles are immediately delegated to
14960/// a dedicated lowering routine.
14961///
14962/// The blends are lowered in one of three fundamental ways. If there are few
14963/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14964/// of the input is significantly cheaper when lowered as an interleaving of
14965/// the two inputs, try to interleave them. Otherwise, blend the low and high
14966/// halves of the inputs separately (making them have relatively few inputs)
14967/// and then concatenate them.
14969 const APInt &Zeroable, SDValue V1, SDValue V2,
14970 const X86Subtarget &Subtarget,
14971 SelectionDAG &DAG) {
14972 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14973 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14974 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14975
14976 // Whenever we can lower this as a zext, that instruction is strictly faster
14977 // than any alternative.
14978 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14979 Zeroable, Subtarget, DAG))
14980 return ZExt;
14981
14982 // Try to use lower using a truncation.
14983 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14984 Subtarget, DAG))
14985 return V;
14986
14987 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14988
14989 if (NumV2Inputs == 0) {
14990 // Try to use shift instructions.
14991 if (SDValue Shift =
14992 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14993 Subtarget, DAG, /*BitwiseOnly*/ false))
14994 return Shift;
14995
14996 // Check for being able to broadcast a single element.
14997 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14998 Mask, Subtarget, DAG))
14999 return Broadcast;
15000
15001 // Try to use bit rotation instructions.
15002 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15003 Subtarget, DAG))
15004 return Rotate;
15005
15006 // Use dedicated unpack instructions for masks that match their pattern.
15007 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
15008 return V;
15009
15010 // Use dedicated pack instructions for masks that match their pattern.
15011 if (SDValue V =
15012 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
15013 return V;
15014
15015 // Try to use byte rotation instructions.
15016 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15017 Subtarget, DAG))
15018 return Rotate;
15019
15020 // Make a copy of the mask so it can be modified.
15021 SmallVector<int, 8> MutableMask(Mask);
15022 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15023 Subtarget, DAG);
15024 }
15025
15026 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
15027 "All single-input shuffles should be canonicalized to be V1-input "
15028 "shuffles.");
15029
15030 // Try to use shift instructions.
15031 if (SDValue Shift =
15032 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
15033 DAG, /*BitwiseOnly*/ false))
15034 return Shift;
15035
15036 // See if we can use SSE4A Extraction / Insertion.
15037 if (Subtarget.hasSSE4A())
15038 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15039 Zeroable, DAG))
15040 return V;
15041
15042 // There are special ways we can lower some single-element blends.
15043 if (NumV2Inputs == 1)
15045 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15046 return V;
15047
15048 // We have different paths for blend lowering, but they all must use the
15049 // *exact* same predicate.
15050 bool IsBlendSupported = Subtarget.hasSSE41();
15051 if (IsBlendSupported)
15052 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15053 Zeroable, Subtarget, DAG))
15054 return Blend;
15055
15056 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15057 Zeroable, Subtarget, DAG))
15058 return Masked;
15059
15060 // Use dedicated unpack instructions for masks that match their pattern.
15061 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
15062 return V;
15063
15064 // Use dedicated pack instructions for masks that match their pattern.
15065 if (SDValue V =
15066 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
15067 return V;
15068
15069 // Try to use lower using a truncation.
15070 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15071 Subtarget, DAG))
15072 return V;
15073
15074 // Try to use byte rotation instructions.
15075 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15076 Subtarget, DAG))
15077 return Rotate;
15078
15079 if (SDValue BitBlend =
15080 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15081 return BitBlend;
15082
15083 // Try to use byte shift instructions to mask.
15084 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15085 Zeroable, Subtarget, DAG))
15086 return V;
15087
15088 // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
15089 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
15090 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
15091 !Subtarget.hasVLX()) {
15092 // Check if this is part of a 256-bit vector truncation.
15093 unsigned PackOpc = 0;
15094 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
15097 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
15098 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
15099 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
15100 DAG.getTargetConstant(0xEE, DL, MVT::i8));
15101 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
15102 V1 = extract128BitVector(V1V2, 0, DAG, DL);
15103 V2 = extract128BitVector(V1V2, 4, DAG, DL);
15104 PackOpc = X86ISD::PACKUS;
15105 } else if (Subtarget.hasSSE41()) {
15106 SmallVector<SDValue, 4> DWordClearOps(4,
15107 DAG.getConstant(0, DL, MVT::i32));
15108 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15109 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15110 SDValue DWordClearMask =
15111 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
15112 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15113 DWordClearMask);
15114 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15115 DWordClearMask);
15116 PackOpc = X86ISD::PACKUS;
15117 } else if (!Subtarget.hasSSSE3()) {
15118 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
15119 V1 = DAG.getBitcast(MVT::v4i32, V1);
15120 V2 = DAG.getBitcast(MVT::v4i32, V2);
15121 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
15122 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
15123 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
15124 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
15125 PackOpc = X86ISD::PACKSS;
15126 }
15127 if (PackOpc) {
15128 // Now pack things back together.
15129 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
15130 if (NumEvenDrops == 2) {
15131 Result = DAG.getBitcast(MVT::v4i32, Result);
15132 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
15133 }
15134 return Result;
15135 }
15136 }
15137
15138 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
15139 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
15140 if (NumOddDrops == 1) {
15141 bool HasSSE41 = Subtarget.hasSSE41();
15142 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
15143 DAG.getBitcast(MVT::v4i32, V1),
15144 DAG.getTargetConstant(16, DL, MVT::i8));
15145 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
15146 DAG.getBitcast(MVT::v4i32, V2),
15147 DAG.getTargetConstant(16, DL, MVT::i8));
15148 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
15149 MVT::v8i16, V1, V2);
15150 }
15151
15152 // Try to lower by permuting the inputs into an unpack instruction.
15153 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15154 Mask, Subtarget, DAG))
15155 return Unpack;
15156
15157 // If we can't directly blend but can use PSHUFB, that will be better as it
15158 // can both shuffle and set up the inefficient blend.
15159 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15160 bool V1InUse, V2InUse;
15161 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15162 Zeroable, DAG, V1InUse, V2InUse);
15163 }
15164
15165 // We can always bit-blend if we have to so the fallback strategy is to
15166 // decompose into single-input permutes and blends/unpacks.
15167 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
15168 Zeroable, Subtarget, DAG);
15169}
15170
15171/// Lower 8-lane 16-bit floating point shuffles.
15173 const APInt &Zeroable, SDValue V1, SDValue V2,
15174 const X86Subtarget &Subtarget,
15175 SelectionDAG &DAG) {
15176 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15177 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15178 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15179 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
15180
15181 if (Subtarget.hasFP16()) {
15182 if (NumV2Elements == 0) {
15183 // Check for being able to broadcast a single element.
15184 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
15185 Mask, Subtarget, DAG))
15186 return Broadcast;
15187 }
15188 if (NumV2Elements == 1 && Mask[0] >= 8)
15190 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15191 return V;
15192 }
15193
15194 V1 = DAG.getBitcast(MVT::v8i16, V1);
15195 V2 = DAG.getBitcast(MVT::v8i16, V2);
15196 return DAG.getBitcast(MVT::v8f16,
15197 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15198}
15199
15200// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
15201// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
15202// the active subvector is extracted.
15204 ArrayRef<int> OriginalMask, SDValue V1,
15205 SDValue V2, const X86Subtarget &Subtarget,
15206 SelectionDAG &DAG) {
15207 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
15208 SmallVector<int, 32> Mask(OriginalMask);
15209 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
15210 !isShuffleFoldableLoad(V2)) {
15212 std::swap(V1, V2);
15213 }
15214
15215 MVT MaskVT = VT.changeTypeToInteger();
15216 SDValue MaskNode;
15217 MVT ShuffleVT = VT;
15218 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
15219 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15220 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15221 ShuffleVT = V1.getSimpleValueType();
15222
15223 // Adjust mask to correct indices for the second input.
15224 int NumElts = VT.getVectorNumElements();
15225 unsigned Scale = 512 / VT.getSizeInBits();
15226 SmallVector<int, 32> AdjustedMask(Mask);
15227 for (int &M : AdjustedMask)
15228 if (NumElts <= M)
15229 M += (Scale - 1) * NumElts;
15230 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15231 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15232 } else {
15233 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15234 }
15235
15236 SDValue Result;
15237 if (V2.isUndef())
15238 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15239 else
15240 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15241
15242 if (VT != ShuffleVT)
15243 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15244
15245 return Result;
15246}
15247
15248/// Generic lowering of v16i8 shuffles.
15249///
15250/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15251/// detect any complexity reducing interleaving. If that doesn't help, it uses
15252/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15253/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15254/// back together.
15256 const APInt &Zeroable, SDValue V1, SDValue V2,
15257 const X86Subtarget &Subtarget,
15258 SelectionDAG &DAG) {
15259 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15260 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15261 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15262
15263 // Try to use shift instructions.
15264 if (SDValue Shift =
15265 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
15266 DAG, /*BitwiseOnly*/ false))
15267 return Shift;
15268
15269 // Try to use byte rotation instructions.
15270 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15271 Subtarget, DAG))
15272 return Rotate;
15273
15274 // Use dedicated pack instructions for masks that match their pattern.
15275 if (SDValue V =
15276 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15277 return V;
15278
15279 // Try to use a zext lowering.
15280 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15281 Zeroable, Subtarget, DAG))
15282 return ZExt;
15283
15284 // Try to use lower using a truncation.
15285 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15286 Subtarget, DAG))
15287 return V;
15288
15289 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15290 Subtarget, DAG))
15291 return V;
15292
15293 // See if we can use SSE4A Extraction / Insertion.
15294 if (Subtarget.hasSSE4A())
15295 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
15296 Zeroable, DAG))
15297 return V;
15298
15299 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15300
15301 // For single-input shuffles, there are some nicer lowering tricks we can use.
15302 if (NumV2Elements == 0) {
15303 // Check for being able to broadcast a single element.
15304 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
15305 Mask, Subtarget, DAG))
15306 return Broadcast;
15307
15308 // Try to use bit rotation instructions.
15309 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
15310 Subtarget, DAG))
15311 return Rotate;
15312
15313 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
15314 return V;
15315
15316 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
15317 // Notably, this handles splat and partial-splat shuffles more efficiently.
15318 // However, it only makes sense if the pre-duplication shuffle simplifies
15319 // things significantly. Currently, this means we need to be able to
15320 // express the pre-duplication shuffle as an i16 shuffle.
15321 //
15322 // FIXME: We should check for other patterns which can be widened into an
15323 // i16 shuffle as well.
15324 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
15325 for (int i = 0; i < 16; i += 2)
15326 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15327 return false;
15328
15329 return true;
15330 };
15331 auto tryToWidenViaDuplication = [&]() -> SDValue {
15332 if (!canWidenViaDuplication(Mask))
15333 return SDValue();
15334 SmallVector<int, 4> LoInputs;
15335 copy_if(Mask, std::back_inserter(LoInputs),
15336 [](int M) { return M >= 0 && M < 8; });
15337 array_pod_sort(LoInputs.begin(), LoInputs.end());
15338 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
15339 SmallVector<int, 4> HiInputs;
15340 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
15341 array_pod_sort(HiInputs.begin(), HiInputs.end());
15342 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
15343
15344 bool TargetLo = LoInputs.size() >= HiInputs.size();
15345 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15346 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15347
15348 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
15350 for (int I : InPlaceInputs) {
15351 PreDupI16Shuffle[I/2] = I/2;
15352 LaneMap[I] = I;
15353 }
15354 int j = TargetLo ? 0 : 4, je = j + 4;
15355 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
15356 // Check if j is already a shuffle of this input. This happens when
15357 // there are two adjacent bytes after we move the low one.
15358 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15359 // If we haven't yet mapped the input, search for a slot into which
15360 // we can map it.
15361 while (j < je && PreDupI16Shuffle[j] >= 0)
15362 ++j;
15363
15364 if (j == je)
15365 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
15366 return SDValue();
15367
15368 // Map this input with the i16 shuffle.
15369 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15370 }
15371
15372 // Update the lane map based on the mapping we ended up with.
15373 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15374 }
15375 V1 = DAG.getBitcast(
15376 MVT::v16i8,
15377 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15378 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15379
15380 // Unpack the bytes to form the i16s that will be shuffled into place.
15381 bool EvenInUse = false, OddInUse = false;
15382 for (int i = 0; i < 16; i += 2) {
15383 EvenInUse |= (Mask[i + 0] >= 0);
15384 OddInUse |= (Mask[i + 1] >= 0);
15385 if (EvenInUse && OddInUse)
15386 break;
15387 }
15388 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
15389 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
15390 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
15391
15392 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15393 for (int i = 0; i < 16; ++i)
15394 if (Mask[i] >= 0) {
15395 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15396 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
15397 if (PostDupI16Shuffle[i / 2] < 0)
15398 PostDupI16Shuffle[i / 2] = MappedMask;
15399 else
15400 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
15401 "Conflicting entries in the original shuffle!");
15402 }
15403 return DAG.getBitcast(
15404 MVT::v16i8,
15405 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15406 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15407 };
15408 if (SDValue V = tryToWidenViaDuplication())
15409 return V;
15410 }
15411
15412 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
15413 Zeroable, Subtarget, DAG))
15414 return Masked;
15415
15416 // Use dedicated unpack instructions for masks that match their pattern.
15417 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
15418 return V;
15419
15420 // Try to use byte shift instructions to mask.
15421 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
15422 Zeroable, Subtarget, DAG))
15423 return V;
15424
15425 // Check for compaction patterns.
15426 bool IsSingleInput = V2.isUndef();
15427 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
15428
15429 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
15430 // with PSHUFB. It is important to do this before we attempt to generate any
15431 // blends but after all of the single-input lowerings. If the single input
15432 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
15433 // want to preserve that and we can DAG combine any longer sequences into
15434 // a PSHUFB in the end. But once we start blending from multiple inputs,
15435 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
15436 // and there are *very* few patterns that would actually be faster than the
15437 // PSHUFB approach because of its ability to zero lanes.
15438 //
15439 // If the mask is a binary compaction, we can more efficiently perform this
15440 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
15441 //
15442 // FIXME: The only exceptions to the above are blends which are exact
15443 // interleavings with direct instructions supporting them. We currently don't
15444 // handle those well here.
15445 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15446 bool V1InUse = false;
15447 bool V2InUse = false;
15448
15450 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15451
15452 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
15453 // do so. This avoids using them to handle blends-with-zero which is
15454 // important as a single pshufb is significantly faster for that.
15455 if (V1InUse && V2InUse) {
15456 if (Subtarget.hasSSE41())
15457 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
15458 Zeroable, Subtarget, DAG))
15459 return Blend;
15460
15461 // We can use an unpack to do the blending rather than an or in some
15462 // cases. Even though the or may be (very minorly) more efficient, we
15463 // preference this lowering because there are common cases where part of
15464 // the complexity of the shuffles goes away when we do the final blend as
15465 // an unpack.
15466 // FIXME: It might be worth trying to detect if the unpack-feeding
15467 // shuffles will both be pshufb, in which case we shouldn't bother with
15468 // this.
15470 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15471 return Unpack;
15472
15473 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
15474 if (Subtarget.hasVBMI())
15475 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
15476 DAG);
15477
15478 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
15479 if (Subtarget.hasXOP()) {
15480 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
15481 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
15482 }
15483
15484 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
15485 // PALIGNR will be cheaper than the second PSHUFB+OR.
15487 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15488 return V;
15489 }
15490
15491 return PSHUFB;
15492 }
15493
15494 // There are special ways we can lower some single-element blends.
15495 if (NumV2Elements == 1)
15497 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15498 return V;
15499
15500 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
15501 return Blend;
15502
15503 // Check whether a compaction lowering can be done. This handles shuffles
15504 // which take every Nth element for some even N. See the helper function for
15505 // details.
15506 //
15507 // We special case these as they can be particularly efficiently handled with
15508 // the PACKUSB instruction on x86 and they show up in common patterns of
15509 // rearranging bytes to truncate wide elements.
15510 if (NumEvenDrops) {
15511 // NumEvenDrops is the power of two stride of the elements. Another way of
15512 // thinking about it is that we need to drop the even elements this many
15513 // times to get the original input.
15514
15515 // First we need to zero all the dropped bytes.
15516 assert(NumEvenDrops <= 3 &&
15517 "No support for dropping even elements more than 3 times.");
15518 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15519 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15520 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15521 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15522 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15523 WordClearMask);
15524 if (!IsSingleInput)
15525 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15526 WordClearMask);
15527
15528 // Now pack things back together.
15529 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15530 IsSingleInput ? V1 : V2);
15531 for (int i = 1; i < NumEvenDrops; ++i) {
15532 Result = DAG.getBitcast(MVT::v8i16, Result);
15533 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15534 }
15535 return Result;
15536 }
15537
15538 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
15539 if (NumOddDrops == 1) {
15540 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15541 DAG.getBitcast(MVT::v8i16, V1),
15542 DAG.getTargetConstant(8, DL, MVT::i8));
15543 if (!IsSingleInput)
15544 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15545 DAG.getBitcast(MVT::v8i16, V2),
15546 DAG.getTargetConstant(8, DL, MVT::i8));
15547 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15548 IsSingleInput ? V1 : V2);
15549 }
15550
15551 // Handle multi-input cases by blending/unpacking single-input shuffles.
15552 if (NumV2Elements > 0)
15553 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15554 Zeroable, Subtarget, DAG);
15555
15556 // The fallback path for single-input shuffles widens this into two v8i16
15557 // vectors with unpacks, shuffles those, and then pulls them back together
15558 // with a pack.
15559 SDValue V = V1;
15560
15561 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15562 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15563 for (int i = 0; i < 16; ++i)
15564 if (Mask[i] >= 0)
15565 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15566
15567 SDValue VLoHalf, VHiHalf;
15568 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15569 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15570 // i16s.
15571 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15572 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15573 // Use a mask to drop the high bytes.
15574 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15575 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15576 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15577
15578 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15579 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15580
15581 // Squash the masks to point directly into VLoHalf.
15582 for (int &M : LoBlendMask)
15583 if (M >= 0)
15584 M /= 2;
15585 for (int &M : HiBlendMask)
15586 if (M >= 0)
15587 M /= 2;
15588 } else {
15589 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15590 // VHiHalf so that we can blend them as i16s.
15591 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15592
15593 VLoHalf = DAG.getBitcast(
15594 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15595 VHiHalf = DAG.getBitcast(
15596 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15597 }
15598
15599 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15600 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15601
15602 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15603}
15604
15605/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15606///
15607/// This routine breaks down the specific type of 128-bit shuffle and
15608/// dispatches to the lowering routines accordingly.
15610 MVT VT, SDValue V1, SDValue V2,
15611 const APInt &Zeroable,
15612 const X86Subtarget &Subtarget,
15613 SelectionDAG &DAG) {
15614 if (VT == MVT::v8bf16) {
15615 V1 = DAG.getBitcast(MVT::v8i16, V1);
15616 V2 = DAG.getBitcast(MVT::v8i16, V2);
15617 return DAG.getBitcast(VT,
15618 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15619 }
15620
15621 switch (VT.SimpleTy) {
15622 case MVT::v2i64:
15623 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15624 case MVT::v2f64:
15625 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15626 case MVT::v4i32:
15627 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15628 case MVT::v4f32:
15629 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15630 case MVT::v8i16:
15631 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15632 case MVT::v8f16:
15633 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15634 case MVT::v16i8:
15635 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15636
15637 default:
15638 llvm_unreachable("Unimplemented!");
15639 }
15640}
15641
15642/// Generic routine to split vector shuffle into half-sized shuffles.
15643///
15644/// This routine just extracts two subvectors, shuffles them independently, and
15645/// then concatenates them back together. This should work effectively with all
15646/// AVX vector shuffle types.
15648 SDValue V2, ArrayRef<int> Mask,
15649 SelectionDAG &DAG, bool SimpleOnly) {
15650 assert(VT.getSizeInBits() >= 256 &&
15651 "Only for 256-bit or wider vector shuffles!");
15652 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15653 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15654
15655 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15656 if (VT == MVT::v8f32) {
15657 SDValue BC1 = peekThroughBitcasts(V1);
15658 SDValue BC2 = peekThroughBitcasts(V2);
15659 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15660 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15661 DAG, SimpleOnly))
15662 return DAG.getBitcast(VT, Split);
15663 }
15664 }
15665
15666 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15667 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15668
15669 int NumElements = VT.getVectorNumElements();
15670 int SplitNumElements = NumElements / 2;
15671 MVT ScalarVT = VT.getVectorElementType();
15672 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15673
15674 // Use splitVector/extractSubVector so that split build-vectors just build two
15675 // narrower build vectors. This helps shuffling with splats and zeros.
15676 auto SplitVector = [&](SDValue V) {
15677 SDValue LoV, HiV;
15678 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15679 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15680 DAG.getBitcast(SplitVT, HiV));
15681 };
15682
15683 SDValue LoV1, HiV1, LoV2, HiV2;
15684 std::tie(LoV1, HiV1) = SplitVector(V1);
15685 std::tie(LoV2, HiV2) = SplitVector(V2);
15686
15687 // Now create two 4-way blends of these half-width vectors.
15688 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15689 bool &UseHiV1, bool &UseLoV2,
15690 bool &UseHiV2) {
15691 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15692 for (int i = 0; i < SplitNumElements; ++i) {
15693 int M = HalfMask[i];
15694 if (M >= NumElements) {
15695 if (M >= NumElements + SplitNumElements)
15696 UseHiV2 = true;
15697 else
15698 UseLoV2 = true;
15699 } else if (M >= 0) {
15700 if (M >= SplitNumElements)
15701 UseHiV1 = true;
15702 else
15703 UseLoV1 = true;
15704 }
15705 }
15706 };
15707
15708 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15709 if (!SimpleOnly)
15710 return true;
15711
15712 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15713 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15714
15715 return !(UseHiV1 || UseHiV2);
15716 };
15717
15718 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15719 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15720 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15721 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15722 for (int i = 0; i < SplitNumElements; ++i) {
15723 int M = HalfMask[i];
15724 if (M >= NumElements) {
15725 V2BlendMask[i] = M - NumElements;
15726 BlendMask[i] = SplitNumElements + i;
15727 } else if (M >= 0) {
15728 V1BlendMask[i] = M;
15729 BlendMask[i] = i;
15730 }
15731 }
15732
15733 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15734 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15735
15736 // Because the lowering happens after all combining takes place, we need to
15737 // manually combine these blend masks as much as possible so that we create
15738 // a minimal number of high-level vector shuffle nodes.
15739 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15740
15741 // First try just blending the halves of V1 or V2.
15742 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15743 return DAG.getUNDEF(SplitVT);
15744 if (!UseLoV2 && !UseHiV2)
15745 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15746 if (!UseLoV1 && !UseHiV1)
15747 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15748
15749 SDValue V1Blend, V2Blend;
15750 if (UseLoV1 && UseHiV1) {
15751 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15752 } else {
15753 // We only use half of V1 so map the usage down into the final blend mask.
15754 V1Blend = UseLoV1 ? LoV1 : HiV1;
15755 for (int i = 0; i < SplitNumElements; ++i)
15756 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15757 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15758 }
15759 if (UseLoV2 && UseHiV2) {
15760 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15761 } else {
15762 // We only use half of V2 so map the usage down into the final blend mask.
15763 V2Blend = UseLoV2 ? LoV2 : HiV2;
15764 for (int i = 0; i < SplitNumElements; ++i)
15765 if (BlendMask[i] >= SplitNumElements)
15766 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15767 }
15768 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15769 };
15770
15771 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15772 return SDValue();
15773
15774 SDValue Lo = HalfBlend(LoMask);
15775 SDValue Hi = HalfBlend(HiMask);
15776 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15777}
15778
15779/// Either split a vector in halves or decompose the shuffles and the
15780/// blend/unpack.
15781///
15782/// This is provided as a good fallback for many lowerings of non-single-input
15783/// shuffles with more than one 128-bit lane. In those cases, we want to select
15784/// between splitting the shuffle into 128-bit components and stitching those
15785/// back together vs. extracting the single-input shuffles and blending those
15786/// results.
15788 SDValue V2, ArrayRef<int> Mask,
15789 const APInt &Zeroable,
15790 const X86Subtarget &Subtarget,
15791 SelectionDAG &DAG) {
15792 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15793 "shuffles as it could then recurse on itself.");
15794 int Size = Mask.size();
15795
15796 // If this can be modeled as a broadcast of two elements followed by a blend,
15797 // prefer that lowering. This is especially important because broadcasts can
15798 // often fold with memory operands.
15799 auto DoBothBroadcast = [&] {
15800 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15801 for (int M : Mask)
15802 if (M >= Size) {
15803 if (V2BroadcastIdx < 0)
15804 V2BroadcastIdx = M - Size;
15805 else if ((M - Size) != V2BroadcastIdx &&
15806 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15807 return false;
15808 } else if (M >= 0) {
15809 if (V1BroadcastIdx < 0)
15810 V1BroadcastIdx = M;
15811 else if (M != V1BroadcastIdx &&
15812 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15813 return false;
15814 }
15815 return true;
15816 };
15817 if (DoBothBroadcast())
15818 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15819 Subtarget, DAG);
15820
15821 // If the inputs all stem from a single 128-bit lane of each input, then we
15822 // split them rather than blending because the split will decompose to
15823 // unusually few instructions.
15824 int LaneCount = VT.getSizeInBits() / 128;
15825 int LaneSize = Size / LaneCount;
15826 SmallBitVector LaneInputs[2];
15827 LaneInputs[0].resize(LaneCount, false);
15828 LaneInputs[1].resize(LaneCount, false);
15829 for (int i = 0; i < Size; ++i)
15830 if (Mask[i] >= 0)
15831 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15832 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15833 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15834 /*SimpleOnly*/ false);
15835
15836 // Without AVX2, if we can freely split the subvectors then we're better off
15837 // performing half width shuffles.
15838 if (!Subtarget.hasAVX2()) {
15839 SDValue BC1 = peekThroughBitcasts(V1);
15840 SDValue BC2 = peekThroughBitcasts(V2);
15841 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15842 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15843 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15844 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15845 if (SplatOrSplitV1 && SplatOrSplitV2)
15846 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15847 /*SimpleOnly*/ false);
15848 }
15849
15850 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15851 // requires that the decomposed single-input shuffles don't end up here.
15852 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15853 Subtarget, DAG);
15854}
15855
15856// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15857// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15859 SDValue V1, SDValue V2,
15860 ArrayRef<int> Mask,
15861 SelectionDAG &DAG) {
15862 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15863
15864 int LHSMask[4] = {-1, -1, -1, -1};
15865 int RHSMask[4] = {-1, -1, -1, -1};
15866 int SHUFPDMask[4] = {-1, -1, -1, -1};
15867
15868 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15869 // perform the shuffle once the lanes have been shuffled in place.
15870 for (int i = 0; i != 4; ++i) {
15871 int M = Mask[i];
15872 if (M < 0)
15873 continue;
15874 int LaneBase = i & ~1;
15875 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15876 LaneMask[LaneBase + (M & 1)] = M;
15877 SHUFPDMask[i] = M & 1;
15878 }
15879
15880 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15881 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15882 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15883 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15884}
15885
15886/// Lower a vector shuffle crossing multiple 128-bit lanes as
15887/// a lane permutation followed by a per-lane permutation.
15888///
15889/// This is mainly for cases where we can have non-repeating permutes
15890/// in each lane.
15891///
15892/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15893/// we should investigate merging them.
15895 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15896 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15897 int NumElts = VT.getVectorNumElements();
15898 int NumLanes = VT.getSizeInBits() / 128;
15899 int NumEltsPerLane = NumElts / NumLanes;
15900 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15901
15902 /// Attempts to find a sublane permute with the given size
15903 /// that gets all elements into their target lanes.
15904 ///
15905 /// If successful, fills CrossLaneMask and InLaneMask and returns true.
15906 /// If unsuccessful, returns false and may overwrite InLaneMask.
15907 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15908 int NumSublanesPerLane = NumSublanes / NumLanes;
15909 int NumEltsPerSublane = NumElts / NumSublanes;
15910
15911 SmallVector<int, 16> CrossLaneMask;
15912 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15913 // CrossLaneMask but one entry == one sublane.
15914 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15915 APInt DemandedCrossLane = APInt::getZero(NumElts);
15916
15917 for (int i = 0; i != NumElts; ++i) {
15918 int M = Mask[i];
15919 if (M < 0)
15920 continue;
15921
15922 int SrcSublane = M / NumEltsPerSublane;
15923 int DstLane = i / NumEltsPerLane;
15924
15925 // We only need to get the elements into the right lane, not sublane.
15926 // So search all sublanes that make up the destination lane.
15927 bool Found = false;
15928 int DstSubStart = DstLane * NumSublanesPerLane;
15929 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15930 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15931 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15932 continue;
15933
15934 Found = true;
15935 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15936 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15937 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15938 DemandedCrossLane.setBit(InLaneMask[i]);
15939 break;
15940 }
15941 if (!Found)
15942 return SDValue();
15943 }
15944
15945 // Fill CrossLaneMask using CrossLaneMaskLarge.
15946 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15947
15948 if (!CanUseSublanes) {
15949 // If we're only shuffling a single lowest lane and the rest are identity
15950 // then don't bother.
15951 // TODO - isShuffleMaskInputInPlace could be extended to something like
15952 // this.
15953 int NumIdentityLanes = 0;
15954 bool OnlyShuffleLowestLane = true;
15955 for (int i = 0; i != NumLanes; ++i) {
15956 int LaneOffset = i * NumEltsPerLane;
15957 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15958 i * NumEltsPerLane))
15959 NumIdentityLanes++;
15960 else if (CrossLaneMask[LaneOffset] != 0)
15961 OnlyShuffleLowestLane = false;
15962 }
15963 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15964 return SDValue();
15965 }
15966
15967 // Simplify CrossLaneMask based on the actual demanded elements.
15968 if (V1.hasOneUse())
15969 for (int i = 0; i != NumElts; ++i)
15970 if (!DemandedCrossLane[i])
15971 CrossLaneMask[i] = SM_SentinelUndef;
15972
15973 // Avoid returning the same shuffle operation. For example,
15974 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15975 // undef:v16i16
15976 if (CrossLaneMask == Mask || InLaneMask == Mask)
15977 return SDValue();
15978
15979 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15980 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15981 InLaneMask);
15982 };
15983
15984 // First attempt a solution with full lanes.
15985 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15986 return V;
15987
15988 // The rest of the solutions use sublanes.
15989 if (!CanUseSublanes)
15990 return SDValue();
15991
15992 // Then attempt a solution with 64-bit sublanes (vpermq).
15993 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15994 return V;
15995
15996 // If that doesn't work and we have fast variable cross-lane shuffle,
15997 // attempt 32-bit sublanes (vpermd).
15998 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15999 return SDValue();
16000
16001 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
16002}
16003
16004/// Helper to get compute inlane shuffle mask for a complete shuffle mask.
16005static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
16006 SmallVector<int> &InLaneMask) {
16007 int Size = Mask.size();
16008 InLaneMask.assign(Mask.begin(), Mask.end());
16009 for (int i = 0; i < Size; ++i) {
16010 int &M = InLaneMask[i];
16011 if (M < 0)
16012 continue;
16013 if (((M % Size) / LaneSize) != (i / LaneSize))
16014 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
16015 }
16016}
16017
16018/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
16019/// source with a lane permutation.
16020///
16021/// This lowering strategy results in four instructions in the worst case for a
16022/// single-input cross lane shuffle which is lower than any other fully general
16023/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
16024/// shuffle pattern should be handled prior to trying this lowering.
16026 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16027 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16028 // FIXME: This should probably be generalized for 512-bit vectors as well.
16029 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
16030 int Size = Mask.size();
16031 int LaneSize = Size / 2;
16032
16033 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16034 // Only do this if the elements aren't all from the lower lane,
16035 // otherwise we're (probably) better off doing a split.
16036 if (VT == MVT::v4f64 &&
16037 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
16038 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
16039
16040 // If there are only inputs from one 128-bit lane, splitting will in fact be
16041 // less expensive. The flags track whether the given lane contains an element
16042 // that crosses to another lane.
16043 bool AllLanes;
16044 if (!Subtarget.hasAVX2()) {
16045 bool LaneCrossing[2] = {false, false};
16046 for (int i = 0; i < Size; ++i)
16047 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
16048 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
16049 AllLanes = LaneCrossing[0] && LaneCrossing[1];
16050 } else {
16051 bool LaneUsed[2] = {false, false};
16052 for (int i = 0; i < Size; ++i)
16053 if (Mask[i] >= 0)
16054 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
16055 AllLanes = LaneUsed[0] && LaneUsed[1];
16056 }
16057
16058 // TODO - we could support shuffling V2 in the Flipped input.
16059 assert(V2.isUndef() &&
16060 "This last part of this routine only works on single input shuffles");
16061
16062 SmallVector<int> InLaneMask;
16063 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16064
16065 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
16066 "In-lane shuffle mask expected");
16067
16068 // If we're not using both lanes in each lane and the inlane mask is not
16069 // repeating, then we're better off splitting.
16070 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
16071 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
16072 /*SimpleOnly*/ false);
16073
16074 // Flip the lanes, and shuffle the results which should now be in-lane.
16075 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
16076 SDValue Flipped = DAG.getBitcast(PVT, V1);
16077 Flipped =
16078 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
16079 Flipped = DAG.getBitcast(VT, Flipped);
16080 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
16081}
16082
16083/// Handle lowering 2-lane 128-bit shuffles.
///
/// Tries, in order: a SUBV_BROADCAST_LOAD for splat-of-half patterns, an
/// INSERT_SUBVECTOR into a zero vector, a blend, a single 128-bit subvector
/// insert, AVX512VL SHUF128, and finally a VPERM2X128 with an immediate
/// control byte. Returns SDValue() to defer to other lowering strategies.
/// NOTE(review): this scraped view drops original line 16084 (the start of
/// the function signature) — verify against upstream X86ISelLowering.cpp.
 16085                                  SDValue V2, ArrayRef<int> Mask,
 16086                                  const APInt &Zeroable,
 16087                                  const X86Subtarget &Subtarget,
 16088                                  SelectionDAG &DAG) {
 16089  if (V2.isUndef()) {
 16090    // Attempt to match VBROADCAST*128 subvector broadcast load.
 16091    bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
 16092    bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
 16093    if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
    // NOTE(review): line 16094 is missing from this view — presumably the
    // rest of the condition (checking V1 is a simple load) — confirm upstream.
 16095      MVT MemVT = VT.getHalfNumVectorElementsVT();
 16096      unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
    // NOTE(review): line 16097 missing — presumably obtains the LoadSDNode
    // `Ld` passed to getBROADCAST_LOAD below — confirm upstream.
 16098      if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
 16099                                             VT, MemVT, Ld, Ofs, DAG))
 16100        return BcstLd;
 16101    }
 16102
 16103    // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
 16104    if (Subtarget.hasAVX2())
 16105      return SDValue();
 16106  }
 16107
 16108  bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
 16109
    // Widen the mask to 128-bit half selections (2 elements); bail if the
    // shuffle can't be expressed as whole-half moves.
 16110  SmallVector<int, 4> WidenedMask;
 16111  if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
 16112    return SDValue();
 16113
    // Zeroable bits 0-1 cover the low 128-bit half of the result, bits 2-3
    // the high half.
 16114  bool IsLowZero = (Zeroable & 0x3) == 0x3;
 16115  bool IsHighZero = (Zeroable & 0xc) == 0xc;
 16116
 16117  // Try to use an insert into a zero vector.
 16118  if (WidenedMask[0] == 0 && IsHighZero) {
 16119    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
 16120    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
 16121                              DAG.getVectorIdxConstant(0, DL));
 16122    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
 16123                       getZeroVector(VT, Subtarget, DAG, DL), LoV,
 16124                       DAG.getVectorIdxConstant(0, DL));
 16125  }
 16126
 16127  // TODO: If minimizing size and one of the inputs is a zero vector and the
 16128  // the zero vector has only one use, we could use a VPERM2X128 to save the
 16129  // instruction bytes needed to explicitly generate the zero vector.
 16130
 16131  // Blends are faster and handle all the non-lane-crossing cases.
 16132  if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
 16133                                          Subtarget, DAG))
 16134    return Blend;
 16135
 16136  // If either input operand is a zero vector, use VPERM2X128 because its mask
 16137  // allows us to replace the zero input with an implicit zero.
 16138  if (!IsLowZero && !IsHighZero) {
 16139    // Check for patterns which can be matched with a single insert of a 128-bit
 16140    // subvector.
 16141    bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
 16142    if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
 16143
 16144      // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
 16145      // this will likely become vinsertf128 which can't fold a 256-bit memop.
      // NOTE(review): line 16146 missing — presumably the subtarget check
      // guarding this insert path — confirm upstream.
 16147        MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
 16148        SDValue SubVec =
 16149            DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
 16150                        DAG.getVectorIdxConstant(0, DL));
 16151        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
 16152                           DAG.getVectorIdxConstant(2, DL));
 16153      }
 16154    }
 16155
 16156    // Try to use SHUF128 if possible.
 16157    if (Subtarget.hasVLX()) {
 16158      if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
        // Low half from V1 (index 0/1), high half from V2 (index 2/3):
        // encode each selection's parity into the SHUF128 immediate.
 16159        unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
 16160                            ((WidenedMask[1] % 2) << 1);
 16161        return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
 16162                           DAG.getTargetConstant(PermMask, DL, MVT::i8));
 16163      }
 16164    }
 16165  }
 16166
 16167  // Otherwise form a 128-bit permutation. After accounting for undefs,
 16168  // convert the 64-bit shuffle mask selection values into 128-bit
 16169  // selection bits by dividing the indexes by 2 and shifting into positions
 16170  // defined by a vperm2*128 instruction's immediate control byte.
 16171
 16172  // The immediate permute control byte looks like this:
 16173  //    [1:0] - select 128 bits from sources for low half of destination
 16174  //    [2]   - ignore
 16175  //    [3]   - zero low half of destination
 16176  //    [5:4] - select 128 bits from sources for high half of destination
 16177  //    [6]   - ignore
 16178  //    [7]   - zero high half of destination
 16179
 16180  assert((WidenedMask[0] >= 0 || IsLowZero) &&
 16181         (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
 16182
 16183  unsigned PermMask = 0;
 16184  PermMask |= IsLowZero  ? 0x08 : (WidenedMask[0] << 0);
 16185  PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
 16186
 16187  // Check the immediate mask and replace unused sources with undef.
    // If both destination halves are zeroed or select from V2, V1 is unused
    // (and vice versa) — undef the dead operand to aid later combines.
 16188  if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
 16189    V1 = DAG.getUNDEF(VT);
 16190  if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
 16191    V2 = DAG.getUNDEF(VT);
 16192
 16193  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
 16194                     DAG.getTargetConstant(PermMask, DL, MVT::i8));
 16195}
16196
16197/// Lower a vector shuffle by first fixing the 128-bit lanes and then
16198/// shuffling each lane.
16199///
16200/// This attempts to create a repeated lane shuffle where each lane uses one
16201/// or two of the lanes of the inputs. The lanes of the input vectors are
16202/// shuffled in one or two independent shuffles to get the lanes into the
16203/// position needed by the final shuffle.
///
/// Returns SDValue() when no common per-lane repeat mask can be found.
/// NOTE(review): this scraped view drops original line 16204 (the function
/// signature line) — verify against upstream X86ISelLowering.cpp.
 16205    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
 16206    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
 16207  // This is only useful for binary shuffle with a non-repeating mask.
 16208  if (V2.isUndef() || is128BitLaneRepeatedShuffleMask(VT, Mask))
 16209    return SDValue();
 16210
 16211  int NumElts = Mask.size();
 16212  int NumLanes = VT.getSizeInBits() / 128;
 16213  int NumLaneElts = 128 / VT.getScalarSizeInBits();
    // RepeatMask: the single per-lane mask every lane must agree on.
    // LaneSrcs[Lane][0/1]: which input lanes feed operand 0/1 of that lane.
 16214  SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
 16215  SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
 16216
 16217  // First pass will try to fill in the RepeatMask from lanes that need two
 16218  // sources.
 16219  for (int Lane = 0; Lane != NumLanes; ++Lane) {
 16220    int Srcs[2] = {-1, -1};
 16221    SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
 16222    for (int i = 0; i != NumLaneElts; ++i) {
 16223      int M = Mask[(Lane * NumLaneElts) + i];
 16224      if (M < 0)
 16225        continue;
 16226      // Determine which of the possible input lanes (NumLanes from each source)
 16227      // this element comes from. Assign that as one of the sources for this
 16228      // lane. We can assign up to 2 sources for this lane. If we run out
 16229      // sources we can't do anything.
 16230      int LaneSrc = M / NumLaneElts;
 16231      int Src;
 16232      if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
 16233        Src = 0;
 16234      else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
 16235        Src = 1;
 16236      else
 16237        return SDValue();
 16238
 16239      Srcs[Src] = LaneSrc;
      // Normalize to an in-lane index; elements from the second assigned
      // source are offset by NumElts (i.e. reference the shuffle's operand 1).
 16240      InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
 16241    }
 16242
 16243    // If this lane has two sources, see if it fits with the repeat mask so far.
 16244    if (Srcs[1] < 0)
 16245      continue;
 16246
 16247    LaneSrcs[Lane][0] = Srcs[0];
 16248    LaneSrcs[Lane][1] = Srcs[1];
 16249
    // True if the two masks agree on every element both of them define.
 16250    auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
 16251      assert(M1.size() == M2.size() && "Unexpected mask size");
 16252      for (int i = 0, e = M1.size(); i != e; ++i)
 16253        if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
 16254          return false;
 16255      return true;
 16256    };
 16257
    // Copy every defined element of Mask into MergedMask (must not conflict).
 16258    auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
 16259      assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
 16260      for (int i = 0, e = MergedMask.size(); i != e; ++i) {
 16261        int M = Mask[i];
 16262        if (M < 0)
 16263          continue;
 16264        assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
 16265               "Unexpected mask element");
 16266        MergedMask[i] = M;
 16267      }
 16268    };
 16269
 16270    if (MatchMasks(InLaneMask, RepeatMask)) {
 16271      // Merge this lane mask into the final repeat mask.
 16272      MergeMasks(InLaneMask, RepeatMask);
 16273      continue;
 16274    }
 16275
 16276    // Didn't find a match. Swap the operands and try again.
 16277    std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
    // NOTE(review): line 16278 is missing from this view — presumably the
    // statement that commutes InLaneMask to match the swapped sources
    // (swapping which elements reference operand 0 vs 1) — confirm upstream.
 16279
 16280    if (MatchMasks(InLaneMask, RepeatMask)) {
 16281      // Merge this lane mask into the final repeat mask.
 16282      MergeMasks(InLaneMask, RepeatMask);
 16283      continue;
 16284    }
 16285
 16286    // Couldn't find a match with the operands in either order.
 16287    return SDValue();
 16288  }
 16289
 16290  // Now handle any lanes with only one source.
 16291  for (int Lane = 0; Lane != NumLanes; ++Lane) {
 16292    // If this lane has already been processed, skip it.
 16293    if (LaneSrcs[Lane][0] >= 0)
 16294      continue;
 16295
 16296    for (int i = 0; i != NumLaneElts; ++i) {
 16297      int M = Mask[(Lane * NumLaneElts) + i];
 16298      if (M < 0)
 16299        continue;
 16300
 16301      // If RepeatMask isn't defined yet we can define it ourself.
 16302      if (RepeatMask[i] < 0)
 16303        RepeatMask[i] = M % NumLaneElts;
 16304
      // RepeatMask entries < NumElts refer to operand 0 of the final
      // shuffle; entries >= NumElts refer to operand 1.
 16305      if (RepeatMask[i] < NumElts) {
 16306        if (RepeatMask[i] != M % NumLaneElts)
 16307          return SDValue();
 16308        LaneSrcs[Lane][0] = M / NumLaneElts;
 16309      } else {
 16310        if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
 16311          return SDValue();
 16312        LaneSrcs[Lane][1] = M / NumLaneElts;
 16313      }
 16314    }
 16315
 16316    if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
 16317      return SDValue();
 16318  }
 16319
    // Build the first lane-permuting shuffle: move each lane's source-0
    // input lane into position.
 16320  SmallVector<int, 16> NewMask(NumElts, -1);
 16321  for (int Lane = 0; Lane != NumLanes; ++Lane) {
 16322    int Src = LaneSrcs[Lane][0];
 16323    for (int i = 0; i != NumLaneElts; ++i) {
 16324      int M = -1;
 16325      if (Src >= 0)
 16326        M = Src * NumLaneElts + i;
 16327      NewMask[Lane * NumLaneElts + i] = M;
 16328    }
 16329  }
 16330  SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
 16331  // Ensure we didn't get back the shuffle we started with.
 16332  // FIXME: This is a hack to make up for some splat handling code in
 16333  // getVectorShuffle.
 16334  if (isa<ShuffleVectorSDNode>(NewV1) &&
 16335      cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
 16336    return SDValue();
 16337
    // Second lane-permuting shuffle for each lane's source-1 input lane.
 16338  for (int Lane = 0; Lane != NumLanes; ++Lane) {
 16339    int Src = LaneSrcs[Lane][1];
 16340    for (int i = 0; i != NumLaneElts; ++i) {
 16341      int M = -1;
 16342      if (Src >= 0)
 16343        M = Src * NumLaneElts + i;
 16344      NewMask[Lane * NumLaneElts + i] = M;
 16345    }
 16346  }
 16347  SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
 16348  // Ensure we didn't get back the shuffle we started with.
 16349  // FIXME: This is a hack to make up for some splat handling code in
 16350  // getVectorShuffle.
 16351  if (isa<ShuffleVectorSDNode>(NewV2) &&
 16352      cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
 16353    return SDValue();
 16354
    // Finally, apply the repeated in-lane mask to the two permuted inputs,
    // rebasing each entry into its destination lane.
 16355  for (int i = 0; i != NumElts; ++i) {
 16356    if (Mask[i] < 0) {
 16357      NewMask[i] = -1;
 16358      continue;
 16359    }
 16360    NewMask[i] = RepeatMask[i % NumLaneElts];
 16361    if (NewMask[i] < 0)
 16362      continue;
 16363
 16364    NewMask[i] += (i / NumLaneElts) * NumLaneElts;
 16365  }
 16366  return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
 16367}
16368
16369/// If the input shuffle mask results in a vector that is undefined in all upper
16370/// or lower half elements and that mask accesses only 2 halves of the
16371/// shuffle's operands, return true. A mask of half the width with mask indexes
16372/// adjusted to access the extracted halves of the original shuffle operands is
16373/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
16374/// lower half of each input operand is accessed.
///
/// Half indices are encoded 0 = lower V1, 1 = upper V1, 2 = lower V2,
/// 3 = upper V2; -1 means that half operand is unused.
16375static bool
// NOTE(review): this scraped view drops original line 16376 (the line with
// the function name and first parameters) — verify against upstream
// X86ISelLowering.cpp.
 16377                    int &HalfIdx1, int &HalfIdx2) {
 16378  assert((Mask.size() == HalfMask.size() * 2) &&
 16379         "Expected input mask to be twice as long as output");
 16380
 16381  // Exactly one half of the result must be undef to allow narrowing.
 16382  bool UndefLower = isUndefLowerHalf(Mask);
 16383  bool UndefUpper = isUndefUpperHalf(Mask);
 16384  if (UndefLower == UndefUpper)
 16385    return false;
 16386
    // Only examine the defined half of the mask.
 16387  unsigned HalfNumElts = HalfMask.size();
 16388  unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
 16389  HalfIdx1 = -1;
 16390  HalfIdx2 = -1;
 16391  for (unsigned i = 0; i != HalfNumElts; ++i) {
 16392    int M = Mask[i + MaskIndexOffset];
 16393    if (M < 0) {
 16394      HalfMask[i] = M;
 16395      continue;
 16396    }
 16397
 16398    // Determine which of the 4 half vectors this element is from.
 16399    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
 16400    int HalfIdx = M / HalfNumElts;
 16401
 16402    // Determine the element index into its half vector source.
 16403    int HalfElt = M % HalfNumElts;
 16404
 16405    // We can shuffle with up to 2 half vectors, set the new 'half'
 16406    // shuffle mask accordingly.
 16407    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
 16408      HalfMask[i] = HalfElt;
 16409      HalfIdx1 = HalfIdx;
 16410      continue;
 16411    }
    // Second distinct half: its elements are addressed past the first
    // half's range in the narrowed mask.
 16412    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
 16413      HalfMask[i] = HalfElt + HalfNumElts;
 16414      HalfIdx2 = HalfIdx;
 16415      continue;
 16416    }
 16417
 16418    // Too many half vectors referenced.
 16419    return false;
 16420  }
 16421
 16422  return true;
 16423}
16424
16425/// Given the output values from getHalfShuffleMask(), create a half width
16426/// shuffle of extracted vectors followed by an insert back to full width.
///
/// HalfIdx1/HalfIdx2 use the 0..3 encoding from getHalfShuffleMask (-1 =>
/// undef). With UseConcat the result is built with CONCAT_VECTORS instead
/// of INSERT_SUBVECTOR into undef.
/// NOTE(review): this scraped view drops original line 16427 (the function
/// signature line) — verify against upstream X86ISelLowering.cpp.
 16428                                     ArrayRef<int> HalfMask, int HalfIdx1,
 16429                                     int HalfIdx2, bool UndefLower,
 16430                                     SelectionDAG &DAG, bool UseConcat = false) {
 16431  assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
 16432  assert(V1.getValueType().isSimple() && "Expecting only simple types");
 16433
 16434  MVT VT = V1.getSimpleValueType();
 16435  MVT HalfVT = VT.getHalfNumVectorElementsVT();
 16436  unsigned HalfNumElts = HalfVT.getVectorNumElements();
 16437
    // Extract the encoded half: indices 0/1 come from V1, 2/3 from V2;
    // odd indices select the upper half.
 16438  auto getHalfVector = [&](int HalfIdx) {
 16439    if (HalfIdx < 0)
 16440      return DAG.getUNDEF(HalfVT);
 16441    SDValue V = (HalfIdx < 2 ? V1 : V2);
 16442    HalfIdx = (HalfIdx % 2) * HalfNumElts;
 16443    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
 16444                       DAG.getVectorIdxConstant(HalfIdx, DL));
 16445  };
 16446
 16447  // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
 16448  SDValue Half1 = getHalfVector(HalfIdx1);
 16449  SDValue Half2 = getHalfVector(HalfIdx2);
 16450  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
 16451  if (UseConcat) {
 16452    SDValue Op0 = V;
 16453    SDValue Op1 = DAG.getUNDEF(HalfVT);
 16454    if (UndefLower)
 16455      std::swap(Op0, Op1);
 16456    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
 16457  }
 16458
    // Insert the shuffled half into the non-undef half of the full vector.
 16459  unsigned Offset = UndefLower ? HalfNumElts : 0;
 16460  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
    // NOTE(review): line 16461 is missing from this view — presumably
    // `DAG.getVectorIdxConstant(Offset, DL));` closing the call — confirm
    // against upstream X86ISelLowering.cpp.
 16462}
16463
16464/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
16465/// This allows for fast cases such as subvector extraction/insertion
16466/// or shuffling smaller vector types which can lower more efficiently.
///
/// Returns SDValue() when keeping the full-width shuffle is expected to be
/// cheaper than extract + narrow shuffle + insert on this subtarget.
/// NOTE(review): this scraped view drops original line 16467 (the function
/// signature line) — verify against upstream X86ISelLowering.cpp.
 16468                                           SDValue V2, ArrayRef<int> Mask,
 16469                                           const X86Subtarget &Subtarget,
 16470                                           SelectionDAG &DAG) {
 16471  assert((VT.is256BitVector() || VT.is512BitVector()) &&
 16472         "Expected 256-bit or 512-bit vector");
 16473
 16474  bool UndefLower = isUndefLowerHalf(Mask);
 16475  if (!UndefLower && !isUndefUpperHalf(Mask))
 16476    return SDValue();
 16477
 16478  assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
 16479         "Completely undef shuffle mask should have been simplified already");
 16480
 16481  // Upper half is undef and lower half is whole upper subvector.
 16482  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
 16483  MVT HalfVT = VT.getHalfNumVectorElementsVT();
 16484  unsigned HalfNumElts = HalfVT.getVectorNumElements();
 16485  if (!UndefLower &&
 16486      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
 16487    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
 16488                             DAG.getVectorIdxConstant(HalfNumElts, DL));
 16489    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
 16490                       DAG.getVectorIdxConstant(0, DL));
 16491  }
 16492
 16493  // Lower half is undef and upper half is whole lower subvector.
 16494  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
 16495  if (UndefLower &&
 16496      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
 16497    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
 16498                             DAG.getVectorIdxConstant(0, DL));
 16499    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
 16500                       DAG.getVectorIdxConstant(HalfNumElts, DL));
 16501  }
 16502
    // General case: narrow to a half-width shuffle of at most 2 operand
    // halves (encoding: 0=lo V1, 1=hi V1, 2=lo V2, 3=hi V2).
 16503  int HalfIdx1, HalfIdx2;
 16504  SmallVector<int, 8> HalfMask(HalfNumElts);
 16505  if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
 16506    return SDValue();
 16507
 16508  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
 16509
 16510  // Only shuffle the halves of the inputs when useful.
 16511  unsigned NumLowerHalves =
 16512      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
 16513  unsigned NumUpperHalves =
 16514      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
 16515  assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
 16516
 16517  // Determine the larger pattern of undef/halves, then decide if it's worth
 16518  // splitting the shuffle based on subtarget capabilities and types.
 16519  unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
 16520  if (!UndefLower) {
 16521    // XXXXuuuu: no insert is needed.
 16522    // Always extract lowers when setting lower - these are all free subreg ops.
 16523    if (NumUpperHalves == 0)
 16524      return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
 16525                                   UndefLower, DAG);
 16526
 16527    if (NumUpperHalves == 1) {
 16528      // AVX2 has efficient 32/64-bit element cross-lane shuffles.
 16529      if (Subtarget.hasAVX2()) {
 16530        // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
 16531        if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
 16532            !is128BitUnpackShuffleMask(HalfMask, DAG) &&
 16533            (!isSingleSHUFPSMask(HalfMask) ||
 16534             Subtarget.hasFastVariableCrossLaneShuffle()))
 16535          return SDValue();
 16536        // If this is an unary shuffle (assume that the 2nd operand is
 16537        // canonicalized to undef), then we can use vpermpd. Otherwise, we
 16538        // are better off extracting the upper half of 1 operand and using a
 16539        // narrow shuffle.
 16540        if (EltWidth == 64 && V2.isUndef())
 16541          return SDValue();
 16542        // If this is an unary vXi8 shuffle with inplace halves, then perform as
 16543        // full width pshufb, and then merge.
 16544        if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
 16545          return SDValue();
 16546      }
 16547      // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
 16548      if (Subtarget.hasAVX512() && VT.is512BitVector())
 16549        return SDValue();
 16550      // Extract + narrow shuffle is better than the wide alternative.
 16551      return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
 16552                                   UndefLower, DAG);
 16553    }
 16554
 16555    // Don't extract both uppers, instead shuffle and then extract.
 16556    assert(NumUpperHalves == 2 && "Half vector count went wrong");
 16557    return SDValue();
 16558  }
 16559
 16560  // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
 16561  if (NumUpperHalves == 0) {
 16562    // AVX2 has efficient 64-bit element cross-lane shuffles.
 16563    // TODO: Refine to account for unary shuffle, splat, and other masks?
 16564    if (Subtarget.hasAVX2() && EltWidth == 64)
 16565      return SDValue();
 16566    // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
 16567    if (Subtarget.hasAVX512() && VT.is512BitVector())
 16568      return SDValue();
 16569    // Narrow shuffle + insert is better than the wide alternative.
 16570    return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
 16571                                 UndefLower, DAG);
 16572  }
 16573
 16574  // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
 16575  return SDValue();
 16576}
16577
16578/// Handle case where shuffle sources are coming from the same 128-bit lane and
16579/// every lane can be represented as the same repeating mask - allowing us to
16580/// shuffle the sources with the repeating shuffle and then permute the result
16581/// to the destination lanes.
///
/// Also handles the AVX2 shuffle-lowest-then-broadcast special case first.
/// Returns SDValue() when no repeated sub-lane decomposition is found.
/// NOTE(review): this scraped view drops original line 16582 (the function
/// signature line) — verify against upstream X86ISelLowering.cpp.
 16583    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
 16584    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
 16585  int NumElts = VT.getVectorNumElements();
 16586  int NumLanes = VT.getSizeInBits() / 128;
 16587  int NumLaneElts = NumElts / NumLanes;
 16588
 16589  // On AVX2 we may be able to just shuffle the lowest elements and then
 16590  // broadcast the result.
 16591  if (Subtarget.hasAVX2()) {
 16592    for (unsigned BroadcastSize : {16, 32, 64}) {
      // Only broadcast groups wider than a single scalar element.
 16593      if (BroadcastSize <= VT.getScalarSizeInBits())
 16594        continue;
 16595      int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
 16596
 16597      // Attempt to match a repeating pattern every NumBroadcastElts,
 16598      // accounting for UNDEFs but only references the lowest 128-bit
 16599      // lane of the inputs.
 16600      auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
 16601        for (int i = 0; i != NumElts; i += NumBroadcastElts)
 16602          for (int j = 0; j != NumBroadcastElts; ++j) {
 16603            int M = Mask[i + j];
 16604            if (M < 0)
 16605              continue;
 16606            int &R = RepeatMask[j];
            // Element must come from the lowest 128-bit lane of its input.
 16607            if (0 != ((M % NumElts) / NumLaneElts))
 16608              return false;
 16609            if (0 <= R && R != M)
 16610              return false;
 16611            R = M;
 16612          }
 16613        return true;
 16614      };
 16615
 16616      SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
 16617      if (!FindRepeatingBroadcastMask(RepeatMask))
 16618        continue;
 16619
 16620      // Shuffle the (lowest) repeated elements in place for broadcast.
 16621      SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
 16622
 16623      // Shuffle the actual broadcast.
 16624      SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
 16625      for (int i = 0; i != NumElts; i += NumBroadcastElts)
 16626        for (int j = 0; j != NumBroadcastElts; ++j)
 16627          BroadcastMask[i + j] = j;
 16628
 16629      // Avoid returning the same shuffle operation. For example,
 16630      // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
 16631      if (BroadcastMask == Mask)
 16632        return SDValue();
 16633
 16634      return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
 16635                                  BroadcastMask);
 16636    }
 16637  }
 16638
 16639  // Bail if the shuffle mask doesn't cross 128-bit lanes.
 16640  if (!is128BitLaneCrossingShuffleMask(VT, Mask))
 16641    return SDValue();
 16642
 16643  // Bail if we already have a repeated lane shuffle mask.
 16644  if (is128BitLaneRepeatedShuffleMask(VT, Mask))
 16645    return SDValue();
 16646
 16647  // Helper to look for repeated mask in each split sublane, and that those
 16648  // sublanes can then be permuted into place.
 16649  auto ShuffleSubLanes = [&](int SubLaneScale) {
 16650    int NumSubLanes = NumLanes * SubLaneScale;
 16651    int NumSubLaneElts = NumLaneElts / SubLaneScale;
 16652
 16653    // Check that all the sources are coming from the same lane and see if we
 16654    // can form a repeating shuffle mask (local to each sub-lane). At the same
 16655    // time, determine the source sub-lane for each destination sub-lane.
 16656    int TopSrcSubLane = -1;
 16657    SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
 16658    SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
 16659        SubLaneScale,
 16660        SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
 16661
 16662    for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
 16663      // Extract the sub-lane mask, check that it all comes from the same lane
 16664      // and normalize the mask entries to come from the first lane.
 16665      int SrcLane = -1;
 16666      SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
 16667      for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
 16668        int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
 16669        if (M < 0)
 16670          continue;
 16671        int Lane = (M % NumElts) / NumLaneElts;
 16672        if ((0 <= SrcLane) && (SrcLane != Lane))
 16673          return SDValue();
 16674        SrcLane = Lane;
        // Keep the operand selection (V1 vs V2) via the NumElts offset.
 16675        int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
 16676        SubLaneMask[Elt] = LocalM;
 16677      }
 16678
 16679      // Whole sub-lane is UNDEF.
 16680      if (SrcLane < 0)
 16681        continue;
 16682
 16683      // Attempt to match against the candidate repeated sub-lane masks.
 16684      for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
        // Masks match when they agree on every mutually-defined element.
 16685        auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
 16686          for (int i = 0; i != NumSubLaneElts; ++i) {
 16687            if (M1[i] < 0 || M2[i] < 0)
 16688              continue;
 16689            if (M1[i] != M2[i])
 16690              return false;
 16691          }
 16692          return true;
 16693        };
 16694
 16695        auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
 16696        if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
 16697          continue;
 16698
 16699        // Merge the sub-lane mask into the matching repeated sub-lane mask.
 16700        for (int i = 0; i != NumSubLaneElts; ++i) {
 16701          int M = SubLaneMask[i];
 16702          if (M < 0)
 16703            continue;
 16704          assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
 16705                 "Unexpected mask element");
 16706          RepeatedSubLaneMask[i] = M;
 16707        }
 16708
 16709        // Track the top most source sub-lane - by setting the remaining to
 16710        // UNDEF we can greatly simplify shuffle matching.
 16711        int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
 16712        TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
 16713        Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
 16714        break;
 16715      }
 16716
 16717      // Bail if we failed to find a matching repeated sub-lane mask.
 16718      if (Dst2SrcSubLanes[DstSubLane] < 0)
 16719        return SDValue();
 16720    }
 16721    assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
 16722           "Unexpected source lane");
 16723
 16724    // Create a repeating shuffle mask for the entire vector.
 16725    SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
 16726    for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
 16727      int Lane = SubLane / SubLaneScale;
 16728      auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
 16729      for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
 16730        int M = RepeatedSubLaneMask[Elt];
 16731        if (M < 0)
 16732          continue;
 16733        int Idx = (SubLane * NumSubLaneElts) + Elt;
 16734        RepeatedMask[Idx] = M + (Lane * NumLaneElts);
 16735      }
 16736    }
 16737
 16738    // Shuffle each source sub-lane to its destination.
 16739    SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
 16740    for (int i = 0; i != NumElts; i += NumSubLaneElts) {
 16741      int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
 16742      if (SrcSubLane < 0)
 16743        continue;
 16744      for (int j = 0; j != NumSubLaneElts; ++j)
 16745        SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
 16746    }
 16747
 16748    // Avoid returning the same shuffle operation.
 16749    // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
 16750    if (RepeatedMask == Mask || SubLaneMask == Mask)
 16751      return SDValue();
 16752
 16753    SDValue RepeatedShuffle =
 16754        DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
 16755
 16756    return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
 16757                                SubLaneMask);
 16758  };
 16759
 16760  // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
 16761  // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
 16762  // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
 16763  // Otherwise we can only permute whole 128-bit lanes.
 16764  int MinSubLaneScale = 1, MaxSubLaneScale = 1;
 16765  if (Subtarget.hasAVX2() && VT.is256BitVector()) {
 16766    bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
 16767    MinSubLaneScale = 2;
 16768    MaxSubLaneScale =
 16769        (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
 16770  }
 16771  if (Subtarget.hasBWI() && VT == MVT::v64i8)
 16772    MinSubLaneScale = MaxSubLaneScale = 4;
 16773
 16774  for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
 16775    if (SDValue Shuffle = ShuffleSubLanes(Scale))
 16776      return Shuffle;
 16777
 16778  return SDValue();
 16779}
16780
// Match a 64-bit element shuffle against the SHUFPD/VSHUFPD pattern (each
// destination element selects one of the two candidates for its position),
// optionally commuting the operands and forcing zeroable operands to zero.
// On success returns true and fills ForceV1Zero/ForceV2Zero/ShuffleImm.
// NOTE(review): this scraped view drops original line 16781 (the start of
// the function signature) — verify against upstream X86ISelLowering.cpp.
 16782                                    bool &ForceV1Zero, bool &ForceV2Zero,
 16783                                    unsigned &ShuffleImm, ArrayRef<int> Mask,
 16784                                    const APInt &Zeroable) {
 16785  int NumElts = VT.getVectorNumElements();
 16786  assert(VT.getScalarSizeInBits() == 64 &&
 16787         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
 16788         "Unexpected data type for VSHUFPD");
 16789  assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
 16790         "Illegal shuffle mask");
 16791
    // ZeroLane[0]/[1]: every even/odd destination element is zeroable, i.e.
    // the whole corresponding operand can be replaced by a zero vector.
 16792  bool ZeroLane[2] = { true, true };
 16793  for (int i = 0; i < NumElts; ++i)
 16794    ZeroLane[i & 1] &= Zeroable[i];
 16795
 16796  // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
 16797  // Mask for V4F64; 0/1,  4/5,  2/3,  6/7..
 16798  bool IsSHUFPD = true;
 16799  bool IsCommutable = true;
 16800  SmallVector<int, 8> SHUFPDMask(NumElts, -1);
 16801  for (int i = 0; i < NumElts; ++i) {
 16802    if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
 16803      continue;
 16804    if (Mask[i] < 0)
 16805      return false;
      // Val/CommutVal: the two legal source indices for element i with the
      // operands in original vs commuted order.
 16806    int Val = (i & 6) + NumElts * (i & 1);
 16807    int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
 16808    if (Mask[i] < Val || Mask[i] > Val + 1)
 16809      IsSHUFPD = false;
 16810    if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
 16811      IsCommutable = false;
      // The immediate only records low/high selection per element.
 16812    SHUFPDMask[i] = Mask[i] % 2;
 16813  }
 16814
 16815  if (!IsSHUFPD && !IsCommutable)
 16816    return false;
 16817
 16818  if (!IsSHUFPD && IsCommutable)
 16819    std::swap(V1, V2);
 16820
 16821  ForceV1Zero = ZeroLane[0];
 16822  ForceV2Zero = ZeroLane[1];
 16823  ShuffleImm = getSHUFPDImm(SHUFPDMask);
 16824  return true;
 16825}
16826
// Lower a v2f64/v4f64/v8f64 shuffle to X86ISD::SHUFP when the mask matches
// the SHUFPD pattern (see matchShuffleWithSHUFPD), materializing real zero
// vectors for zeroable operands. Returns SDValue() on no match.
// NOTE(review): this scraped view drops original line 16827 (the start of
// the function signature) — verify against upstream X86ISelLowering.cpp.
 16828                                      SDValue V2, ArrayRef<int> Mask,
 16829                                      const APInt &Zeroable,
 16830                                      const X86Subtarget &Subtarget,
 16831                                      SelectionDAG &DAG) {
 16832  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
 16833         "Unexpected data type for VSHUFPD");
 16834
 16835  unsigned Immediate = 0;
 16836  bool ForceV1Zero = false, ForceV2Zero = false;
 16837  if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
 16838                              Mask, Zeroable))
 16839    return SDValue();
 16840
 16841  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
 16842  if (ForceV1Zero)
 16843    V1 = getZeroVector(VT, Subtarget, DAG, DL);
 16844  if (ForceV2Zero)
 16845    V2 = getZeroVector(VT, Subtarget, DAG, DL);
 16846
 16847  return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
 16848                     DAG.getTargetConstant(Immediate, DL, MVT::i8));
 16849}
16850
16851// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
16852// by zeroable elements in the remaining 24 elements. Turn this into two
16853// vmovqb instructions shuffled together.
//
// I.e. each input contributes the low byte of each of its four 64-bit
// elements; the two truncated results are interleaved into the low 16 bytes
// and the rest of the v32i8 result is zero.
// NOTE(review): this scraped view drops original line 16854 (the function
// signature line) — verify against upstream X86ISelLowering.cpp.
 16855                                             SDValue V1, SDValue V2,
 16856                                             ArrayRef<int> Mask,
 16857                                             const APInt &Zeroable,
 16858                                             SelectionDAG &DAG) {
 16859  assert(VT == MVT::v32i8 && "Unexpected type!");
 16860
 16861  // The first 8 indices should be every 8th element.
 16862  if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
 16863    return SDValue();
 16864
 16865  // Remaining elements need to be zeroable.
    // countl_one() counts from the high (upper-element) end of Zeroable.
 16866  if (Zeroable.countl_one() < (Mask.size() - 8))
 16867    return SDValue();
 16868
 16869  V1 = DAG.getBitcast(MVT::v4i64, V1);
 16870  V2 = DAG.getBitcast(MVT::v4i64, V2);
 16871
    // Truncate each 64-bit element to 8 bits (vmovqb-style).
 16872  V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
 16873  V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
 16874
 16875  // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
 16876  // the upper bits of the result using an unpckldq.
 16877  SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
 16878                                        { 0,  1,  2,  3, 16, 17, 18, 19,
 16879                                          4,  5,  6,  7, 20, 21, 22, 23});
 16880  // Insert the unpckldq into a zero vector to widen to v32i8.
 16881  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
 16882                     DAG.getConstant(0, DL, MVT::v32i8), Unpack,
 16883                     DAG.getVectorIdxConstant(0, DL));
 16884}
16885
// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
// =>
// ul = unpckl v1, v2
// uh = unpckh v1, v2
// a = vperm ul, uh
// b = vperm ul, uh
//
// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
// and permute. We cannot directly match v3 because it is split into two
// 256-bit vectors in earlier isel stages. Therefore, this function matches a
// pair of 256-bit shuffles and makes sure the masks are consecutive.
//
// Once unpck and permute nodes are created, the permute corresponding to this
// shuffle is returned, while the other permute replaces the other half of the
// shuffle in the selection dag.
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             SelectionDAG &DAG) {
  // Only the 256-bit types covered by UNPCKL/UNPCKH + VPERM2X128 are handled.
  if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
      VT != MVT::v32i8)
    return SDValue();
  // <B0, B1, B0+1, B1+1, ..., >
  // Returns true when Mask interleaves two runs of consecutive elements, the
  // even positions starting at Begin0 and the odd positions at Begin1.
  auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
                                   unsigned Begin1) {
    size_t Size = Mask.size();
    assert(Size % 2 == 0 && "Expected even mask size");
    for (unsigned I = 0; I < Size; I += 2) {
      if (Mask[I] != (int)(Begin0 + I / 2) ||
          Mask[I + 1] != (int)(Begin1 + I / 2))
        return false;
    }
    return true;
  };
  // Check which half is this shuffle node
  int NumElts = VT.getVectorNumElements();
  size_t FirstQtr = NumElts / 2;
  size_t ThirdQtr = NumElts + NumElts / 2;
  bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
  bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
  if (!IsFirstHalf && !IsSecondHalf)
    return SDValue();

  // Find the intersection between shuffle users of V1 and V2.
  SmallVector<SDNode *, 2> Shuffles;
  for (SDNode *User : V1->users())
    if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
        User->getOperand(1) == V2)
      Shuffles.push_back(User);
  // Limit user size to two for now.
  if (Shuffles.size() != 2)
    return SDValue();
  // Find out which half of the 512-bit shuffles is each smaller shuffle
  auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
  auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
  SDNode *FirstHalf;
  SDNode *SecondHalf;
  if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
      IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
    FirstHalf = Shuffles[0];
    SecondHalf = Shuffles[1];
  } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
             IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
    FirstHalf = Shuffles[1];
    SecondHalf = Shuffles[0];
  } else {
    return SDValue();
  }
  // Lower into unpck and perm. Return the perm of this shuffle and replace
  // the other.
  SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
  SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
  // VPERM2X128 imm 0x20 selects the low 128-bit halves of the two sources;
  // 0x31 selects the high halves.
  SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
                              DAG.getTargetConstant(0x20, DL, MVT::i8));
  SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
                              DAG.getTargetConstant(0x31, DL, MVT::i8));
  if (IsFirstHalf) {
    DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
    return Perm1;
  }
  DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
  return Perm2;
}
16970
/// Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
///
/// Strategies are tried roughly cheapest-first: whole-128-bit-lane shuffles,
/// broadcasts/duplicates, immediate permutes, then blends, unpacks, SHUFPD,
/// and finally the generic decomposition/split fallbacks.
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // First see if the shuffle is expressible as a shuffle of whole 128-bit
  // subvectors.
  if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
                                     Subtarget, DAG))
    return V;

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Broadcast;

    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation.
      // Build the VPERMILPD immediate: bit i is set when result element i
      // selects the high element of its 128-bit lane.
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
                         DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
    }

    // With AVX2 we have direct support for this permutation.
    if (Subtarget.hasAVX2())
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

    // Try to create an in-lane repeating shuffle mask and then shuffle the
    // results into the target lanes.
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
      return V;

    // Try to permute the lanes and then use a per-lane permute.
    if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
                                                        Mask, DAG, Subtarget))
      return V;

    // Otherwise, fall back.
    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
                                               DAG, Subtarget);
  }

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
    return V;

  if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Op;

  // Classify each input: "in place" means all its referenced elements already
  // sit at their destination positions; "splat" means the mask only ever reads
  // a single broadcastable element from it.
  bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
  bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
  bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
  bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);

  // If we have lane crossing shuffles AND they don't all come from the lower
  // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
  // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
  // canonicalize to a blend of splat which isn't necessary for this combine.
  if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
      !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
      (V1.getOpcode() != ISD::BUILD_VECTOR) &&
      (V2.getOpcode() != ISD::BUILD_VECTOR) &&
      (!Subtarget.hasAVX2() ||
       !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
    return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);

  // If we have one input in place, then we can permute the other input and
  // blend the result.
  if (V1IsInPlace || V2IsInPlace)
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG);

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either inputs are already in place,
  // we will be able to shuffle even across lanes the other input in a single
  // instruction so skip this pattern.
  if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
      return V;

  // If we have VLX support, we can use VEXPAND.
  if (Subtarget.hasVLX())
    if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
                                           Zeroable, Subtarget, DAG))
      return V;

  // If we have AVX2 then we always want to lower with a blend because at v4 we
  // can fully permute the elements.
  if (Subtarget.hasAVX2())
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG);

  // Otherwise fall back on generic lowering.
  return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
                                    Subtarget, DAG);
}
17093
/// Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");

  // First see if the shuffle is expressible as a shuffle of whole 128-bit
  // subvectors.
  if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
                                     Subtarget, DAG))
    return V;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  // Try to use shift instructions if fast.
  if (Subtarget.preferLowerShuffleAsShift())
    if (SDValue Shift =
            lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
                                Subtarget, DAG, /*BitwiseOnly*/ true))
      return Shift;

  if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
    // can use lower latency instructions that will operate on both lanes.
    SmallVector<int, 2> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
      // Expand the 2-element repeated mask into a 4-element PSHUFD mask (each
      // 64-bit element maps to a pair of 32-bit elements) and do the shuffle
      // in the 32-bit integer domain.
      SmallVector<int, 4> PSHUFDMask;
      narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
      return DAG.getBitcast(
          MVT::v4i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
                      DAG.getBitcast(MVT::v8i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }

    // AVX2 provides a direct instruction for permuting a single input across
    // lanes.
    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
                              DAG, /*BitwiseOnly*/ false))
    return Shift;

  // If we have VLX support, we can use VALIGN or VEXPAND.
  if (Subtarget.hasVLX()) {
    if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG))
      return Rotate;

    if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
                                           Zeroable, Subtarget, DAG))
      return V;
  }

  // Try to use PALIGNR.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
    return V;

  bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
  bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);

  // If we have one input in place, then we can permute the other input and
  // blend the result.
  if (V1IsInPlace || V2IsInPlace)
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG);

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
          DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Try to lower to PERMQ(BLENDD(V1,V2)).
  if (SDValue V =
          lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
    return V;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either inputs are already in place,
  // we will be able to shuffle even across lanes the other input in a single
  // instruction so skip this pattern.
  if (!V1IsInPlace && !V2IsInPlace)
            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
      return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG);
}
17206
/// Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  // Without AVX2 there are no cross-lane float permutes, so if the in-lane
  // part of the mask doesn't repeat, splitting into two 128-bit shuffles is
  // likely cheaper (only attempted for "simple" splits).
  if (!Subtarget.hasAVX2()) {
    SmallVector<int> InLaneMask;
    computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);

    if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
      if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
                                           /*SimpleOnly*/ true))
        return R;
  }
  // Zero-extension is matched in the integer domain and bitcast back.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
                                                   Zeroable, Subtarget, DAG))
    return DAG.getBitcast(MVT::v8f32, ZExt);

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 &&
           "Repeated masks must be half the mask width!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
    if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
      return V;

    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
    // have already handled any direct blends.
    return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
  }

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If we have a single input shuffle with different shuffle patterns in the
  // two 128-bit lanes use the variable mask to VPERMILPS.
  if (V2.isUndef()) {
    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
      SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
      return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
    }
    if (Subtarget.hasAVX2()) {
      SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
    }
    // Otherwise, fall back.
    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
                                               DAG, Subtarget);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // If we have VLX support, we can use VEXPAND.
  if (Subtarget.hasVLX())
    if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
                                           Zeroable, Subtarget, DAG))
      return V;

  // Try to match an interleave of two v8f32s and lower them as unpck and
  // permutes using ymms. This needs to go before we try to split the vectors.
  // Don't attempt on AVX1 if we're likely to split vectors anyway.
  if ((Subtarget.hasAVX2() ||
      !Subtarget.hasAVX512())
    if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
                                                      Mask, DAG))
      return V;

  // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
  // since after split we get a more efficient code using vpunpcklwd and
  // vpunpckhwd instrs than vblend.
  if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
    return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
                                      Subtarget, DAG);

  // If we have AVX2 then we always want to lower with a blend because at v8 we
  // can fully permute the elements.
  if (Subtarget.hasAVX2())
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG);

  // Otherwise fall back on generic lowering.
  return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
                                    Subtarget, DAG);
}
17329
/// Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

  // Number of mask elements that read from V2 (indices 8..15).
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
                                                   Zeroable, Subtarget, DAG))
    return ZExt;

  // Try to match an interleave of two v8i32s and lower them as unpck and
  // permutes using ymms. This needs to go before we try to split the vectors.
  if (!Subtarget.hasAVX512())
    if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
                                                      Mask, DAG))
      return V;

  // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
  // since after split we get a more efficient code than vblend by using
  // vpunpcklwd and vpunpckhwd instrs.
  if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
      !Subtarget.hasAVX512())
    return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
                                      Subtarget, DAG);

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  // Try to use shift instructions if fast.
  if (Subtarget.preferLowerShuffleAsShift()) {
    if (SDValue Shift =
            lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
                                Subtarget, DAG, /*BitwiseOnly*/ true))
      return Shift;
    // Bit rotation only applies to single-input shuffles.
    if (NumV2Elements == 0)
      if (SDValue Rotate =
              lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
        return Rotate;
  }

  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the two 128-bit
  // lanes.
  SmallVector<int, 4> RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
      return V;
  }

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
                              DAG, /*BitwiseOnly*/ false))
    return Shift;

  if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
    if (SDValue Rotate =
            lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
      return Rotate;

  // If we have VLX support, we can use VALIGN or EXPAND.
  if (Subtarget.hasVLX()) {
    if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG))
      return Rotate;

    if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
                                           Zeroable, Subtarget, DAG))
      return V;
  }

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (V2.isUndef()) {
    // Try to produce a fixed cross-128-bit lane permute followed by unpack
    // because that should be faster than the variable permute alternatives.
    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
      return V;

    // If the shuffle patterns aren't repeated but it's a single input, directly
    // generate a cross-lane VPERMD instruction.
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
  }

  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
    SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
                                            CastV1, CastV2, DAG);
    return DAG.getBitcast(MVT::v8i32, ShufPS);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG);
}
17471
/// Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
      DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V =
          lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Try to lower using a truncation.
  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
                                       Subtarget, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
                              Subtarget, DAG, /*BitwiseOnly*/ false))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
      DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (V2.isUndef()) {
    // Try to use bit rotation instructions.
    if (SDValue Rotate =
            lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
      return Rotate;

    // Try to produce a fixed cross-128-bit lane permute followed by unpack
    // because that should be faster than the variable permute alternatives.
    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
      return V;

    // There are no generalized cross-lane shuffle operations available on i16
    // element types.
    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
          DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
        return V;

      return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
                                                 DAG, Subtarget);
    }

    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // As this is a single-input shuffle, the repeated mask should be
      // a strictly valid v8i16 mask that we can pass through to the v8i16
      // lowering to handle even the v16 case.
          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
    }
  }

  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
                                              Zeroable, Subtarget, DAG))
    return PSHUFB;

  // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
  if (Subtarget.hasBWI())
    return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
      DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Try to permute the lanes and then use a per-lane permute.
      DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
    return V;

  // Try to match an interleave of two v16i16s and lower them as unpck and
  // permutes using ymms.
  if (!Subtarget.hasAVX512())
    if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
                                                      Mask, DAG))
      return V;

  // Otherwise fall back on generic lowering.
  return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
                                    Subtarget, DAG);
}
17594
/// Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
                                                   Zeroable, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V =
          lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Try to lower using a truncation.
  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
                                       Subtarget, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
                              DAG, /*BitwiseOnly*/ false))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Try to use bit rotation instructions.
  if (V2.isUndef())
    if (SDValue Rotate =
            lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
      return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
      DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return V;

  // There are no generalized cross-lane shuffle operations available on i8
  // element types.
  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
    // Try to produce a fixed cross-128-bit lane permute followed by unpack
    // because that should be faster than the variable permute alternatives.
    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
      return V;

        DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
      return V;

    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
                                               DAG, Subtarget);
  }

  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
                                              Zeroable, Subtarget, DAG))
    return PSHUFB;

  // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
  if (Subtarget.hasVBMI())
    return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
      DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Try to permute the lanes and then use a per-lane permute.
      DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
    return V;

  // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
  // by zeroable elements in the remaining 24 elements. Turn this into two
  // vmovqb instructions shuffled together.
  if (Subtarget.hasVLX())
    if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
                                                  Mask, Zeroable, DAG))
      return V;

  // Try to match an interleave of two v32i8s and lower them as unpck and
  // permutes using ymms.
  if (!Subtarget.hasAVX512())
    if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
                                                      Mask, DAG))
      return V;

  // Otherwise fall back on generic lowering.
  return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
                                    Subtarget, DAG);
}
17715
17716/// High-level routine to lower various 256-bit x86 vector shuffles.
17717///
17718/// This routine either breaks down the specific type of a 256-bit x86 vector
17719/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17720/// together based on the available instructions.
17722 SDValue V1, SDValue V2, const APInt &Zeroable,
17723 const X86Subtarget &Subtarget,
17724 SelectionDAG &DAG) {
17725 // If we have a single input to the zero element, insert that into V1 if we
17726 // can do so cheaply.
17727 int NumElts = VT.getVectorNumElements();
17728 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17729
17730 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17732 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17733 return Insertion;
17734
17735 // Handle special cases where the lower or upper half is UNDEF.
17736 if (SDValue V =
17737 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17738 return V;
17739
17740 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17741 // can check for those subtargets here and avoid much of the subtarget
17742 // querying in the per-vector-type lowering routines. With AVX1 we have
17743 // essentially *zero* ability to manipulate a 256-bit vector with integer
17744 // types. Since we'll use floating point types there eventually, just
17745 // immediately cast everything to a float and operate entirely in that domain.
17746 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17747 int ElementBits = VT.getScalarSizeInBits();
17748 if (ElementBits < 32) {
17749 // No floating point type available, if we can't use the bit operations
17750 // for masking/blending then decompose into 128-bit vectors.
17751 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17752 Subtarget, DAG))
17753 return V;
17754 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17755 return V;
17756 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17757 }
17758
17759 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17761 V1 = DAG.getBitcast(FpVT, V1);
17762 V2 = DAG.getBitcast(FpVT, V2);
17763 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17764 }
17765
17766 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17767 V1 = DAG.getBitcast(MVT::v16i16, V1);
17768 V2 = DAG.getBitcast(MVT::v16i16, V2);
17769 return DAG.getBitcast(VT,
17770 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17771 }
17772
17773 switch (VT.SimpleTy) {
17774 case MVT::v4f64:
17775 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17776 case MVT::v4i64:
17777 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17778 case MVT::v8f32:
17779 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17780 case MVT::v8i32:
17781 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17782 case MVT::v16i16:
17783 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17784 case MVT::v32i8:
17785 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17786
17787 default:
17788 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17789 }
17790}
17791
17792/// Try to lower a vector shuffle as a 128-bit shuffles.
17794 const APInt &Zeroable, SDValue V1, SDValue V2,
17795 const X86Subtarget &Subtarget,
17796 SelectionDAG &DAG) {
17797 assert(VT.getScalarSizeInBits() == 64 &&
17798 "Unexpected element type size for 128bit shuffle.");
17799
17800 // To handle 256 bit vector requires VLX and most probably
17801 // function lowerV2X128VectorShuffle() is better solution.
17802 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17803
17804 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17805 SmallVector<int, 4> Widened128Mask;
17806 if (!canWidenShuffleElements(Mask, Widened128Mask))
17807 return SDValue();
17808 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17809
17810 // Try to use an insert into a zero vector.
17811 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17812 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17813 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17814 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17815 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17816 DAG.getVectorIdxConstant(0, DL));
17817 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17818 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17819 DAG.getVectorIdxConstant(0, DL));
17820 }
17821
17822 // Check for patterns which can be matched with a single insert of a 256-bit
17823 // subvector.
17824 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17825 if (OnlyUsesV1 ||
17826 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17827 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17828 SDValue SubVec =
17829 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17830 DAG.getVectorIdxConstant(0, DL));
17831 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17832 DAG.getVectorIdxConstant(4, DL));
17833 }
17834
17835 // See if this is an insertion of the lower 128-bits of V2 into V1.
17836 bool IsInsert = true;
17837 int V2Index = -1;
17838 for (int i = 0; i < 4; ++i) {
17839 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17840 if (Widened128Mask[i] < 0)
17841 continue;
17842
17843 // Make sure all V1 subvectors are in place.
17844 if (Widened128Mask[i] < 4) {
17845 if (Widened128Mask[i] != i) {
17846 IsInsert = false;
17847 break;
17848 }
17849 } else {
17850 // Make sure we only have a single V2 index and its the lowest 128-bits.
17851 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17852 IsInsert = false;
17853 break;
17854 }
17855 V2Index = i;
17856 }
17857 }
17858 if (IsInsert && V2Index >= 0) {
17859 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17860 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17861 DAG.getVectorIdxConstant(0, DL));
17862 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17863 }
17864
17865 // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
17866 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17867 // possible we at least ensure the lanes stay sequential to help later
17868 // combines.
17869 SmallVector<int, 2> Widened256Mask;
17870 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17871 Widened128Mask.clear();
17872 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17873 }
17874
17875 // Try to lower to vshuf64x2/vshuf32x4.
17876 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17877 int PermMask[4] = {-1, -1, -1, -1};
17878 // Ensure elements came from the same Op.
17879 for (int i = 0; i < 4; ++i) {
17880 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17881 if (Widened128Mask[i] < 0)
17882 continue;
17883
17884 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17885 unsigned OpIndex = i / 2;
17886 if (Ops[OpIndex].isUndef())
17887 Ops[OpIndex] = Op;
17888 else if (Ops[OpIndex] != Op)
17889 return SDValue();
17890
17891 PermMask[i] = Widened128Mask[i] % 4;
17892 }
17893
17894 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17895 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17896}
17897
17898/// Handle lowering of 8-lane 64-bit floating point shuffles.
17900 const APInt &Zeroable, SDValue V1, SDValue V2,
17901 const X86Subtarget &Subtarget,
17902 SelectionDAG &DAG) {
17903 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17904 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17905 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17906
17907 if (V2.isUndef()) {
17908 // Use low duplicate instructions for masks that match their pattern.
17909 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17910 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17911
17912 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17913 // Non-half-crossing single input shuffles can be lowered with an
17914 // interleaved permutation.
17915 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17916 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17917 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17918 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17919 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17920 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17921 }
17922
17923 SmallVector<int, 4> RepeatedMask;
17924 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17925 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17926 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17927 }
17928
17929 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17930 V2, Subtarget, DAG))
17931 return Shuf128;
17932
17933 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17934 return Unpck;
17935
17936 // Check if the blend happens to exactly fit that of SHUFPD.
17937 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17938 Zeroable, Subtarget, DAG))
17939 return Op;
17940
17941 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17942 Subtarget, DAG))
17943 return V;
17944
17945 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17946 Zeroable, Subtarget, DAG))
17947 return Blend;
17948
17949 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17950}
17951
17952/// Handle lowering of 16-lane 32-bit floating point shuffles.
17954 const APInt &Zeroable, SDValue V1, SDValue V2,
17955 const X86Subtarget &Subtarget,
17956 SelectionDAG &DAG) {
17957 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17958 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17959 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17960
17961 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17962 // options to efficiently lower the shuffle.
17963 SmallVector<int, 4> RepeatedMask;
17964 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17965 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17966
17967 // Use even/odd duplicate instructions for masks that match their pattern.
17968 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17969 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17970 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17971 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17972
17973 if (V2.isUndef())
17974 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17975 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17976
17977 // Use dedicated unpack instructions for masks that match their pattern.
17978 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17979 return V;
17980
17981 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17982 Zeroable, Subtarget, DAG))
17983 return Blend;
17984
17985 // Otherwise, fall back to a SHUFPS sequence.
17986 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17987 }
17988
17989 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17990 Zeroable, Subtarget, DAG))
17991 return Blend;
17992
17994 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17995 return DAG.getBitcast(MVT::v16f32, ZExt);
17996
17997 // Try to create an in-lane repeating shuffle mask and then shuffle the
17998 // results into the target lanes.
18000 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
18001 return V;
18002
18003 // If we have a single input shuffle with different shuffle patterns in the
18004 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
18005 if (V2.isUndef() &&
18006 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
18007 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
18008 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
18009 }
18010
18011 // If we have AVX512F support, we can use VEXPAND.
18012 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
18013 Zeroable, Subtarget, DAG))
18014 return V;
18015
18016 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
18017}
18018
18019/// Handle lowering of 8-lane 64-bit integer shuffles.
18021 const APInt &Zeroable, SDValue V1, SDValue V2,
18022 const X86Subtarget &Subtarget,
18023 SelectionDAG &DAG) {
18024 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18025 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18026 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18027
18028 // Try to use shift instructions if fast.
18029 if (Subtarget.preferLowerShuffleAsShift())
18030 if (SDValue Shift =
18031 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
18032 Subtarget, DAG, /*BitwiseOnly*/ true))
18033 return Shift;
18034
18035 if (V2.isUndef()) {
18036 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18037 // can use lower latency instructions that will operate on all four
18038 // 128-bit lanes.
18039 SmallVector<int, 2> Repeated128Mask;
18040 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
18041 SmallVector<int, 4> PSHUFDMask;
18042 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
18043 return DAG.getBitcast(
18044 MVT::v8i64,
18045 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
18046 DAG.getBitcast(MVT::v16i32, V1),
18047 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18048 }
18049
18050 SmallVector<int, 4> Repeated256Mask;
18051 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
18052 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
18053 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
18054 }
18055
18056 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
18057 V2, Subtarget, DAG))
18058 return Shuf128;
18059
18060 // Try to use shift instructions.
18061 if (SDValue Shift =
18062 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
18063 DAG, /*BitwiseOnly*/ false))
18064 return Shift;
18065
18066 // Try to use VALIGN.
18067 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
18068 Zeroable, Subtarget, DAG))
18069 return Rotate;
18070
18071 // Try to use PALIGNR.
18072 if (Subtarget.hasBWI())
18073 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
18074 Subtarget, DAG))
18075 return Rotate;
18076
18077 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
18078 return Unpck;
18079
18080 // If we have AVX512F support, we can use VEXPAND.
18081 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
18082 Subtarget, DAG))
18083 return V;
18084
18085 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
18086 Zeroable, Subtarget, DAG))
18087 return Blend;
18088
18089 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
18090}
18091
18092/// Handle lowering of 16-lane 32-bit integer shuffles.
18094 const APInt &Zeroable, SDValue V1, SDValue V2,
18095 const X86Subtarget &Subtarget,
18096 SelectionDAG &DAG) {
18097 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18098 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18099 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18100
18101 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
18102
18103 // Whenever we can lower this as a zext, that instruction is strictly faster
18104 // than any alternative. It also allows us to fold memory operands into the
18105 // shuffle in many cases.
18107 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
18108 return ZExt;
18109
18110 // Try to use shift instructions if fast.
18111 if (Subtarget.preferLowerShuffleAsShift()) {
18112 if (SDValue Shift =
18113 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
18114 Subtarget, DAG, /*BitwiseOnly*/ true))
18115 return Shift;
18116 if (NumV2Elements == 0)
18117 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
18118 Subtarget, DAG))
18119 return Rotate;
18120 }
18121
18122 // If the shuffle mask is repeated in each 128-bit lane we can use more
18123 // efficient instructions that mirror the shuffles across the four 128-bit
18124 // lanes.
18125 SmallVector<int, 4> RepeatedMask;
18126 bool Is128BitLaneRepeatedShuffle =
18127 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
18128 if (Is128BitLaneRepeatedShuffle) {
18129 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18130 if (V2.isUndef())
18131 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
18132 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18133
18134 // Use dedicated unpack instructions for masks that match their pattern.
18135 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
18136 return V;
18137 }
18138
18139 // Try to use shift instructions.
18140 if (SDValue Shift =
18141 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
18142 Subtarget, DAG, /*BitwiseOnly*/ false))
18143 return Shift;
18144
18145 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
18146 if (SDValue Rotate =
18147 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
18148 return Rotate;
18149
18150 // Try to use VALIGN.
18151 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
18152 Zeroable, Subtarget, DAG))
18153 return Rotate;
18154
18155 // Try to use byte rotation instructions.
18156 if (Subtarget.hasBWI())
18157 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
18158 Subtarget, DAG))
18159 return Rotate;
18160
18161 // Assume that a single SHUFPS is faster than using a permv shuffle.
18162 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18163 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18164 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
18165 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
18166 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
18167 CastV1, CastV2, DAG);
18168 return DAG.getBitcast(MVT::v16i32, ShufPS);
18169 }
18170
18171 // Try to create an in-lane repeating shuffle mask and then shuffle the
18172 // results into the target lanes.
18174 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
18175 return V;
18176
18177 // If we have AVX512F support, we can use VEXPAND.
18178 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
18179 Zeroable, Subtarget, DAG))
18180 return V;
18181
18182 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
18183 Zeroable, Subtarget, DAG))
18184 return Blend;
18185
18186 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
18187}
18188
18189/// Handle lowering of 32-lane 16-bit integer shuffles.
18191 const APInt &Zeroable, SDValue V1, SDValue V2,
18192 const X86Subtarget &Subtarget,
18193 SelectionDAG &DAG) {
18194 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18195 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18196 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18197 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
18198
18199 // Whenever we can lower this as a zext, that instruction is strictly faster
18200 // than any alternative. It also allows us to fold memory operands into the
18201 // shuffle in many cases.
18203 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18204 return ZExt;
18205
18206 // Use dedicated unpack instructions for masks that match their pattern.
18207 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
18208 return V;
18209
18210 // Use dedicated pack instructions for masks that match their pattern.
18211 if (SDValue V =
18212 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
18213 return V;
18214
18215 // Try to use shift instructions.
18216 if (SDValue Shift =
18217 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
18218 Subtarget, DAG, /*BitwiseOnly*/ false))
18219 return Shift;
18220
18221 // Try to use byte rotation instructions.
18222 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
18223 Subtarget, DAG))
18224 return Rotate;
18225
18226 if (V2.isUndef()) {
18227 // Try to use bit rotation instructions.
18228 if (SDValue Rotate =
18229 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
18230 return Rotate;
18231
18232 SmallVector<int, 8> RepeatedMask;
18233 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
18234 // As this is a single-input shuffle, the repeated mask should be
18235 // a strictly valid v8i16 mask that we can pass through to the v8i16
18236 // lowering to handle even the v32 case.
18237 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
18238 RepeatedMask, Subtarget, DAG);
18239 }
18240 }
18241
18242 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
18243 Zeroable, Subtarget, DAG))
18244 return Blend;
18245
18246 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
18247 Zeroable, Subtarget, DAG))
18248 return PSHUFB;
18249
18250 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18251 // shuffle.
18253 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
18254 return Result;
18255
18256 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
18257}
18258
18259/// Handle lowering of 64-lane 8-bit integer shuffles.
18261 const APInt &Zeroable, SDValue V1, SDValue V2,
18262 const X86Subtarget &Subtarget,
18263 SelectionDAG &DAG) {
18264 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18265 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
18266 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
18267 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
18268
18269 // Whenever we can lower this as a zext, that instruction is strictly faster
18270 // than any alternative. It also allows us to fold memory operands into the
18271 // shuffle in many cases.
18273 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
18274 return ZExt;
18275
18276 // Use dedicated unpack instructions for masks that match their pattern.
18277 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
18278 return V;
18279
18280 // Use dedicated pack instructions for masks that match their pattern.
18281 if (SDValue V =
18282 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18283 return V;
18284
18285 // Try to use shift instructions.
18286 if (SDValue Shift =
18287 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
18288 DAG, /*BitwiseOnly*/ false))
18289 return Shift;
18290
18291 // Try to use byte rotation instructions.
18292 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
18293 Subtarget, DAG))
18294 return Rotate;
18295
18296 // Try to use bit rotation instructions.
18297 if (V2.isUndef())
18298 if (SDValue Rotate =
18299 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18300 return Rotate;
18301
18302 // Lower as AND if possible.
18303 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18304 Zeroable, Subtarget, DAG))
18305 return Masked;
18306
18307 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18308 Zeroable, Subtarget, DAG))
18309 return PSHUFB;
18310
18311 // Try to create an in-lane repeating shuffle mask and then shuffle the
18312 // results into the target lanes.
18313 // FIXME: Avoid on VBMI targets as the post lane permute often interferes
18314 // with shuffle combining (should be fixed by topological DAG sorting).
18315 if (!Subtarget.hasVBMI())
18317 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18318 return V;
18319
18321 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
18322 return Result;
18323
18324 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18325 Zeroable, Subtarget, DAG))
18326 return Blend;
18327
18328 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
18329 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
18330 // PALIGNR will be cheaper than the second PSHUFB+OR.
18331 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
18332 Mask, Subtarget, DAG))
18333 return V;
18334
18335 // VBMI can use VPERMV/VPERMV3 byte shuffles more efficiently than
18336 // OR(PSHUFB,PSHUFB).
18337 if (Subtarget.hasVBMI())
18338 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget,
18339 DAG);
18340
18341 // If we can't directly blend but can use PSHUFB, that will be better as it
18342 // can both shuffle and set up the inefficient blend.
18343 bool V1InUse, V2InUse;
18344 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
18345 DAG, V1InUse, V2InUse);
18346 }
18347
18348 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18349 // shuffle.
18351 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18352 return Result;
18353
18354 // VBMI can use VPERMV/VPERMV3 byte shuffles.
18355 if (Subtarget.hasVBMI())
18356 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18357
18358 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG,
18359 /*SimpleOnly*/ false);
18360}
18361
18362/// High-level routine to lower various 512-bit x86 vector shuffles.
18363///
18364/// This routine either breaks down the specific type of a 512-bit x86 vector
18365/// shuffle or splits it into two 256-bit shuffles and fuses the results back
18366/// together based on the available instructions.
18368 MVT VT, SDValue V1, SDValue V2,
18369 const APInt &Zeroable,
18370 const X86Subtarget &Subtarget,
18371 SelectionDAG &DAG) {
18372 assert(Subtarget.hasAVX512() &&
18373 "Cannot lower 512-bit vectors w/ basic ISA!");
18374
18375 // If we have a single input to the zero element, insert that into V1 if we
18376 // can do so cheaply.
18377 int NumElts = Mask.size();
18378 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18379
18380 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18382 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18383 return Insertion;
18384
18385 // Handle special cases where the lower or upper half is UNDEF.
18386 if (SDValue V =
18387 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18388 return V;
18389
18390 // Check for being able to broadcast a single element.
18391 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18392 Subtarget, DAG))
18393 return Broadcast;
18394
18395 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18396 // Try using bit ops for masking and blending before falling back to
18397 // splitting.
18398 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18399 Subtarget, DAG))
18400 return V;
18401 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18402 return V;
18403
18404 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
18405 }
18406
18407 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
18408 if (!Subtarget.hasBWI())
18409 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
18410 /*SimpleOnly*/ false);
18411
18412 V1 = DAG.getBitcast(MVT::v32i16, V1);
18413 V2 = DAG.getBitcast(MVT::v32i16, V2);
18414 return DAG.getBitcast(VT,
18415 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
18416 }
18417
18418 // Dispatch to each element type for lowering. If we don't have support for
18419 // specific element type shuffles at 512 bits, immediately split them and
18420 // lower them. Each lowering routine of a given type is allowed to assume that
18421 // the requisite ISA extensions for that element type are available.
18422 switch (VT.SimpleTy) {
18423 case MVT::v8f64:
18424 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18425 case MVT::v16f32:
18426 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18427 case MVT::v8i64:
18428 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18429 case MVT::v16i32:
18430 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18431 case MVT::v32i16:
18432 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18433 case MVT::v64i8:
18434 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18435
18436 default:
18437 llvm_unreachable("Not a valid 512-bit x86 vector type!");
18438 }
18439}
18440
18442 MVT VT, SDValue V1, SDValue V2,
18443 const X86Subtarget &Subtarget,
18444 SelectionDAG &DAG) {
18445 // Shuffle should be unary.
18446 if (!V2.isUndef())
18447 return SDValue();
18448
18449 int ShiftAmt = -1;
18450 int NumElts = Mask.size();
18451 for (int i = 0; i != NumElts; ++i) {
18452 int M = Mask[i];
18453 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
18454 "Unexpected mask index.");
18455 if (M < 0)
18456 continue;
18457
18458 // The first non-undef element determines our shift amount.
18459 if (ShiftAmt < 0) {
18460 ShiftAmt = M - i;
18461 // Need to be shifting right.
18462 if (ShiftAmt <= 0)
18463 return SDValue();
18464 }
18465 // All non-undef elements must shift by the same amount.
18466 if (ShiftAmt != M - i)
18467 return SDValue();
18468 }
18469 assert(ShiftAmt >= 0 && "All undef?");
18470
18471 // Great we found a shift right.
18472 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
18473 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
18474 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18475 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18476 DAG.getVectorIdxConstant(0, DL));
18477}
18478
18479// Determine if this shuffle can be implemented with a KSHIFT instruction.
18480// Returns the shift amount if possible or -1 if not. This is a simplified
18481// version of matchShuffleAsShift.
18482static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18483 int MaskOffset, const APInt &Zeroable) {
18484 int Size = Mask.size();
18485
18486 auto CheckZeros = [&](int Shift, bool Left) {
18487 for (int j = 0; j < Shift; ++j)
18488 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18489 return false;
18490
18491 return true;
18492 };
18493
18494 auto MatchShift = [&](int Shift, bool Left) {
18495 unsigned Pos = Left ? Shift : 0;
18496 unsigned Low = Left ? 0 : Shift;
18497 unsigned Len = Size - Shift;
18498 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18499 };
18500
18501 for (int Shift = 1; Shift != Size; ++Shift)
18502 for (bool Left : {true, false})
18503 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18504 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18505 return Shift;
18506 }
18507
18508 return -1;
18509}
18510
18511
// Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle and then truncate it back.
                                MVT VT, SDValue V1, SDValue V2,
                                const APInt &Zeroable,
                                const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/o basic ISA!");

  int NumElts = Mask.size();
  // Count mask elements sourced from V2 (indices >= NumElts).
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  // Try to recognize shuffles that are just padding a subvector with zeros.
  // SubvecElts counts the leading identity (or undef) run; Src records which
  // input (0 = V1, 1 = V2) that run comes from.
  int SubvecElts = 0;
  int Src = -1;
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] >= 0) {
      // Grab the source from the first valid mask. All subsequent elements need
      // to use this same source.
      if (Src < 0)
        Src = Mask[i] / NumElts;
      if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
        break;
    }

    ++SubvecElts;
  }
  assert(SubvecElts != NumElts && "Identity shuffle?");

  // Clip to a power 2.
  SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);

  // Make sure the number of zeroable bits in the top at least covers the bits
  // not covered by the subvector.
  if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
    assert(Src >= 0 && "Expected a source!");
    MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
    SDValue Extract =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
                    DAG.getVectorIdxConstant(0, DL));
    // Re-insert the low subvector into an all-zeros vector of the result type.
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       DAG.getConstant(0, DL, VT), Extract,
                       DAG.getVectorIdxConstant(0, DL));
  }

  // Try a simple shift right with undef elements. Later we'll try with zeros.
  if (SDValue Shift =
          lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
    return Shift;

  // Try to match KSHIFTs. Offset rebases the mask indices for V2's iteration.
  unsigned Offset = 0;
  for (SDValue V : {V1, V2}) {
    unsigned Opcode;
    int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
    if (ShiftAmt >= 0) {
      // Widen to a mask type with native KSHIFT support.
      SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
      MVT WideVT = Res.getSimpleValueType();
      // Widened right shifts need two shifts to ensure we shift in zeroes.
      if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
        int WideElts = WideVT.getVectorNumElements();
        // Shift left to put the original vector in the MSBs of the new size.
        Res =
            DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
                        DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
        // Increase the shift amount to account for the left shift.
        ShiftAmt += WideElts - NumElts;
      }

      Res = DAG.getNode(Opcode, DL, WideVT, Res,
                        DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                         DAG.getVectorIdxConstant(0, DL));
    }
    Offset += NumElts; // Increment for next iteration.
  }

  // If we're performing an unary shuffle on a SETCC result, try to shuffle the
  // ops instead.
  // TODO: What other unary shuffles would benefit from this?
  if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
    SDValue Op0 = V1.getOperand(0);
    SDValue Op1 = V1.getOperand(1);
    EVT OpVT = Op0.getValueType();
    // Only profitable when the shuffled element type is natively shuffleable
    // (>= 32-bit) or the whole thing is a broadcast.
    if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
      return DAG.getSetCC(
          DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
          DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
  }

  // If this is a sequential shuffle with zero'd elements - then lower to AND.
  bool IsBlendWithZero = all_of(enumerate(Mask), [&Zeroable](auto M) {
    return Zeroable[M.index()] || (M.value() == (int)M.index());
  });
  if (IsBlendWithZero) {
    // Materialize the inverted zeroable bits as an integer constant, bitcast
    // to a mask vector, and AND with V1. Use at least 8 bits - the smallest
    // legal mask integer width.
    const unsigned Width = std::max<unsigned>(NumElts, 8u);
    MVT IntVT = MVT::getIntegerVT(Width);

    APInt MaskValue = (~Zeroable).zextOrTrunc(Width);
    SDValue MaskNode = DAG.getConstant(MaskValue, DL, IntVT);

    MVT MaskVecVT = MVT::getVectorVT(MVT::i1, Width);
    SDValue MaskVecNode = DAG.getBitcast(MaskVecVT, MaskNode);

    SDValue MaskVec = DAG.getExtractSubvector(DL, VT, MaskVecNode, 0);
    return DAG.getNode(ISD::AND, DL, VT, V1, MaskVec);
  }

  // Fallback: pick a SIMD element type to sign-extend the mask vector into.
  MVT ExtVT;
  switch (VT.SimpleTy) {
  default:
    llvm_unreachable("Expected a vector of i1 elements");
  case MVT::v2i1:
    ExtVT = MVT::v2i64;
    break;
  case MVT::v4i1:
    ExtVT = MVT::v4i32;
    break;
  case MVT::v8i1:
    // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
    // shuffle.
    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
    break;
  case MVT::v16i1:
    // Take 512-bit type, unless we are avoiding 512-bit types and have the
    // 256-bit operation available.
    ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
    break;
  case MVT::v32i1:
    // Take 512-bit type, unless we are avoiding 512-bit types and have the
    // 256-bit operation available.
    assert(Subtarget.hasBWI() && "Expected AVX512BW support");
    ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
    break;
  case MVT::v64i1:
    // Fall back to scalarization. FIXME: We can do better if the shuffle
    // can be partitioned cleanly.
    if (!Subtarget.useBWIRegs())
      return SDValue();
    ExtVT = MVT::v64i8;
    break;
  }

  V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
  V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);

  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
  // Since i1 was sign-extended, convert back via a signed greater-than-zero
  // compare when a suitable vpmov*2m-style conversion exists (BWI for >= 32
  // elements, DQI for fewer); otherwise truncate back.
  int NumElems = VT.getVectorNumElements();
  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
      (Subtarget.hasDQI() && (NumElems < 32)))
    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
                        Shuffle, ISD::SETGT);

  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
18672
/// Helper function that returns true if the shuffle mask should be
/// commuted to improve canonicalization.
///
/// The tie-break order is: more elements from V1 than V2; then fewer V2
/// elements in the low half; then a lower index-sum for V1; then fewer odd
/// indices for V1.
  int NumElements = Mask.size();

  // Count how many mask entries reference each input. Negative entries are
  // undef and belong to neither input.
  int NumV1Elements = 0, NumV2Elements = 0;
  for (int M : Mask)
    if (M < 0)
      continue;
    else if (M < NumElements)
      ++NumV1Elements;
    else
      ++NumV2Elements;

  // Commute the shuffle as needed such that more elements come from V1 than
  // V2. This allows us to match the shuffle pattern strictly on how many
  // elements come from V1 without handling the symmetric cases.
  if (NumV2Elements > NumV1Elements)
    return true;

  assert(NumV1Elements > 0 && "No V1 indices");

  // Purely-V1 shuffles never need commuting.
  if (NumV2Elements == 0)
    return false;

  // When the number of V1 and V2 elements are the same, try to minimize the
  // number of uses of V2 in the low half of the vector. When that is tied,
  // ensure that the sum of indices for V1 is equal to or lower than the sum
  // indices for V2. When those are equal, try to ensure that the number of odd
  // indices for V1 is lower than the number of odd indices for V2.
  if (NumV1Elements == NumV2Elements) {
    int LowV1Elements = 0, LowV2Elements = 0;
    for (int M : Mask.slice(0, NumElements / 2))
      if (M >= NumElements)
        ++LowV2Elements;
      else if (M >= 0)
        ++LowV1Elements;
    if (LowV2Elements > LowV1Elements)
      return true;
    if (LowV2Elements == LowV1Elements) {
      int SumV1Indices = 0, SumV2Indices = 0;
      for (int i = 0, Size = Mask.size(); i < Size; ++i)
        if (Mask[i] >= NumElements)
          SumV2Indices += i;
        else if (Mask[i] >= 0)
          SumV1Indices += i;
      if (SumV2Indices < SumV1Indices)
        return true;
      if (SumV2Indices == SumV1Indices) {
        int NumV1OddIndices = 0, NumV2OddIndices = 0;
        for (int i = 0, Size = Mask.size(); i < Size; ++i)
          if (Mask[i] >= NumElements)
            NumV2OddIndices += i % 2;
          else if (Mask[i] >= 0)
            NumV1OddIndices += i % 2;
        if (NumV2OddIndices < NumV1OddIndices)
          return true;
      }
    }
  }

  return false;
}
18736
                                      const X86Subtarget &Subtarget) {
  // Returns true if the shuffle input V can instead be folded into an AVX512
  // masked (predicated) operation, making a separate blend/shuffle
  // unnecessary. Requires AVX512 mask registers.
  if (!Subtarget.hasAVX512())
    return false;

  // Need a simple value type to query the scalar/vector widths below.
  if (!V.getValueType().isSimple())
    return false;

  MVT VT = V.getSimpleValueType().getScalarType();
  // i8/i16 element masking requires AVX512BW.
  if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
    return false;

  // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
  // are preferable to blendw/blendvb/masked-mov.
  if ((VT == MVT::i16 || VT == MVT::i8) &&
      V.getSimpleValueType().getSizeInBits() < 512)
    return false;

  // Whitelist of opcodes that have masked AVX512 forms we can fold into.
  auto HasMaskOperation = [&](SDValue V) {
    // TODO: Currently we only check limited opcode. We probably extend
    // it to all binary operation by checking TLI.isBinOp().
    switch (V->getOpcode()) {
    default:
      return false;
    case ISD::ADD:
    case ISD::SUB:
    case ISD::AND:
    case ISD::XOR:
    case ISD::OR:
    case ISD::SMAX:
    case ISD::SMIN:
    case ISD::UMAX:
    case ISD::UMIN:
    case ISD::ABS:
    case ISD::SHL:
    case ISD::SRL:
    case ISD::SRA:
    case ISD::MUL:
      break;
    }
    // Only fold single-use values - otherwise the unmasked node stays live
    // anyway and nothing is saved.
    if (!V->hasOneUse())
      return false;

    return true;
  };

  if (HasMaskOperation(V))
    return true;

  return false;
}
18788
18789// Forward declaration.
18792 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18793 const X86Subtarget &Subtarget);
18794
/// Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
                                   SelectionDAG &DAG) {
  ArrayRef<int> OrigMask = SVOp->getMask();
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();
  int NumElements = VT.getVectorNumElements();
  SDLoc DL(Op);
  // vXi1 mask vectors take a dedicated path at the bottom of this function.
  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);

  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
         "Can't lower MMX shuffles");

  bool V1IsUndef = V1.isUndef();
  bool V2IsUndef = V2.isUndef();
  if (V1IsUndef && V2IsUndef)
    return DAG.getUNDEF(VT);

  // When we create a shuffle node we put the UNDEF node to second operand,
  // but in some cases the first operand may be transformed to UNDEF.
  // In this case we should just commute the node.
  if (V1IsUndef)
    return DAG.getCommutedVectorShuffle(*SVOp);

  // Check for non-undef masks pointing at an undef vector and make the masks
  // undef as well. This makes it easier to match the shuffle based solely on
  // the mask.
  if (V2IsUndef &&
      any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
    SmallVector<int, 8> NewMask(OrigMask);
    for (int &M : NewMask)
      if (M >= NumElements)
        M = -1;
    return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
  }

  // Check for illegal shuffle mask element index values.
  int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
  (void)MaskUpperLimit;
  assert(llvm::all_of(OrigMask,
                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
         "Out of bounds shuffle index");

  // We actually see shuffles that are entirely re-arrangements of a set of
  // zero inputs. This mostly happens while decomposing complex shuffles into
  // simple ones. Directly lower these as a buildvector of zeros.
  APInt KnownUndef, KnownZero;
  computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);

  APInt Zeroable = KnownUndef | KnownZero;
  if (Zeroable.isAllOnes())
    return getZeroVector(VT, Subtarget, DAG, DL);

  bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());

  // Try to collapse shuffles into using a vector type with fewer elements but
  // wider element types. We cap this to not form integers or floating point
  // elements wider than 64 bits. It does not seem beneficial to form i128
  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
  SmallVector<int, 16> WidenedMask;
  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
      !canCombineAsMaskOperation(V1, Subtarget) &&
      !canCombineAsMaskOperation(V2, Subtarget) &&
      canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
    // Shuffle mask widening should not interfere with a broadcast opportunity
    // by obfuscating the operands with bitcasts.
    // TODO: Avoid lowering directly from this top-level function: make this
    // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
                                                    Subtarget, DAG))
      return Broadcast;

    // Double the element width, preserving FP vs integer class.
    MVT NewEltVT = VT.isFloatingPoint()
    int NewNumElts = NumElements / 2;
    MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
    // Make sure that the new vector type is legal. For example, v2f64 isn't
    // legal on SSE1.
    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
      if (V2IsZero) {
        // Modify the new Mask to take all zeros from the all-zero vector.
        // Choose indices that are blend-friendly.
        bool UsedZeroVector = false;
        assert(is_contained(WidenedMask, SM_SentinelZero) &&
               "V2's non-undef elements are used?!");
        for (int i = 0; i != NewNumElts; ++i)
          if (WidenedMask[i] == SM_SentinelZero) {
            WidenedMask[i] = i + NewNumElts;
            UsedZeroVector = true;
          }
        // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
        // some elements to be undef.
        if (UsedZeroVector)
          V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
      }
      V1 = DAG.getBitcast(NewVT, V1);
      V2 = DAG.getBitcast(NewVT, V2);
      return DAG.getBitcast(
          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
    }
  }

  SmallVector<SDValue> Ops = {V1, V2};
  SmallVector<int> Mask(OrigMask);

  // Canonicalize the shuffle with any horizontal ops inputs.
  // Don't attempt this if the shuffle can still be widened as we may lose
  // whole lane shuffle patterns.
  // NOTE: This may update Ops and Mask.
  if (!canWidenShuffleElements(Mask)) {
            Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
      return DAG.getBitcast(VT, HOp);

    V1 = DAG.getBitcast(VT, Ops[0]);
    V2 = DAG.getBitcast(VT, Ops[1]);
    assert(NumElements == (int)Mask.size() &&
           "canonicalizeShuffleMaskWithHorizOp "
           "shouldn't alter the shuffle mask size");
  }

  // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
  // These will be materialized uniformly anyway, so make splat matching easier.
  // TODO: Allow all int constants?
  auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
    if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
      BitVector Undefs;
      if (SDValue Splat = BV->getSplatValue(&Undefs)) {
        // Only rebuild when some lanes were undef - otherwise the build
        // vector is already a full splat.
        if (Undefs.any() &&
          V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
        }
      }
    }
    return V;
  };
  V1 = CanonicalizeConstant(V1);
  V2 = CanonicalizeConstant(V2);

  // Commute the shuffle if it will improve canonicalization.
    std::swap(V1, V2);
  }

  // For each vector width, delegate to a specialized lowering routine.
  if (VT.is128BitVector())
    return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

  if (VT.is256BitVector())
    return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

  if (VT.is512BitVector())
    return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

  if (Is1BitVector)
    return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

  llvm_unreachable("Unimplemented!");
}
18965
// As legal vpcompress instructions depend on various AVX512 extensions, try to
// convert illegal vector sizes to legal ones to avoid expansion.
                                   SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Need AVX512 for custom VECTOR_COMPRESS lowering.");

  SDLoc DL(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Mask = Op.getOperand(1);
  SDValue Passthru = Op.getOperand(2);

  EVT VecVT = Vec.getValueType();
  EVT ElementVT = VecVT.getVectorElementType();
  unsigned NumElements = VecVT.getVectorNumElements();
  unsigned NumVecBits = VecVT.getFixedSizeInBits();
  unsigned NumElementBits = ElementVT.getFixedSizeInBits();

  // 128- and 256-bit vectors with <= 16 elements can be converted to and
  // compressed as 512-bit vectors in AVX512F.
  if (NumVecBits != 128 && NumVecBits != 256)
    return SDValue();

  if (NumElementBits == 32 || NumElementBits == 64) {
    // Widen to a 512-bit vector of the same element type and compress that.
    unsigned NumLargeElements = 512 / NumElementBits;
    MVT LargeVecVT =
        MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
    MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);

    // The new mask lanes must be zero so the widened tail contributes
    // nothing to the compressed result.
    Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
                         DAG, DL);
    Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
                          Subtarget, DAG, DL);
    Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
                                  : widenSubVector(LargeVecVT, Passthru,
                                                   /*ZeroNewElements=*/false,
                                                   Subtarget, DAG, DL);

    SDValue Compressed =
        DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
                       DAG.getConstant(0, DL, MVT::i64));
  }

  // Small-element (i8/i16) vectors: any-extend the elements so the total is
  // 512 bits, compress, then truncate back. Element count stays the same so
  // the mask is reused directly.
  if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
      VecVT == MVT::v16i16) {
    MVT LageElementVT = MVT::getIntegerVT(512 / NumElements);
    EVT LargeVecVT = MVT::getVectorVT(LageElementVT, NumElements);

    Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
    Passthru = Passthru.isUndef()
                   ? DAG.getUNDEF(LargeVecVT)
                   : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);

    SDValue Compressed =
        DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
    return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
  }

  return SDValue();
}
19027
/// Try to lower a VSELECT instruction to a vector shuffle.
///
/// Succeeds only when the condition is a constant build vector, so each lane
/// statically selects LHS or RHS and the whole select becomes a blend mask.
                                              const X86Subtarget &Subtarget,
                                              SelectionDAG &DAG) {
  SDValue Cond = Op.getOperand(0);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();

  // Only non-legal VSELECTs reach this lowering, convert those into generic
  // shuffles and re-use the shuffle lowering path for blends.
    // Mask is the per-lane shuffle mask derived from the constant condition.
    return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
  }

  return SDValue();
}
19047
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond = Op.getOperand(0);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  // Soft-promoted f16 vectors: perform the select on the integer bit pattern
  // and bitcast back.
  if (isSoftF16(VT, Subtarget)) {
    MVT NVT = VT.changeVectorElementTypeToInteger();
    return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
                                          DAG.getBitcast(NVT, LHS),
                                          DAG.getBitcast(NVT, RHS)));
  }

  // A vselect where all conditions and data are constants can be optimized into
  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
    return SDValue();

  // Try to lower this to a blend-style vector shuffle. This can handle all
  // constant condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;

  // If this VSELECT has a vector if i1 as a mask, it will be directly matched
  // with patterns on the mask registers on AVX-512.
  MVT CondVT = Cond.getSimpleValueType();
  unsigned CondEltSize = Cond.getScalarValueSizeInBits();
  if (CondEltSize == 1)
    return Op;

  // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();

  unsigned EltSize = VT.getScalarSizeInBits();
  unsigned NumElts = VT.getVectorNumElements();

  // Expand v32i16/v64i8 without BWI.
  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
    return SDValue();

  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
  // into an i1 condition so that we can use the mask-based 512-bit blend
  // instructions.
  if (VT.getSizeInBits() == 512) {
    // Build a mask by testing the condition against zero.
    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
    SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
                                DAG.getConstant(0, dl, CondVT),
                                ISD::SETNE);
    // Now return a new VSELECT using the mask.
    return DAG.getSelect(dl, VT, Mask, LHS, RHS);
  }

  // SEXT/TRUNC cases where the mask doesn't match the destination size.
  if (CondEltSize != EltSize) {
    // If we don't have a sign splat, rely on the expansion.
    if (CondEltSize != DAG.ComputeNumSignBits(Cond))
      return SDValue();

    MVT NewCondSVT = MVT::getIntegerVT(EltSize);
    MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
    Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
    return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
  }

  // v16i16/v32i8 selects without AVX2, if the condition and another operand
  // are free to split, then better to split before expanding the
  // select. Don't bother with XOP as it has the fast VPCMOV instruction.
  // TODO: This is very similar to narrowVectorSelect.
  // TODO: Add Load splitting to isFreeToSplitVector ?
  if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
      !Subtarget.hasXOP()) {
    bool FreeCond = isFreeToSplitVector(Cond, DAG);
    bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
                   (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
    bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
                   (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
    if (FreeCond && (FreeLHS || FreeRHS))
      return splitVectorOp(Op, DAG, dl);
  }

  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, and but if we need to expand, return
  // a null value.
  switch (VT.SimpleTy) {
  default:
    // Most of the vector types have blends past SSE4.1.
    return Op;

  case MVT::v32i8:
    // The byte blends for AVX vectors were introduced only in AVX2.
    if (Subtarget.hasAVX2())
      return Op;

    return SDValue();

  case MVT::v8i16:
  case MVT::v16i16:
  case MVT::v8f16:
  case MVT::v16f16: {
    // Bitcast everything to the vXi8 type and use a vXi8 vselect.
    MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
    Cond = DAG.getBitcast(CastVT, Cond);
    LHS = DAG.getBitcast(CastVT, LHS);
    RHS = DAG.getBitcast(CastVT, RHS);
    SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
    return DAG.getBitcast(VT, Select);
  }
  }
}
19162
  // Lower EXTRACT_VECTOR_ELT using SSE4.1 pextrb/extractps patterns; returns
  // a null SDValue to fall back to the generic path when not profitable.
  MVT VT = Op.getSimpleValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
  SDLoc dl(Op);

    return SDValue();

  if (VT.getSizeInBits() == 8) {
    // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
    // we're going to zero extend the register or fold the store.
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

    unsigned IdxVal = Idx->getAsZExtVal();
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
                                  DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
  }

  if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32. And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->user_begin();
    if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
        (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getBitcast(MVT::v4i32, Vec), Idx);
    return DAG.getBitcast(MVT::f32, Extract);
  }

  // i32/i64 element extracts are already legal (pextrd/pextrq).
  if (VT == MVT::i32 || VT == MVT::i64)
    return Op;

  return SDValue();
}
19211
/// Extract one bit from mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
                                        const X86Subtarget &Subtarget) {
  SDValue Vec = Op.getOperand(0);
  SDLoc dl(Vec);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);
  // Null when the index is not a compile-time constant.
  auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
  MVT EltVT = Op.getSimpleValueType();

  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
         "Unexpected vector type in ExtractBitFromMaskVector");

  // variable index can't be handled in mask registers,
  // extend vector to VR512/128
  if (!IdxC) {
    unsigned NumElts = VecVT.getVectorNumElements();
    // Extending v8i1/v16i1 to 512-bit get better performance on KNL
    // than extending to 128/256bit.
    if (NumElts == 1) {
      // Single element: widen the mask and read it out through an integer
      // bitcast (IntVT is declared just above this line in the full source).
      Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
    }
    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
  }

  unsigned IdxVal = IdxC->getZExtValue();
  if (IdxVal == 0) // the operation is legal
    return Op;

  // Extend to natively supported kshift.
  Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);

  // Use kshiftr instruction to move to the lower element.
  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
                    DAG.getTargetConstant(IdxVal, dl, MVT::i8));

  // Element 0 extraction is legal (see above), so finish with that.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                     DAG.getVectorIdxConstant(0, dl));
}
19258
// Helper to find all the extracted elements from a vector.
// Walks N's users and returns the set of element indices they demand;
// any user we can't reason about conservatively demands all elements.
  MVT VT = N->getSimpleValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  APInt DemandedElts = APInt::getZero(NumElts);
  for (SDNode *User : N->users()) {
    switch (User->getOpcode()) {
    case X86ISD::PEXTRB:
    case X86ISD::PEXTRW:
        // Non-constant extraction index: could touch any element.
        DemandedElts.setAllBits();
        return DemandedElts;
      }
      DemandedElts.setBit(User->getConstantOperandVal(1));
      break;
    case ISD::BITCAST: {
      if (!User->getValueType(0).isSimple() ||
          !User->getValueType(0).isVector()) {
        DemandedElts.setAllBits();
        return DemandedElts;
      }
      // Recurse through the bitcast and rescale the demanded elements to
      // this vector's element count.
      APInt DemandedSrcElts = getExtractedDemandedElts(User);
      DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
      break;
    }
    default:
      // Unknown user - assume everything is demanded.
      DemandedElts.setAllBits();
      return DemandedElts;
    }
  }
  return DemandedElts;
}
19292
19293SDValue
19294X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
19295 SelectionDAG &DAG) const {
19296 SDLoc dl(Op);
19297 SDValue Vec = Op.getOperand(0);
19298 MVT VecVT = Vec.getSimpleValueType();
19299 SDValue Idx = Op.getOperand(1);
19300 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19301
19302 if (VecVT.getVectorElementType() == MVT::i1)
19303 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
19304
19305 if (!IdxC) {
19306 // Its more profitable to go through memory (1 cycles throughput)
19307 // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
19308 // IACA tool was used to get performance estimation
19309 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
19310 //
19311 // example : extractelement <16 x i8> %a, i32 %i
19312 //
19313 // Block Throughput: 3.00 Cycles
19314 // Throughput Bottleneck: Port5
19315 //
19316 // | Num Of | Ports pressure in cycles | |
19317 // | Uops | 0 - DV | 5 | 6 | 7 | |
19318 // ---------------------------------------------
19319 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
19320 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
19321 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
19322 // Total Num Of Uops: 4
19323 //
19324 //
19325 // Block Throughput: 1.00 Cycles
19326 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
19327 //
19328 // | | Ports pressure in cycles | |
19329 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
19330 // ---------------------------------------------------------
19331 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
19332 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
19333 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
19334 // Total Num Of Uops: 4
19335
19336 return SDValue();
19337 }
19338
19339 unsigned IdxVal = IdxC->getZExtValue();
19340
19341 // If this is a 256-bit vector result, first extract the 128-bit vector and
19342 // then extract the element from the 128-bit vector.
19343 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
19344 // Get the 128-bit vector.
19345 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
19346 MVT EltVT = VecVT.getVectorElementType();
19347
19348 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
19349 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
19350
19351 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
19352 // this can be done with a mask.
19353 IdxVal &= ElemsPerChunk - 1;
19354 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19355 DAG.getVectorIdxConstant(IdxVal, dl));
19356 }
19357
19358 assert(VecVT.is128BitVector() && "Unexpected vector length");
19359
19360 MVT VT = Op.getSimpleValueType();
19361
19362 if (VT == MVT::i16) {
19363 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
19364 // we're going to zero extend the register or fold the store (SSE41 only).
19365 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
19366 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
19367 if (Subtarget.hasFP16())
19368 return Op;
19369
19370 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
19371 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19372 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19373 }
19374
19375 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
19376 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19377 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19378 }
19379
19380 if (Subtarget.hasSSE41())
19381 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
19382 return Res;
19383
19384 // Only extract a single element from a v16i8 source - determine the common
19385 // DWORD/WORD that all extractions share, and extract the sub-byte.
19386 // TODO: Add QWORD MOVQ extraction?
19387 if (VT == MVT::i8) {
19388 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
19389 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
19390
19391 // Extract either the lowest i32 or any i16, and extract the sub-byte.
19392 int DWordIdx = IdxVal / 4;
19393 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
19394 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19395 DAG.getBitcast(MVT::v4i32, Vec),
19396 DAG.getVectorIdxConstant(DWordIdx, dl));
19397 int ShiftVal = (IdxVal % 4) * 8;
19398 if (ShiftVal != 0)
19399 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
19400 DAG.getConstant(ShiftVal, dl, MVT::i8));
19401 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19402 }
19403
19404 int WordIdx = IdxVal / 2;
19405 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
19406 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
19407 DAG.getBitcast(MVT::v8i16, Vec),
19408 DAG.getVectorIdxConstant(WordIdx, dl));
19409 int ShiftVal = (IdxVal % 2) * 8;
19410 if (ShiftVal != 0)
19411 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
19412 DAG.getConstant(ShiftVal, dl, MVT::i8));
19413 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19414 }
19415 }
19416
19417 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
19418 if (IdxVal == 0)
19419 return Op;
19420
19421 // Shuffle the element to the lowest element, then movss or movsh.
19422 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
19423 Mask[0] = static_cast<int>(IdxVal);
19424 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19425 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19426 DAG.getVectorIdxConstant(0, dl));
19427 }
19428
19429 if (VT.getSizeInBits() == 64) {
19430 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
19431 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
19432 // to match extract_elt for f64.
19433 if (IdxVal == 0)
19434 return Op;
19435
19436 // UNPCKHPD the element to the lowest double word, then movsd.
19437 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
19438 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
19439 int Mask[2] = { 1, -1 };
19440 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19441 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19442 DAG.getVectorIdxConstant(0, dl));
19443 }
19444
19445 return SDValue();
19446}
19447
/// Insert one bit to mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
/// \p Op is an INSERT_VECTOR_ELT whose element type is i1; the three operands
/// are the mask vector, the scalar bit to insert, and the insertion index.
/// NOTE(review): the opening line of this definition's signature is not
/// visible in this copy; the remainder of the parameter list follows.
                               const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);  // Destination mask vector (vXi1).
  SDValue Elt = Op.getOperand(1);  // Scalar i1 element to insert.
  SDValue Idx = Op.getOperand(2);  // Insertion index (may be non-constant).
  MVT VecVT = Vec.getSimpleValueType();

  if (!isa<ConstantSDNode>(Idx)) {
    // Non constant index. Extend source and destination,
    // insert element and then truncate the result.
    // Element width is chosen so the extended vector stays within 128 bits
    // when possible (<= 8 elements), otherwise falls back to i8 lanes.
    unsigned NumElts = VecVT.getVectorNumElements();
    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
                                DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
                                DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
  }

  // Constant index: copy into a k-register, extract to v1i1 and
  // insert_subvector at the (constant) bit position.
  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
}
19474
// Lower INSERT_VECTOR_ELT for all X86 vector types: dispatches i1 masks to
// InsertBitToMaskVector, handles bf16 via integer bitcasts, variable indices
// via compare+select (when profitable), constant 0/-1 elements via OR/blend,
// wide (256/512-bit) vectors by recursing on a 128-bit chunk, and finally the
// SSE/SSE4.1 PINSR*/INSERTPS/BLENDI patterns. Returns SDValue() to fall back
// to default expansion.
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = EltVT.getScalarSizeInBits();

  // i1 (mask) insertion has a dedicated AVX-512 path.
  if (EltVT == MVT::i1)
    return InsertBitToMaskVector(Op, DAG, Subtarget);

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0); // Destination vector.
  SDValue N1 = Op.getOperand(1); // Element to insert.
  SDValue N2 = Op.getOperand(2); // Insertion index.
  auto *N2C = dyn_cast<ConstantSDNode>(N2);

  // bf16 has no native insert; reuse the i16 integer path via bitcasts.
  if (EltVT == MVT::bf16) {
    MVT IVT = VT.changeVectorElementTypeToInteger();
    SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
                              DAG.getBitcast(IVT, N0),
                              DAG.getBitcast(MVT::i16, N1), N2);
    return DAG.getBitcast(VT, Res);
  }

  if (!N2C) {
    // Variable insertion indices, usually we're better off spilling to stack,
    // but AVX512 can use a variable compare+select by comparing against all
    // possible vector indices, and FP insertion has less gpr->simd traffic.
    if (!(Subtarget.hasBWI() ||
          (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
          (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
      return SDValue();

    MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
    MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
    if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
      return SDValue();

    SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
    SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
    SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);

    // Build the constant vector {0, 1, 2, ...} to compare the splat index
    // against.
    SmallVector<SDValue, 16> RawIndices;
    for (unsigned I = 0; I != NumElts; ++I)
      RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
    SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);

    // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
    // NOTE(review): the condition-code argument of this call is not visible in
    // this copy (upstream passes ISD::SETEQ here) — confirm against upstream.
    return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
  }

  // Out-of-range constant index: let the generic code handle it.
  if (N2C->getAPIntValue().uge(NumElts))
    return SDValue();
  uint64_t IdxVal = N2C->getZExtValue();

  bool IsZeroElt = X86::isZeroNode(N1);
  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

  if (IsZeroElt || IsAllOnesElt) {
    // Lower insertion of v16i8/v32i8/v64i16 -1 elts as an 'OR' blend.
    // We don't deal with i8 0 since it appears to be handled elsewhere.
    if (IsAllOnesElt &&
        ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
         ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
      SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
      SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
      SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
      CstVectorElts[IdxVal] = OnesCst;
      SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
      return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
    }
    // See if we can do this more efficiently with a blend shuffle with a
    // rematerializable vector.
    if (Subtarget.hasSSE41() &&
        (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
      SmallVector<int, 8> BlendMask;
      for (unsigned i = 0; i != NumElts; ++i)
        BlendMask.push_back(i == IdxVal ? i + NumElts : i);
      SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
                                    : getOnesVector(VT, DAG, dl);
      return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
    }
  }

  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
  // into that, and then insert the subvector back into the result.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    // With a 256-bit vector, we can insert into the zero element efficiently
    // using a blend if we have AVX or AVX2 and the right data type.
    if (VT.is256BitVector() && IdxVal == 0) {
      // TODO: It is worthwhile to cast integer to floating point and back
      // and incur a domain crossing penalty if that's what we'll end up
      // doing anyway after extracting to a 128-bit vector.
      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
          (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
                           DAG.getTargetConstant(1, dl, MVT::i8));
      }
    }

    unsigned NumEltsIn128 = 128 / EltSizeInBits;
    assert(isPowerOf2_32(NumEltsIn128) &&
           "Vectors will always have power-of-two number of elements.");

    // If we are not inserting into the low 128-bit vector chunk,
    // then prefer the broadcast+blend sequence.
    // FIXME: relax the profitability check iff all N1 uses are insertions.
    if (IdxVal >= NumEltsIn128 &&
        ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
         (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
          X86::mayFoldLoad(N1, Subtarget)))) {
      SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
      SmallVector<int, 8> BlendMask;
      for (unsigned i = 0; i != NumElts; ++i)
        BlendMask.push_back(i == IdxVal ? i + NumElts : i);
      return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
    }

    // Get the desired 128-bit vector chunk.
    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

    // Insert the element into the desired chunk.
    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
                    DAG.getVectorIdxConstant(IdxIn128, dl));

    // Insert the changed part back into the bigger vector
    return insert128BitVector(N0, V, IdxVal, DAG, dl);
  }
  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

  // This will be just movw/movd/movq/movsh/movss/movsd.
  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
    if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
        EltVT == MVT::f16 || EltVT == MVT::i64) {
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
      return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
    }

    // We can't directly insert an i8 or i16 into a vector, so zero extend
    // it to i32 first.
    if (EltVT == MVT::i16 || EltVT == MVT::i8) {
      N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
      MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
      N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
      return DAG.getBitcast(VT, N1);
    }
  }

  // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
  // argument. SSE41 required for pinsrb.
  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    unsigned Opc;
    if (VT == MVT::v8i16) {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
      Opc = X86ISD::PINSRW;
    } else {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
      Opc = X86ISD::PINSRB;
    }

    assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
    N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }

  if (Subtarget.hasSSE41()) {
    if (EltVT == MVT::f32) {
      // Bits [7:6] of the constant are the source select. This will always be
      // zero here. The DAG Combiner may combine an extract_elt index into
      // these bits. For example (insert (extract, 3), 2) could be matched by
      // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
      // Bits [5:4] of the constant are the destination select. This is the
      // value of the incoming immediate.
      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
      // combine either bitwise AND or insert of float 0.0 to set these bits.

      bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
      if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
        // If this is an insertion of 32-bits into the low 32-bits of
        // a vector, we prefer to generate a blend with immediate rather
        // than an insertps. Blends are simpler operations in hardware and so
        // will always have equal or better performance than insertps.
        // But if optimizing for size and there's a load folding opportunity,
        // generate insertps because blendps does not have a 32-bit memory
        // operand form.
        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
                           DAG.getTargetConstant(1, dl, MVT::i8));
      }
      // Create this as a scalar-to-vector, then use INSERTPS with the
      // destination select in bits [5:4] of the immediate.
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
                         DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
    }

    // PINSR* works with constant index.
    if (EltVT == MVT::i32 || EltVT == MVT::i64)
      return Op;
  }

  return SDValue();
}
19685
// Lower ISD::FLDEXP (x * 2^exp) to the AVX-512 SCALEF/SCALEFS nodes.
// Scalars are done by moving into a 128-bit vector and using SCALEFS;
// vectors use SCALEF directly (with VLX where needed) or are widened to
// 512 bits. Returns SDValue() for unsupported types.
static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue X = Op.getOperand(0);   // Significand.
  MVT XTy = X.getSimpleValueType();
  SDValue Exp = Op.getOperand(1); // Integer exponent.

  switch (XTy.SimpleTy) {
  default:
    return SDValue();
  case MVT::f16:
    // Without native FP16 arithmetic, promote the scalar to f32 first; the
    // final getFPExtendOrRound rounds back to the original type.
    if (!Subtarget.hasFP16())
      X = DAG.getFPExtendOrRound(X, DL, MVT::f32);
    [[fallthrough]];
  case MVT::f32:
  case MVT::f64: {
    // Pack the scalar into a 128-bit vector and use the scalar SCALEFS node.
    MVT VT = MVT::getVectorVT(X.getSimpleValueType(),
                              128 / X.getSimpleValueType().getSizeInBits());
    Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
    SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, X);
    SDValue VExp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Exp);
    SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp);
    SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0);
    return DAG.getFPExtendOrRound(Final, DL, XTy);
  }
  case MVT::v4f32:
  case MVT::v2f64:
  case MVT::v8f32:
  case MVT::v4f64:
  case MVT::v16f32:
  case MVT::v8f64:
    // 512-bit vectors always have SCALEF; narrower ones need VLX. Otherwise
    // fall through to the widening code after the switch.
    if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) {
      Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
      return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
    }
    break;
  case MVT::v8f16:
  case MVT::v16f16:
    if (Subtarget.hasFP16()) {
      if (Subtarget.hasVLX()) {
        Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
        return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
      }
      break;
    }
    // No FP16: promote lanes to f32 and make the exponent match that width.
    X = DAG.getFPExtendOrRound(X, DL, XTy.changeVectorElementType(MVT::f32));
    Exp = DAG.getSExtOrTrunc(Exp, DL,
                             X.getSimpleValueType().changeTypeToInteger());
    break;
  case MVT::v32f16:
    if (Subtarget.hasFP16()) {
      Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
      return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
    }
    // Too wide to promote in one go without FP16: split into halves.
    return splitVectorOp(Op, DAG, DL);
  }
  // Widen to 512 bits (zero-filling the extra lanes) so SCALEF is available
  // without VLX, then extract the original-width subvector.
  SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 512);
  SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512);
  // NOTE(review): the value assigned to Exp below is not used afterwards —
  // SCALEF consumes WideExp, not this conversion. Looks like a dead store (or
  // a misrendered line in this copy); confirm against upstream.
  Exp = DAG.getNode(ISD::SINT_TO_FP, DL, WideExp.getSimpleValueType(), Exp);
  SDValue Scalef =
      DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), WideX, WideExp);
  SDValue Final =
      DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0);
  return DAG.getFPExtendOrRound(Final, DL, XTy);
}
19751
// Lower SCALAR_TO_VECTOR: place a scalar into lane 0 of a vector whose other
// lanes are undef. Zero scalars become a full zero vector; wide results are
// built in a 128-bit vector and inserted; small integer elements go through
// an any-extend to i32.
// NOTE(review): the opening line of this definition's signature is not
// visible in this copy; the remainder of the parameter list follows.
                                  SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();

  // It's always cheaper to replace a xor+movd with xorps and simplifies further
  // combines.
  if (X86::isZeroNode(Op.getOperand(0)))
    return getZeroVector(OpVT, Subtarget, DAG, dl);

  // If this is a 256-bit vector result, first insert into a 128-bit
  // vector and then insert into the 256-bit vector.
  if (!OpVT.is128BitVector()) {
    // Insert into a 128-bit vector.
    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
    // NOTE(review): the declaration opening this 128-bit VT (orig line 19767)
    // is not visible in this copy; only its continuation line follows.
                               OpVT.getVectorNumElements() / SizeFactor);

    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

    // Insert the 128-bit vector.
    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
  }
  assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
         "Expected an SSE type!");

  // Pass through a v4i32 or V8i16 SCALAR_TO_VECTOR as that's what we use in
  // tblgen.
  if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
    return Op;

  // i8/i16 elements: any-extend to i32 and emit the v4i32 form, then bitcast
  // back to the requested type.
  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  return DAG.getBitcast(
      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
19787
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
// Only vXi1 (mask) insert_subvectors reach here; everything else is legal.
// NOTE(review): the opening line of this definition's signature is not
// visible in this copy; the remainder of the parameter list follows.
                                 SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);

  // Delegate the actual mask-vector insertion to the shared helper.
  return insert1BitVector(Op, DAG, Subtarget);
}
19797
// Lower a vXi1 EXTRACT_SUBVECTOR: widen the mask to a legal k-register
// width, shift the desired bits down to the LSB with KSHIFTR, then take the
// low subvector (which is legal at index 0).
// NOTE(review): the opening line of this definition's signature is not
// visible in this copy; the remainder of the parameter list follows.
                                    SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Only vXi1 extract_subvectors need custom lowering");

  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  uint64_t IdxVal = Op.getConstantOperandVal(1);

  if (IdxVal == 0) // the operation is legal
    return Op;

  // Extend to natively supported kshift.
  Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);

  // Shift to the LSB.
  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
                    DAG.getTargetConstant(IdxVal, dl, MVT::i8));

  // Index-0 extraction of the (now low-positioned) bits is legal.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
                     DAG.getVectorIdxConstant(0, dl));
}
19820
19821// Returns the appropriate wrapper opcode for a global reference.
19822unsigned X86TargetLowering::getGlobalWrapperKind(
19823 const GlobalValue *GV, const unsigned char OpFlags) const {
19824 // References to absolute symbols are never PC-relative.
19825 if (GV && GV->isAbsoluteSymbolRef())
19826 return X86ISD::Wrapper;
19827
19828 // The following OpFlags under RIP-rel PIC use RIP.
19829 if (Subtarget.isPICStyleRIPRel() &&
19830 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19831 OpFlags == X86II::MO_DLLIMPORT))
19832 return X86ISD::WrapperRIP;
19833
19834 // GOTPCREL references must always use RIP.
19835 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19836 return X86ISD::WrapperRIP;
19837
19838 return X86ISD::Wrapper;
19839}
19840
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOV32ri.
// Lower a ConstantPoolSDNode to its target form wrapped in Wrapper/WrapperRIP,
// adding the global base register in PIC mode.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  // NOTE(review): the line opening this target-constant-pool call (orig line
  // 19856, declaring Result) is not visible in this copy.
      CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
  SDLoc DL(CP);
  Result =
      DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  return Result;
}
19870
19871SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19872 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19873
19874 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19875 // global base reg.
19876 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19877
19878 EVT PtrVT = Op.getValueType();
19879 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19880 SDLoc DL(JT);
19881 Result =
19882 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19883
19884 // With PIC, the address is actually $g + Offset.
19885 if (OpFlag)
19886 Result =
19887 DAG.getNode(ISD::ADD, DL, PtrVT,
19888 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19889
19890 return Result;
19891}
19892
19893SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19894 SelectionDAG &DAG) const {
19895 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19896}
19897
19898SDValue
19899X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19900 // Create the TargetBlockAddressAddress node.
19901 unsigned char OpFlags =
19902 Subtarget.classifyBlockAddressReference();
19903 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19904 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19905 SDLoc dl(Op);
19906 EVT PtrVT = Op.getValueType();
19907 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19908 Result =
19909 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19910
19911 // With PIC, the address is actually $g + Offset.
19912 if (isGlobalRelativeToPICBase(OpFlags)) {
19913 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19914 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19915 }
19916
19917 return Result;
19918}
19919
/// Creates target global address or external symbol nodes for calls or
/// other uses.
/// \p ForCall suppresses the wrapper when the callee can be reached directly.
/// \p IsImpCall, when non-null and Import Call Optimization is enabled for a
/// dllimport function, is set and the unwrapped address is returned.
SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
                                                 bool ForCall,
                                                 bool *IsImpCall) const {
  // Unpack the global address or external symbol.
  SDLoc dl(Op);
  const GlobalValue *GV = nullptr;
  int64_t Offset = 0;
  const char *ExternalSym = nullptr;
  if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
    GV = G->getGlobal();
    Offset = G->getOffset();
  } else {
    const auto *ES = cast<ExternalSymbolSDNode>(Op);
    ExternalSym = ES->getSymbol();
  }

  // Calculate some flags for address lowering.
  // NOTE(review): the declaration of Mod (orig line 19939) is not visible in
  // this copy; Mod is used below for classification and module flags.
  unsigned char OpFlags;
  if (ForCall)
    OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
  else
    OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
  bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
  bool NeedsLoad = isGlobalStubReference(OpFlags);

  EVT PtrVT = Op.getValueType();

  if (GV) {
    // Create a target global address if this is a global. If possible, fold the
    // offset into the global address reference. Otherwise, ADD it on later.
    // Suppress the folding if Offset is negative: movl foo-1, %eax is not
    // allowed because if the address of foo is 0, the ELF R_X86_64_32
    // relocation will compute to a negative value, which is invalid.
    int64_t GlobalOffset = 0;
    if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
      std::swap(GlobalOffset, Offset);
    }
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
  } else {
    // If this is not a global address, this must be an external symbol.
    Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
  }

  // If this is a direct call, avoid the wrapper if we don't need to do any
  // loads or adds. This allows SDAG ISel to match direct calls.
  if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
    return Result;

  // If Import Call Optimization is enabled and this is an imported function
  // then make a note of it and return the global address without wrapping.
  if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
      Mod.getModuleFlag("import-call-optimization")) {
    assert(ForCall && "Should only enable import call optimization if we are "
                      "lowering a call");
    *IsImpCall = true;
    return Result;
  }

  Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (HasPICReg) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (NeedsLoad)
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
                         DAG.getSignedConstant(Offset, dl, PtrVT));

  return Result;
}
20006
20007SDValue
20008X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
20009 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
20010}
20011
// Emit the pseudo call sequence that materializes a TLS address: a
// CALLSEQ_START/TLSADDR-or-TLSBASEADDR-or-TLSDESC/CALLSEQ_END bracket whose
// result is copied out of \p ReturnReg. For TLSDESC the returned value is an
// offset from the thread pointer, so a segment-relative load of %fs:0/%gs:0
// is added at the end. For local-dynamic TLSDESC an existing sequence for
// _TLS_MODULE_BASE_ is reused when one is already in the DAG.
// NOTE(review): the opening line of this definition's signature is not
// visible in this copy; the remainder of the parameter list follows.
                           const EVT PtrVT, unsigned ReturnReg,
                           unsigned char OperandFlags,
                           bool LoadGlobalBaseReg = false,
                           bool LocalDynamic = false) {
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA;
  bool UseTLSDESC = DAG.getTarget().useTLSDESC();
  SDValue Chain = DAG.getEntryNode();
  SDValue Ret;
  if (LocalDynamic && UseTLSDESC) {
    TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
    // Reuse existing GetTLSADDR node if we can find it.
    if (TGA->hasOneUse()) {
      // TLSDESC uses TGA.
      SDNode *TLSDescOp = *TGA->user_begin();
      assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
             "Unexpected TLSDESC DAG");
      // CALLSEQ_END uses TGA via a chain and glue.
      auto *CallSeqEndOp = TLSDescOp->getGluedUser();
      assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
             "Unexpected TLSDESC DAG");
      // CopyFromReg uses CALLSEQ_END via a chain and glue.
      auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
      assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
             "Unexpected TLSDESC DAG");
      Ret = SDValue(CopyFromRegOp, 0);
    }
  } else {
    TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                     GA->getOffset(), OperandFlags);
  }

  if (!Ret) {
    // Pick the pseudo: TLSDESC when the descriptor ABI is in use, otherwise
    // TLSBASEADDR for local-dynamic base computation, else plain TLSADDR.
    unsigned CallType = UseTLSDESC    ? X86ISD::TLSDESC
                        : LocalDynamic ? X86ISD::TLSBASEADDR
                                       : X86ISD::TLSADDR;

    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
    if (LoadGlobalBaseReg) {
      // 32-bit PIC requires the GOT pointer in EBX for the TLS call.
      SDValue InGlue;
      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
                               DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
                               InGlue);
      InGlue = Chain.getValue(1);
      Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
    } else {
      Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
    }
    Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);

    // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
    // NOTE(review): the declaration of MFI (orig line 20017) is not visible
    // in this copy.
    MFI.setHasCalls(true);

    SDValue Glue = Chain.getValue(1);
    Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
  }

  if (!UseTLSDESC)
    return Ret;

  // TLSDESC returns an offset from the thread pointer; add the thread pointer
  // (loaded from %fs:0 on 64-bit, %gs:0 on 32-bit).
  const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
  unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;

  // NOTE(review): the declaration of Ptr (orig line 20078, a null pointer in
  // address space Seg) is not visible in this copy.
  SDValue Offset =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
                  MachinePointerInfo(Ptr));
  return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
}
20084
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit.
// The result of the TLS call arrives in EAX, and 32-bit PIC needs the global
// base register loaded into EBX for the call.
static SDValue
// NOTE(review): this definition's name line (orig line 20087) is not visible
// in this copy; the remainder of the parameter list follows.
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
                    /*LoadGlobalBaseReg=*/true);
}
20092
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64.
// The result of the TLS call arrives in RAX; no base register setup needed.
static SDValue
// NOTE(review): this definition's name line (orig line 20095) is not visible
// in this copy; the remainder of the parameter list follows.
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
}
20099
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32.
// Pointers are 32-bit, so the result arrives in EAX even in 64-bit mode.
static SDValue
// NOTE(review): this definition's name line (orig line 20102) is not visible
// in this copy; the remainder of the parameter list follows.
                                 const EVT PtrVT) {
  return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
}
20106
// Lower ISD::GlobalTLSAddress using the "local dynamic" model: compute the
// module's TLS block base once via GetTLSADDR, then add the variable's
// x@dtpoff offset to it.
// NOTE(review): the opening line of this definition's signature is not
// visible in this copy; the remainder of the parameter list follows.
                                           SelectionDAG &DAG, const EVT PtrVT,
                                           bool Is64Bit, bool Is64BitLP64) {
  SDLoc dl(GA);

  // Get the start address of the TLS block for this module.

  SDValue Base;
  if (Is64Bit) {
    // 64-bit: result register depends on LP64 (RAX) vs. ILP32 (EAX).
    unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
    Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
                      /*LoadGlobalBaseReg=*/false,
                      /*LocalDynamic=*/true);
  } else {
    // 32-bit: TLSLDM relocation and the global base register in EBX.
    Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
                      /*LoadGlobalBaseReg=*/true,
                      /*LocalDynamic=*/true);
  }

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
  unsigned char OperandFlags = X86II::MO_DTPOFF;
  unsigned WrapperKind = X86ISD::Wrapper;
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}
20143
// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model:
// load the thread pointer from %gs:0 (32-bit) or %fs:0 (64-bit), compute the
// variable's offset (directly for local-exec; via a GOT load for
// initial-exec), and add the two.
// NOTE(review): the opening line of this definition's signature is not
// visible in this copy; the remainder of the parameter list follows.
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  SDLoc dl(GA);

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
  // NOTE(review): the declaration of Ptr (orig lines 20151-20152, a null
  // pointer in the segment's address space) is not visible in this copy.

  SDValue ThreadPointer =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
                  MachinePointerInfo(Ptr));

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initialexec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }

  // emit "addl x@ntpoff,%eax" (local exec)
  // or "addl x@indntpoff,%eax" (initial exec)
  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec) {
    // 32-bit PIC initial-exec addresses the GOT slot via the base register.
    if (isPIC && !is64Bit) {
      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    }

    // Initial-exec reads the actual tp-offset out of the GOT.
    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}
20198
// Lower a GlobalTLSAddress node for the current target OS:
//  - emulated TLS (if enabled) delegates to the generic emulated model,
//  - ELF dispatches on the TLS model chosen by the target machine,
//  - Darwin always uses its single TLVP/TLSCALL model,
//  - Windows uses the implicit TEB + _tls_index scheme.
20199SDValue
20200X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
20201
20202 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
20203
20204 if (DAG.getTarget().useEmulatedTLS())
20205 return LowerToTLSEmulatedModel(GA, DAG);
20206
20207 const GlobalValue *GV = GA->getGlobal();
20208 EVT PtrVT = Op.getValueType();
20209 bool PositionIndependent = isPositionIndependent();
20210
20211 if (Subtarget.isTargetELF()) {
20212 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
20213 switch (model) {
20215 if (Subtarget.is64Bit()) {
20216 if (Subtarget.isTarget64BitLP64())
20217 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
20218 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
20219 }
20220 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
20222 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
20223 Subtarget.isTarget64BitLP64());
20226 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
20227 PositionIndependent);
20228 }
20229 llvm_unreachable("Unknown TLS model.");
20230 }
20231
20232 if (Subtarget.isTargetDarwin()) {
20233 // Darwin only has one model of TLS. Lower to that.
20234 unsigned char OpFlag = 0;
20235 unsigned WrapperKind = 0;
20236
20237 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20238 // global base reg.
20239 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
20240 if (PIC32) {
20241 OpFlag = X86II::MO_TLVP_PIC_BASE;
20242 WrapperKind = X86ISD::Wrapper;
20243 } else {
20244 OpFlag = X86II::MO_TLVP;
20245 WrapperKind = X86ISD::WrapperRIP;
20246 }
20247 SDLoc DL(Op);
20249 GA->getValueType(0),
20250 GA->getOffset(), OpFlag);
20251 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
20252
20253 // With PIC32, the address is actually $g + Offset.
20254 if (PIC32)
20255 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
20256 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20257 Offset);
20258
20259 // Lowering the machine isd will make sure everything is in the right
20260 // location.
// The TLVP access is modeled as a call (X86ISD::TLSCALL) wrapped in a
// callseq so the register allocator treats it like any other call.
20261 SDValue Chain = DAG.getEntryNode();
20262 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20263 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
20264 SDValue Args[] = { Chain, Offset };
20265 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
20266 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
20267
20268 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
20269 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20270 MFI.setAdjustsStack(true);
20271
20272 // And our return value (tls address) is in the standard call return value
20273 // location.
20274 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
20275 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
20276 }
20277
20278 if (Subtarget.isOSWindows()) {
20279 // Just use the implicit TLS architecture
20280 // Need to generate something similar to:
20281 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
20282 // ; from TEB
20283 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
20284 // mov rcx, qword [rdx+rcx*8]
20285 // mov eax, .tls$:tlsvar
20286 // [rax+rcx] contains the address
20287 // Windows 64bit: gs:0x58
20288 // Windows 32bit: fs:__tls_array
20289
20290 SDLoc dl(GA);
20291 SDValue Chain = DAG.getEntryNode();
20292
20293 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
20294 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
20295 // use its literal value of 0x2C.
20297 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
20299
20300 SDValue TlsArray = Subtarget.is64Bit()
20301 ? DAG.getIntPtrConstant(0x58, dl)
20302 : (Subtarget.isTargetWindowsGNU()
20303 ? DAG.getIntPtrConstant(0x2C, dl)
20304 : DAG.getExternalSymbol("_tls_array", PtrVT));
20305
20306 SDValue ThreadPointer =
20307 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
20308
20309 SDValue res;
20311 res = ThreadPointer;
20312 } else {
20313 // Load the _tls_index variable
20314 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
20315 if (Subtarget.is64Bit())
20316 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
20317 MachinePointerInfo(), MVT::i32);
20318 else
20319 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
20320
// Scale the index by the pointer size to index the TLS pointer array
// (matches the [rdx+rcx*8] step in the sketch above).
20321 const DataLayout &DL = DAG.getDataLayout();
20322 SDValue Scale =
20323 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
20324 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
20325
20326 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
20327 }
20328
// Load the base address of this module's TLS block
// (ThreadLocalStoragePointer[_tls_index]).
20329 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
20330
20331 // Get the offset of start of .tls section
20332 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20333 GA->getValueType(0),
20335 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
20336
20337 // The address of the thread local variable is the add of the thread
20338 // pointer with the offset of the variable.
20339 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
20340 }
20341
20342 llvm_unreachable("TLS not implemented for this target.");
20343}
20344
// Returns true when the chosen TLS model allows the %fs-relative TLS address
// to be folded directly into an addressing mode; per the comments below this
// is only the case for the exec models on 64-bit ELF.
20346 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
20347 const TargetMachine &TM = getTargetMachine();
20348 TLSModel::Model Model = TM.getTLSModel(&GV);
20349 switch (Model) {
20352 // We can include the %fs segment register in addressing modes.
20353 return true;
20356 // These models do not result in %fs relative addresses unless
20357 // TLS descriptors are used.
20358 //
20359 // Even in the case of TLS descriptors we currently have no way to model
20360 // the difference between %fs access and the computations needed for the
20361 // offset and returning `true` for TLS-desc currently duplicates both
20362 // which is detrimental :-/
20363 return false;
20364 }
20365 }
20366 return false;
20367}
20368
20369/// Lower SRA_PARTS and friends, which return two i32 values
20370/// and take a 2 x i32 value to shift plus a shift amount.
20371/// TODO: Can this be moved to general expansion code?
/// Delegates the expansion to TargetLowering::expandShiftParts and merely
/// repackages the low/high halves as a merged value pair.
20373 SDValue Lo, Hi;
20374 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
20375 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
20376}
20377
20378// Try to use a packed vector operation to handle i64 on 32-bit targets when
20379// AVX512DQ is enabled.
// The scalar i64 is placed in element 0 of a vector, converted with the
// packed DQ instruction, and the scalar f32/f64 result extracted from
// element 0. Returns an empty SDValue when the transform does not apply.
20381 SelectionDAG &DAG,
20382 const X86Subtarget &Subtarget) {
20383 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20384 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20385 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20386 Op.getOpcode() == ISD::UINT_TO_FP) &&
20387 "Unexpected opcode!");
20388 bool IsStrict = Op->isStrictFPOpcode();
20389 unsigned OpNo = IsStrict ? 1 : 0;
20390 SDValue Src = Op.getOperand(OpNo);
20391 MVT SrcVT = Src.getSimpleValueType();
20392 MVT VT = Op.getSimpleValueType();
20393
20394 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
20395 (VT != MVT::f32 && VT != MVT::f64))
20396 return SDValue();
20397
20398 // Pack the i64 into a vector, do the operation and extract.
20399
20400 // Using 256-bit to ensure result is 128-bits for f32 case.
// Without VLX only 512-bit AVX512 vector ops are available, so widen to 8
// elements in that case.
20401 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
20402 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
20403 MVT VecVT = MVT::getVectorVT(VT, NumElts);
20404
20405 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
20406 if (IsStrict) {
// Strict FP: thread the incoming chain through the vector convert and
// return {value, chain}.
20407 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
20408 {Op.getOperand(0), InVec});
20409 SDValue Chain = CvtVec.getValue(1);
20410 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20411 DAG.getVectorIdxConstant(0, dl));
20412 return DAG.getMergeValues({Value, Chain}, dl);
20413 }
20414
20415 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
20416
20417 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20418 DAG.getVectorIdxConstant(0, dl));
20419}
20420
20421// Try to use a packed vector operation to handle i64 on 32-bit targets.
// Same trick as LowerI64IntToFP_AVX512DQ, but for scalar i64 -> f16 with
// AVX512-FP16: move the scalar into a v2i64, convert to v2f16, and extract
// element 0. Returns an empty SDValue when the transform does not apply.
20423 const X86Subtarget &Subtarget) {
20424 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20425 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20426 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20427 Op.getOpcode() == ISD::UINT_TO_FP) &&
20428 "Unexpected opcode!");
20429 bool IsStrict = Op->isStrictFPOpcode();
20430 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20431 MVT SrcVT = Src.getSimpleValueType();
20432 MVT VT = Op.getSimpleValueType();
20433
20434 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
20435 return SDValue();
20436
20437 // Pack the i64 into a vector, do the operation and extract.
20438
20439 assert(Subtarget.hasFP16() && "Expected FP16");
20440
20441 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
20442 if (IsStrict) {
// Strict FP: thread the incoming chain through the vector convert.
20443 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
20444 {Op.getOperand(0), InVec});
20445 SDValue Chain = CvtVec.getValue(1);
20446 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20447 DAG.getVectorIdxConstant(0, dl));
20448 return DAG.getMergeValues({Value, Chain}, dl);
20449 }
20450
20451 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
20452
20453 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20454 DAG.getVectorIdxConstant(0, dl));
20455}
20456
20457static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
20458 const X86Subtarget &Subtarget) {
20459 switch (Opcode) {
20460 case ISD::SINT_TO_FP:
20461 // TODO: Handle wider types with AVX/AVX512.
20462 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
20463 return false;
20464 // CVTDQ2PS or (V)CVTDQ2PD
20465 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
20466
20467 case ISD::UINT_TO_FP:
20468 // TODO: Handle wider types and i64 elements.
20469 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
20470 return false;
20471 // VCVTUDQ2PS or VCVTUDQ2PD
20472 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
20473
20474 default:
20475 return false;
20476 }
20477}
20478
20479/// Given a scalar cast operation that is extracted from a vector, try to
20480/// vectorize the cast op followed by extraction. This will avoid an expensive
20481/// round-trip between XMM and GPR.
/// Returns an empty SDValue when no suitable 128-bit packed cast exists for
/// this opcode/type combination (see useVectorCast).
20483 SelectionDAG &DAG,
20484 const X86Subtarget &Subtarget) {
20485 // TODO: This could be enhanced to handle smaller integer types by peeking
20486 // through an extend.
20487 SDValue Extract = Cast.getOperand(0);
20488 MVT DestVT = Cast.getSimpleValueType();
// Only handle extracts with a constant lane index.
20489 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20490 !isa<ConstantSDNode>(Extract.getOperand(1)))
20491 return SDValue();
20492
20493 // See if we have a 128-bit vector cast op for this type of cast.
20494 SDValue VecOp = Extract.getOperand(0);
20495 EVT FromVT = VecOp.getValueType();
20496 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
20497 MVT Vec128VT =
20498 MVT::getVectorVT(FromVT.getScalarType().getSimpleVT(), NumEltsInXMM);
20499 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
20500 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
20501 return SDValue();
20502
20503 // If we are extracting from a non-zero element, first shuffle the source
20504 // vector to allow extracting from element zero.
20505 if (!isNullConstant(Extract.getOperand(1))) {
20506 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
20507 Mask[0] = Extract.getConstantOperandVal(1);
20508 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
20509 }
20510 // If the source vector is wider than 128-bits, extract the low part. Do not
20511 // create an unnecessarily wide vector cast op.
20512 if (FromVT != Vec128VT)
20513 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
20514
20515 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
20516 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
20517 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
20518 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
20519 DAG.getVectorIdxConstant(0, DL));
20520}
20521
20522/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
20523/// try to vectorize the cast ops. This will avoid an expensive round-trip
20524/// between XMM and GPR.
/// i.e. turn (f64 (sint_to_fp (fp_to_sint X))) into a packed truncating
/// convert plus a packed int-to-fp on element 0 of an XMM/ZMM register.
/// Returns an empty SDValue when no suitable packed instructions exist.
20525static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
20526 SelectionDAG &DAG,
20527 const X86Subtarget &Subtarget) {
20528 SDValue CastToInt = CastToFP.getOperand(0);
20529 MVT VT = CastToFP.getSimpleValueType();
20530 if ((CastToInt.getOpcode() != ISD::FP_TO_SINT &&
20531 CastToInt.getOpcode() != ISD::FP_TO_UINT) ||
20532 VT.isVector())
20533 return SDValue();
20534
20535 MVT IntVT = CastToInt.getSimpleValueType();
20536 SDValue X = CastToInt.getOperand(0);
20537 MVT SrcVT = X.getSimpleValueType();
20538 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
20539 return SDValue();
20540
20541 // See if we have 128-bit vector cast instructions for this type of cast.
20542 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
20543 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
20544 (IntVT != MVT::i32 && IntVT != MVT::i64))
20545 return SDValue();
20546
20547 unsigned SrcSize = SrcVT.getSizeInBits();
20548 unsigned IntSize = IntVT.getSizeInBits();
20549 unsigned VTSize = VT.getSizeInBits();
// When the element sizes on the two sides of a conversion differ, use the
// X86-specific packed nodes (CVTTP2SI/CVTSI2P); the generic ISD nodes need
// matching element counts on both sides.
20550 bool IsUnsigned = CastToInt.getOpcode() == ISD::FP_TO_UINT;
20551 unsigned ToIntOpcode =
20552 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
20553 unsigned ToFPOpcode =
20554 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
20555 unsigned Width = 128;
20556
20557 if (Subtarget.hasVLX() && Subtarget.hasDQI()) {
20558 // AVX512DQ+VLX
// Unsigned variants are only available with AVX512; swap in the unsigned
// opcodes.
20559 if (IsUnsigned) {
20560 ToIntOpcode =
20561 SrcSize != IntSize ? X86ISD::CVTTP2UI : (unsigned)ISD::FP_TO_UINT;
20562 ToFPOpcode =
20563 IntSize != VTSize ? X86ISD::CVTUI2P : (unsigned)ISD::UINT_TO_FP;
20564 }
20565 } else {
20566 if (IsUnsigned || IntVT == MVT::i64) {
20567 // SSE2 can only perform f64/f32 <-> i32 signed.
20568 if (!Subtarget.useAVX512Regs() || !Subtarget.hasDQI())
20569 return SDValue();
20570
20571 // Need to extend width for AVX512DQ without AVX512VL.
20572 Width = 512;
20573 ToIntOpcode = CastToInt.getOpcode();
20574 ToFPOpcode = IsUnsigned ? ISD::UINT_TO_FP : ISD::SINT_TO_FP;
20575 }
20576 }
20577
20578 MVT VecSrcVT, VecIntVT, VecVT;
20579 unsigned NumElts;
20580 unsigned SrcElts, VTElts;
20581 // Some conversions are only legal with uniform vector sizes on AVX512DQ.
20582 if (Width == 512) {
20583 NumElts = std::min(Width / IntSize, Width / SrcSize);
20584 SrcElts = NumElts;
20585 VTElts = NumElts;
20586 } else {
20587 NumElts = Width / IntSize;
20588 SrcElts = Width / SrcSize;
20589 VTElts = Width / VTSize;
20590 }
20591 VecIntVT = MVT::getVectorVT(IntVT, NumElts);
20592 VecSrcVT = MVT::getVectorVT(SrcVT, SrcElts);
20593 VecVT = MVT::getVectorVT(VT, VTElts);
20594 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
20595 //
20596 // We are not defining the high elements (for example, zero them) because
20597 // that could nullify any performance advantage that we hoped to gain from
20598 // this vector op hack. We do not expect any adverse effects (like denorm
20599 // penalties) with cast ops.
20600 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
20601 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
20602 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
20603 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
20604 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
20605}
20606
// Lower vXi64 SINT/UINT_TO_FP (strict or not). With AVX512DQ but no VLX the
// source is widened to v8i64 and converted with the native 512-bit
// instruction. Otherwise only unsigned vXi64 -> v4f32 is handled here, by
// scalarizing with a halve/convert/double trick for lanes whose sign bit is
// set; other cases return an empty SDValue.
20608 SelectionDAG &DAG,
20609 const X86Subtarget &Subtarget) {
20610 bool IsStrict = Op->isStrictFPOpcode();
20611 MVT VT = Op->getSimpleValueType(0);
20612 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
20613
20614 if (Subtarget.hasDQI()) {
20615 assert(!Subtarget.hasVLX() && "Unexpected features");
20616
20617 assert((Src.getSimpleValueType() == MVT::v2i64 ||
20618 Src.getSimpleValueType() == MVT::v4i64) &&
20619 "Unsupported custom type");
20620
20621 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
20622 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
20623 "Unexpected VT!");
20624 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20625
20626 // Need to concat with zero vector for strict fp to avoid spurious
20627 // exceptions.
20628 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
20629 : DAG.getUNDEF(MVT::v8i64);
20630 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
20631 DAG.getVectorIdxConstant(0, DL));
20632 SDValue Res, Chain;
20633 if (IsStrict) {
20634 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
20635 {Op->getOperand(0), Src});
20636 Chain = Res.getValue(1);
20637 } else {
20638 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
20639 }
20640
// Take the low part of the widened result back down to VT.
20641 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20642 DAG.getVectorIdxConstant(0, DL));
20643
20644 if (IsStrict)
20645 return DAG.getMergeValues({Res, Chain}, DL);
20646 return Res;
20647 }
20648
20649 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
20650 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
20651 if (VT != MVT::v4f32 || IsSigned)
20652 return SDValue();
20653
20654 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20655 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
// For lanes with the MSB set (negative when viewed as signed), halve the
// value while ORing the lost low bit back in so the final doubling rounds
// to the correct result; convert as signed, then double with FADD.
20656 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20657 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20658 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20659 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20660 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20661 SmallVector<SDValue, 4> SignCvts(4);
20662 SmallVector<SDValue, 4> Chains(4);
// Scalarize: convert each i64 lane individually.
20663 for (int i = 0; i != 4; ++i) {
20664 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20665 DAG.getVectorIdxConstant(i, DL));
20666 if (IsStrict) {
20667 SignCvts[i] =
20668 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20669 {Op.getOperand(0), Elt});
20670 Chains[i] = SignCvts[i].getValue(1);
20671 } else {
20672 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20673 }
20674 }
20675 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20676
20677 SDValue Slow, Chain;
20678 if (IsStrict) {
20679 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20680 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20681 {Chain, SignCvt, SignCvt});
20682 Chain = Slow.getValue(1);
20683 } else {
20684 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20685 }
20686
// Per-lane select: doubled (halved-input) result for MSB-set lanes, direct
// conversion otherwise.
20687 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20688 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20689
20690 if (IsStrict)
20691 return DAG.getMergeValues({Cvt, Chain}, DL);
20692
20693 return Cvt;
20694}
20695
// Promote an int-to-fp op whose FP result type is not natively supported
// (soft f16 — see the isSoftF16 callers): do the conversion with an f32
// (scalar or element) result and FP_ROUND it down to the requested type,
// preserving the chain for strict ops.
20697 SelectionDAG &DAG) {
20698 bool IsStrict = Op->isStrictFPOpcode();
20699 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20700 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20701 MVT VT = Op.getSimpleValueType();
20702 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20703
20704 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20705 if (IsStrict)
20706 return DAG.getNode(
20707 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20708 {Chain,
20709 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20710 Rnd});
20711 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20712 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20713}
20714
20715static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20716 const X86Subtarget &Subtarget) {
20717 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20718 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20719 return true;
20720 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20721 return true;
20722 }
20723 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20724 return true;
20725 if (Subtarget.useAVX512Regs()) {
20726 if (VT == MVT::v16i32)
20727 return true;
20728 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20729 return true;
20730 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20731 return true;
20732 }
20733 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20734 (VT == MVT::v2i64 || VT == MVT::v4i64))
20735 return true;
20736 return false;
20737}
20738
// Lower ISD::SINT_TO_FP / STRICT_SINT_TO_FP. Tries, in order: soft-f16
// promotion via f32, already-legal conversions, the Win64 i128 path,
// vectorized extract+cast folds, special vector cases, AVX512DQ/FP16 packed
// tricks for scalar i64 on 32-bit targets, i16 promotion, and finally the
// x87 FILD through a stack slot.
20739SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20740 SelectionDAG &DAG) const {
20741 bool IsStrict = Op->isStrictFPOpcode();
20742 unsigned OpNo = IsStrict ? 1 : 0;
20743 SDValue Src = Op.getOperand(OpNo);
20744 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20745 MVT SrcVT = Src.getSimpleValueType();
20746 MVT VT = Op.getSimpleValueType();
20747 SDLoc dl(Op);
20748
20749 if (isSoftF16(VT, Subtarget))
20750 return promoteXINT_TO_FP(Op, dl, DAG);
20751 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20752 return Op;
20753
20754 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20755 return LowerWin64_INT128_TO_FP(Op, DAG);
20756
20757 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20758 return Extract;
20759
20760 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20761 return R;
20762
20763 if (SrcVT.isVector()) {
20764 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20765 // Note: Since v2f64 is a legal type. We don't need to zero extend the
20766 // source for strict FP.
20767 if (IsStrict)
20768 return DAG.getNode(
20769 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20770 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20771 DAG.getUNDEF(SrcVT))});
20772 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20773 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20774 DAG.getUNDEF(SrcVT)));
20775 }
20776 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20777 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20778
20779 return SDValue();
20780 }
20781
20782 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20783 "Unknown SINT_TO_FP to lower!");
20784
20785 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20786
20787 // These are really Legal; return the operand so the caller accepts it as
20788 // Legal.
20789 if (SrcVT == MVT::i32 && UseSSEReg)
20790 return Op;
20791 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20792 return Op;
20793
20794 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20795 return V;
20796 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20797 return V;
20798
20799 // SSE doesn't have an i16 conversion so we need to promote.
20800 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20801 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20802 if (IsStrict)
20803 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20804 {Chain, Ext});
20805
20806 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20807 }
20808
// No x87: nothing more we can do here; f128 is handled elsewhere.
20809 if (VT == MVT::f128 || !Subtarget.hasX87())
20810 return SDValue();
20811
// Fallback: spill the integer to a stack slot and convert with x87 FILD.
20812 SDValue ValueToStore = Src;
20813 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20814 // Bitcasting to f64 here allows us to do a single 64-bit store from
20815 // an SSE register, avoiding the store forwarding penalty that would come
20816 // with two 32-bit stores.
20817 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20818
20819 unsigned Size = SrcVT.getStoreSize();
20820 Align Alignment(Size);
20821 MachineFunction &MF = DAG.getMachineFunction();
20822 auto PtrVT = getPointerTy(MF.getDataLayout());
20823 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20824 MachinePointerInfo MPI =
20826 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20827 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20828 std::pair<SDValue, SDValue> Tmp =
20829 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20830
20831 if (IsStrict)
20832 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20833
20834 return Tmp.first;
20835}
20836
// Emit an X86ISD::FILD that loads an SrcVT integer from [Pointer] and
// produces a DstVT FP value. Since FILD only produces an x87 value, when
// DstVT lives in SSE registers the x87 result (f80) is stored to a temporary
// stack slot with FST and reloaded as DstVT. Returns {result, chain}.
20837std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20838 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20839 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20840 // Build the FILD
20841 SDVTList Tys;
20842 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20843 if (useSSE)
20844 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20845 else
20846 Tys = DAG.getVTList(DstVT, MVT::Other);
20847
20848 SDValue FILDOps[] = {Chain, Pointer};
20849 SDValue Result =
20850 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20851 Alignment, MachineMemOperand::MOLoad);
20852 Chain = Result.getValue(1);
20853
20854 if (useSSE) {
// Round-trip the x87 result through memory to get it into an SSE register:
// FST to a fresh stack slot, then an ordinary load of DstVT.
20856 unsigned SSFISize = DstVT.getStoreSize();
20857 int SSFI =
20858 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20859 auto PtrVT = getPointerTy(MF.getDataLayout());
20860 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20861 Tys = DAG.getVTList(MVT::Other);
20862 SDValue FSTOps[] = {Chain, Result, StackSlot};
20865 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20866
20867 Chain =
20868 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20869 Result = DAG.getLoad(
20870 DstVT, DL, Chain, StackSlot,
20872 Chain = Result.getValue(1);
20873 }
20874
20875 return { Result, Chain };
20876}
20877
20878/// Horizontal vector math instructions may be slower than normal math with
20879/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20880/// implementation, and likely shuffle complexity of the alternate sequence.
20881static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20882 const X86Subtarget &Subtarget) {
20883 bool IsOptimizingSize = DAG.shouldOptForSize();
20884 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20885 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20886}
20887
20888/// 64-bit unsigned integer to double expansion.
/// Splits the u64 into two halves, each biased into the mantissa of a large
/// power-of-two double (2^52 for the low half, 2^84 for the high half),
/// subtracts the biases, and sums the two partial doubles.
20890 SelectionDAG &DAG,
20891 const X86Subtarget &Subtarget) {
20892 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20893 // when converting 0 when rounding toward negative infinity. Caller will
20894 // fall back to Expand for when i64 or is legal or use FILD in 32-bit mode.
20895 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20896 // This algorithm is not obvious. Here it is what we're trying to output:
20897 /*
20898 movq %rax, %xmm0
20899 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20900 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20901 #ifdef __SSE3__
20902 haddpd %xmm0, %xmm0
20903 #else
20904 pshufd $0x4e, %xmm0, %xmm1
20905 addpd %xmm1, %xmm0
20906 #endif
20907 */
20908
20909 LLVMContext *Context = DAG.getContext();
20910
20911 // Build some magic constants.
20912 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20913 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20914 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20915 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20916
20918 CV1.push_back(
20919 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20920 APInt(64, 0x4330000000000000ULL))));
20921 CV1.push_back(
20922 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20923 APInt(64, 0x4530000000000000ULL))));
20924 Constant *C1 = ConstantVector::get(CV1);
20925 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20926
20927 // Load the 64-bit value into an XMM register.
20928 SDValue XR1 =
20929 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20930 SDValue CLod0 = DAG.getLoad(
20931 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
// Interleave the two u32 halves of the input with the exponent words from
// c0, forming the two biased doubles in one register.
20933 SDValue Unpck1 =
20934 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20935
20936 SDValue CLod1 = DAG.getLoad(
20937 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20939 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20940 // TODO: Are there any fast-math-flags to propagate here?
20941 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20942 SDValue Result;
20943
// Sum the two partial doubles: horizontal add when profitable, otherwise
// shuffle the high lane down and add.
20944 if (Subtarget.hasSSE3() &&
20945 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20946 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20947 } else {
20948 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20949 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20950 }
20951 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20952 DAG.getVectorIdxConstant(0, dl));
20953 return Result;
20954}
20955
20956/// 32-bit unsigned integer to float expansion.
/// The u32 is OR'ed into the mantissa of the double 2^52 (0x4330...), giving
/// exactly 2^52 + x; subtracting the 2^52 bias yields x as an f64, which is
/// then rounded/extended to the requested result type.
20958 SelectionDAG &DAG,
20959 const X86Subtarget &Subtarget) {
20960 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20961 // FP constant to bias correct the final result.
20962 SDValue Bias = DAG.getConstantFP(
20963 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
20964
20965 // Load the 32-bit value into an XMM register.
20966 SDValue Load =
20967 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo))
20968
20969 // Zero out the upper parts of the register.
20970 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20971
20972 // Or the load with the bias.
20973 SDValue Or = DAG.getNode(
20974 ISD::OR, dl, MVT::v2i64,
20975 DAG.getBitcast(MVT::v2i64, Load),
20976 DAG.getBitcast(MVT::v2i64,
20977 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20978 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20979 DAG.getBitcast(MVT::v2f64, Or),
20980 DAG.getVectorIdxConstant(0, dl));
20981
20982 if (Op.getNode()->isStrictFPOpcode()) {
20983 // Subtract the bias.
20984 // TODO: Are there any fast-math-flags to propagate here?
20985 SDValue Chain = Op.getOperand(0);
20986 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20987 {Chain, Or, Bias});
20988
20989 if (Op.getValueType() == Sub.getValueType())
20990 return Sub;
20991
20992 // Handle final rounding.
20993 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20994 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20995
20996 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20997 }
20998
20999 // Subtract the bias.
21000 // TODO: Are there any fast-math-flags to propagate here?
21001 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
21002
21003 // Handle final rounding.
21004 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
21005}
21006
// Lower v2i32 UINT_TO_FP -> v2f64. With AVX512VL, widen to v4i32 and use
// VCVTUDQ2PD directly; with AVX512 but no VLX, only strict ops are widened
// here (the non-strict case is left to generic legalization); otherwise fall
// back to the 2^52 mantissa-bias trick described below.
21008 SelectionDAG &DAG,
21009 const X86Subtarget &Subtarget) {
21010 if (Op.getSimpleValueType() != MVT::v2f64)
21011 return SDValue();
21012
21013 bool IsStrict = Op->isStrictFPOpcode();
21014
21015 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
21016 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
21017
21018 if (Subtarget.hasAVX512()) {
21019 if (!Subtarget.hasVLX()) {
21020 // Let generic type legalization widen this.
21021 if (!IsStrict)
21022 return SDValue();
21023 // Otherwise pad the integer input with 0s and widen the operation.
21024 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21025 DAG.getConstant(0, DL, MVT::v2i32));
21026 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
21027 {Op.getOperand(0), N0});
21028 SDValue Chain = Res.getValue(1);
21029 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
21030 DAG.getVectorIdxConstant(0, DL));
21031 return DAG.getMergeValues({Res, Chain}, DL);
21032 }
21033
21034 // Legalize to v4i32 type.
21035 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21036 DAG.getUNDEF(MVT::v2i32));
21037 if (IsStrict)
21038 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
21039 {Op.getOperand(0), N0});
21040 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
21041 }
21042
21043 // Zero extend to 2i64, OR with the floating point representation of 2^52.
21044 // This gives us the floating point equivalent of 2^52 + the i32 integer
21045 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
21046 // point leaving just our i32 integers in double format.
21047 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
21048 SDValue VBias = DAG.getConstantFP(
21049 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
21050 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
21051 DAG.getBitcast(MVT::v2i64, VBias));
21052 Or = DAG.getBitcast(MVT::v2f64, Or);
21053
21054 if (IsStrict)
21055 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
21056 {Op.getOperand(0), Or, VBias});
21057 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
21058}
21059
21061 SelectionDAG &DAG,
21062 const X86Subtarget &Subtarget) {
// Lowers (STRICT_)UINT_TO_FP for v4i32/v8i32 sources. Three strategies:
// AVX512 widening, an AVX v4i32->v4f64 bias trick, and a generic
// blend/shift split for f32 results (detailed in the comment block below).
21063 bool IsStrict = Op->isStrictFPOpcode();
21064 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
21065 MVT VecIntVT = V.getSimpleValueType();
21066 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
21067 "Unsupported custom type");
21068
21069 if (Subtarget.hasAVX512()) {
21070 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
21071 assert(!Subtarget.hasVLX() && "Unexpected features")
21072 MVT VT = Op->getSimpleValueType(0);
21073
21074 // v8i32->v8f64 is legal with AVX512 so just return it.
21075 if (VT == MVT::v8f64)
21076 return Op;
21077
21078 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
21079 VT == MVT::v8f16) &&
21080 "Unexpected VT!");
21081 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
21082 MVT WideIntVT = MVT::v16i32;
21083 if (VT == MVT::v4f64) {
21084 WideVT = MVT::v8f64;
21085 WideIntVT = MVT::v8i32;
21086 }
21087
21088 // Need to concat with zero vector for strict fp to avoid spurious
21089 // exceptions.
21090 SDValue Tmp =
21091 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
21092 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
21093 DAG.getVectorIdxConstant(0, DL));
21094 SDValue Res, Chain;
21095 if (IsStrict) {
21096 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
21097 {Op->getOperand(0), V});
21098 Chain = Res.getValue(1);
21099 } else {
21100 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
21101 }
21102
// Narrow the 512-bit result back down to the requested type.
21103 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21104 DAG.getVectorIdxConstant(0, DL));
21105
21106 if (IsStrict)
21107 return DAG.getMergeValues({Res, Chain}, DL);
21108 return Res;
21109 }
21110
// AVX-only v4i32->v4f64: use the 2^52 bias trick (see lowerUINT_TO_FP_v2i32)
// with the bias broadcast-loaded from the constant pool.
21111 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
21112 Op->getSimpleValueType(0) == MVT::v4f64) {
21113 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
// 0x4330000000000000 is the double bit pattern of 2^52.
21114 Constant *Bias = ConstantFP::get(
21115 *DAG.getContext(),
21116 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
21117 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21118 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
21119 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
21120 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
21121 SDValue VBias = DAG.getMemIntrinsicNode(
21122 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
21125
21126 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
21127 DAG.getBitcast(MVT::v4i64, VBias));
21128 Or = DAG.getBitcast(MVT::v4f64, Or);
21129
21130 if (IsStrict)
21131 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
21132 {Op.getOperand(0), Or, VBias});
21133 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
21134 }
21135
21136 // The algorithm is the following:
21137 // #ifdef __SSE4_1__
21138 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21139 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21140 // (uint4) 0x53000000, 0xaa);
21141 // #else
21142 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21143 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21144 // #endif
21145 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21146 // return (float4) lo + fhi;
21147
21148 bool Is128 = VecIntVT == MVT::v4i32;
21149 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
21150 // If we convert to something else than the supported type, e.g., to v4f64,
21151 // abort early.
21152 if (VecFloatVT != Op->getSimpleValueType(0))
21153 return SDValue();
21154
21155 // In the #idef/#else code, we have in common:
21156 // - The vector of constants:
21157 // -- 0x4b000000
21158 // -- 0x53000000
21159 // - A shift:
21160 // -- v >> 16
21161
21162 // Create the splat vector for 0x4b000000.
21163 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
21164 // Create the splat vector for 0x53000000.
21165 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
21166
21167 // Create the right shift.
21168 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
21169 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
21170
21171 SDValue Low, High;
21172 if (Subtarget.hasSSE41()) {
21173 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
21174 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21175 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
21176 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
21177 // Low will be bitcasted right away, so do not bother bitcasting back to its
21178 // original type.
// Blend mask 0xaa selects the odd i16 lanes (the high halves) from the
// constant, keeping the low 16 bits of each i32 from V.
21179 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
21180 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21181 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21182 // (uint4) 0x53000000, 0xaa);
21183 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
21184 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
21185 // High will be bitcasted right away, so do not bother bitcasting back to
21186 // its original type.
21187 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
21188 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21189 } else {
21190 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
21191 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21192 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
21193 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
21194
21195 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21196 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
21197 }
21198
21199 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
// 0x53000080 is the single-precision bit pattern of 2^39 + 2^23.
21200 SDValue VecCstFSub = DAG.getConstantFP(
21201 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
21202
21203 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21204 // NOTE: By using fsub of a positive constant instead of fadd of a negative
21205 // constant, we avoid reassociation in MachineCombiner when reassoc is
21206 // enabled. See PR24512.
21207 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
21208 // TODO: Are there any fast-math-flags to propagate here?
21209 // (float4) lo;
21210 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
21211 // return (float4) lo + fhi;
21212 if (IsStrict) {
// Thread the chain: FSUB consumes the incoming chain, FADD consumes
// FSUB's output chain so the two strict ops stay ordered.
21213 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
21214 {Op.getOperand(0), HighBitcast, VecCstFSub});
21215 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
21216 {FHigh.getValue(1), LowBitcast, FHigh});
21217 }
21218
21219 SDValue FHigh =
21220 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
21221 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
21222}
21223
21225 const X86Subtarget &Subtarget) {
// Dispatches vector (STRICT_)UINT_TO_FP lowering by source element type to
// the specialized helpers above.
21226 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21227 SDValue N0 = Op.getOperand(OpNo);
21228 MVT SrcVT = N0.getSimpleValueType();
21229
21230 switch (SrcVT.SimpleTy) {
21231 default:
21232 llvm_unreachable("Custom UINT_TO_FP is not supported!");
21233 case MVT::v2i32:
21234 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
21235 case MVT::v4i32:
21236 case MVT::v8i32:
21237 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
21238 case MVT::v2i64:
21239 case MVT::v4i64:
// i64 sources share one signed/unsigned helper.
21240 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
21241 }
21242}
21243
// Top-level custom lowering for (STRICT_)UINT_TO_FP. Tries, in order:
// soft-f16 promotion, already-legal conversions, various specialized
// helpers, zext-to-i64 + signed conversion, and finally the generic
// x87 FILD sequence with a sign-dependent fudge-factor add.
21244SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
21245 SelectionDAG &DAG) const {
21246 bool IsStrict = Op->isStrictFPOpcode();
21247 unsigned OpNo = IsStrict ? 1 : 0;
21248 SDValue Src = Op.getOperand(OpNo);
21249 SDLoc dl(Op);
21250 auto PtrVT = getPointerTy(DAG.getDataLayout());
21251 MVT SrcVT = Src.getSimpleValueType();
21252 MVT DstVT = Op->getSimpleValueType(0);
21253 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21254
21255 // Bail out when we don't have native conversion instructions.
21256 if (DstVT == MVT::f128)
21257 return SDValue();
21258
21259 if (isSoftF16(DstVT, Subtarget))
21260 return promoteXINT_TO_FP(Op, dl, DAG);
21261 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
21262 return Op;
21263
21264 if (SDValue V = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
21265 return V;
21266
21267 if (DstVT.isVector())
21268 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
21269
21270 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21271 return LowerWin64_INT128_TO_FP(Op, DAG);
21272
21273 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
21274 return Extract;
21275
21276 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
21277 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
21278 // Conversions from unsigned i32 to f32/f64 are legal,
21279 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
21280 return Op;
21281 }
21282
21283 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
// A zero-extended i32 always fits in the positive range of i64, so the
// signed conversion is exact.
21284 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
21285 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
21286 if (IsStrict)
21287 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
21288 {Chain, Src});
21289 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
21290 }
21291
21292 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
21293 return V;
21294 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
21295 return V;
21296
21297 // The transform for i64->f64 isn't correct for 0 when rounding to negative
21298 // infinity. It produces -0.0, so disable under strictfp.
21299 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
21300 !IsStrict)
21301 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
21302 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
21303 // negative infinity. So disable under strictfp. Using FILD instead.
21304 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
21305 !IsStrict)
21306 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
21307 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
21308 (DstVT == MVT::f32 || DstVT == MVT::f64))
21309 return SDValue();
21310
21311 // Make a 64-bit buffer, and use it to build an FILD.
21312 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
21313 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
21314 Align SlotAlign(8);
21315 MachinePointerInfo MPI =
21317 if (SrcVT == MVT::i32) {
// Store the i32 in the low half and zero the high half so the 64-bit
// FILD reads the unsigned value as a non-negative i64.
21318 SDValue OffsetSlot =
21319 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
21320 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
21321 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
21322 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
21323 std::pair<SDValue, SDValue> Tmp =
21324 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
21325 if (IsStrict)
21326 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21327
21328 return Tmp.first;
21329 }
21330
21331 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
21332 SDValue ValueToStore = Src;
21333 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
21334 // Bitcasting to f64 here allows us to do a single 64-bit store from
21335 // an SSE register, avoiding the store forwarding penalty that would come
21336 // with two 32-bit stores.
21337 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21338 }
21339 SDValue Store =
21340 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
21341 // For i64 source, we need to add the appropriate power of 2 if the input
21342 // was negative. We must be careful to do the computation in x87 extended
21343 // precision, not in SSE.
21344 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21345 SDValue Ops[] = {Store, StackSlot};
21346 SDValue Fild =
21347 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
21348 SlotAlign, MachineMemOperand::MOLoad);
21349 Chain = Fild.getValue(1);
21350
21351 // Check whether the sign bit is set.
21352 SDValue SignSet = DAG.getSetCC(
21353 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
21354 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
21355
21356 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
// 0x5F800000 is the f32 bit pattern of 2^64 — the correction added when
// the FILD interpreted the input as negative.
21357 APInt FF(64, 0x5F80000000000000ULL);
21358 SDValue FudgePtr =
21359 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
21360 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
21361
21362 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
21363 SDValue Zero = DAG.getIntPtrConstant(0, dl);
21364 SDValue Four = DAG.getIntPtrConstant(4, dl);
21365 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
21366 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
21367
21368 // Load the value out, extending it from f32 to f80.
21369 SDValue Fudge = DAG.getExtLoad(
21370 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
21372 CPAlignment);
21373 Chain = Fudge.getValue(1);
21374 // Extend everything to 80 bits to force it to be done on x87.
21375 // TODO: Are there any fast-math-flags to propagate here?
21376 if (IsStrict) {
21377 unsigned Opc = ISD::STRICT_FADD;
21378 // Windows needs the precision control changed to 80bits around this add.
21379 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
21380 Opc = X86ISD::STRICT_FP80_ADD;
21381
21382 SDValue Add =
21383 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
21384 // STRICT_FP_ROUND can't handle equal types.
21385 if (DstVT == MVT::f80)
21386 return Add;
21387 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
21388 {Add.getValue(1), Add,
21389 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
21390 }
21391 unsigned Opc = ISD::FADD;
21392 // Windows needs the precision control changed to 80bits around this add.
21393 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
21394 Opc = X86ISD::FP80_ADD;
21395
21396 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
21397 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
21398 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
21399}
21400
21401// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
21402// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
21403// just return an SDValue().
21404// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
21405// to i16, i32 or i64, and we lower it to a legal sequence and return the
21406// result.
21407SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
21408 bool IsSigned,
21409 SDValue &Chain) const {
// Lowers scalar FP->int via an x87 FIST-to-memory sequence plus a reload.
// Unsigned i64 results get a threshold-subtract + XOR fixup (see the long
// comment below). Chain is an out-parameter: it is set to the final chain.
21410 bool IsStrict = Op->isStrictFPOpcode();
21411 SDLoc DL(Op);
21412
21413 EVT DstTy = Op.getValueType();
21414 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
21415 EVT TheVT = Value.getValueType();
21416 auto PtrVT = getPointerTy(DAG.getDataLayout());
21417
21418 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
21419 // f16 must be promoted before using the lowering in this routine.
21420 // fp128 does not use this lowering.
21421 return SDValue();
21422 }
21423
21424 // If using FIST to compute an unsigned i64, we'll need some fixup
21425 // to handle values above the maximum signed i64. A FIST is always
21426 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
21427 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
21428
21429 // FIXME: This does not generate an invalid exception if the input does not
21430 // fit in i32. PR44019
21431 if (!IsSigned && DstTy != MVT::i64) {
21432 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
21433 // The low 32 bits of the fist result will have the correct uint32 result.
21434 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
21435 DstTy = MVT::i64;
21436 }
21437
21438 assert(DstTy.getSimpleVT() <= MVT::i64 &&
21439 DstTy.getSimpleVT() >= MVT::i16 &&
21440 "Unknown FP_TO_INT to lower!");
21441
21442 // We lower FP->int64 into FISTP64 followed by a load from a temporary
21443 // stack slot.
21444 MachineFunction &MF = DAG.getMachineFunction();
21445 unsigned MemSize = DstTy.getStoreSize();
21446 int SSFI =
21447 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
21448 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21449
21450 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21451
21452 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
21453
21454 if (UnsignedFixup) {
21455 //
21456 // Conversion to unsigned i64 is implemented with a select,
21457 // depending on whether the source value fits in the range
21458 // of a signed i64. Let Thresh be the FP equivalent of
21459 // 0x8000000000000000ULL.
21460 //
21461 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
21462 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
21463 // FistSrc = (Value - FltOfs);
21464 // Fist-to-mem64 FistSrc
21465 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
21466 // to XOR'ing the high 32 bits with Adjust.
21467 //
21468 // Being a power of 2, Thresh is exactly representable in all FP formats.
21469 // For X87 we'd like to use the smallest FP type for this constant, but
21470 // for DAG type consistency we have to match the FP operand type.
21471
// 0x5f000000 is the f32 bit pattern of 2^63 (the i64 sign threshold).
21472 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
21473 [[maybe_unused]] APFloat::opStatus Status = APFloat::opOK;
21474 bool LosesInfo = false;
21475 if (TheVT == MVT::f64)
21476 // The rounding mode is irrelevant as the conversion should be exact.
21477 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
21478 &LosesInfo);
21479 else if (TheVT == MVT::f80)
21480 Status = Thresh.convert(APFloat::x87DoubleExtended(),
21481 APFloat::rmNearestTiesToEven, &LosesInfo);
21482
21483 assert(Status == APFloat::opOK && !LosesInfo &&
21484 "FP conversion should have been exact");
21485
21486 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
21487
21488 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
21489 *DAG.getContext(), TheVT);
21490 SDValue Cmp;
21491 if (IsStrict) {
// Signaling compare so strict FP sees an invalid exception on NaN input.
21492 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
21493 /*IsSignaling*/ true);
21494 Chain = Cmp.getValue(1);
21495 } else {
21496 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
21497 }
21498
21499 // Our preferred lowering of
21500 //
21501 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
21502 //
21503 // is
21504 //
21505 // (Value >= Thresh) << 63
21506 //
21507 // but since we can get here after LegalOperations, DAGCombine might do the
21508 // wrong thing if we create a select. So, directly create the preferred
21509 // version.
21510 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
21511 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
21512 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
21513
21514 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
21515 DAG.getConstantFP(0.0, DL, TheVT));
21516
21517 if (IsStrict) {
21518 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
21519 { Chain, Value, FltOfs });
21520 Chain = Value.getValue(1);
21521 } else
21522 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
21523 }
21524
21525 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
21526
21527 // FIXME This causes a redundant load/store if the SSE-class value is already
21528 // in memory, such as if it is on the callstack.
21529 if (isScalarFPTypeInSSEReg(TheVT)) {
// SSE values must be spilled and reloaded via FLD to reach the x87 stack
// before the FIST can consume them.
21530 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
21531 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
21532 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21533 SDValue Ops[] = { Chain, StackSlot };
21534
21535 unsigned FLDSize = TheVT.getStoreSize();
21536 assert(FLDSize <= MemSize && "Stack slot not big enough");
21537 MachineMemOperand *MMO = MF.getMachineMemOperand(
21538 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
21539 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
21540 Chain = Value.getValue(1);
21541 }
21542
21543 // Build the FP_TO_INT*_IN_MEM
21544 MachineMemOperand *MMO = MF.getMachineMemOperand(
21545 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
21546 SDValue Ops[] = { Chain, Value, StackSlot };
21547 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
21548 DAG.getVTList(MVT::Other),
21549 Ops, DstTy, MMO);
21550
21551 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
21552 Chain = Res.getValue(1);
21553
21554 // If we need an unsigned fixup, XOR the result with adjust.
21555 if (UnsignedFixup)
21556 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
21557
21558 return Res;
21559}
21560
21562 const X86Subtarget &Subtarget) {
// Lowers a 256-bit (or v32i16) integer extension. Presumably this is
// LowerAVXExtend handling ZERO_EXTEND/ANY_EXTEND — the dropped assert at
// 21569 would confirm; verify against the full source.
21563 MVT VT = Op.getSimpleValueType();
21564 SDValue In = Op.getOperand(0);
21565 MVT InVT = In.getSimpleValueType();
21566 unsigned Opc = Op.getOpcode();
21567
21568 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
21570 "Unexpected extension opcode");
21572 "Expected same number of elements");
21573 assert((VT.getVectorElementType() == MVT::i16 ||
21574 VT.getVectorElementType() == MVT::i32 ||
21575 VT.getVectorElementType() == MVT::i64) &&
21576 "Unexpected element type");
21577 assert((InVT.getVectorElementType() == MVT::i8 ||
21578 InVT.getVectorElementType() == MVT::i16 ||
21579 InVT.getVectorElementType() == MVT::i32) &&
21580 "Unexpected element type");
21581
21582 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
21583
// v32i16 without BWI can't be done in one op; split into two v16i16 halves.
21584 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
21585 assert(InVT == MVT::v32i8 && "Unexpected VT!");
21586 return splitVectorIntUnary(Op, DAG, dl);
21587 }
21588
// AVX2 has native 256-bit extends; leave the node for isel.
21589 if (Subtarget.hasInt256())
21590 return Op;
21591
21592 // Optimize vectors in AVX mode:
21593 //
21594 // v8i16 -> v8i32
21595 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
21596 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
21597 // Concat upper and lower parts.
21598 //
21599 // v4i32 -> v4i64
21600 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
21601 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
21602 // Concat upper and lower parts.
21603 //
21604 MVT HalfVT = VT.getHalfNumVectorElementsVT();
21605 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
21606
21607 // Short-circuit if we can determine that each 128-bit half is the same value.
21608 // Otherwise, this is difficult to match and optimize.
21609 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
21610 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
21611 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
21612
// For the upper half: unpack-high against zero zero-extends; against undef
// it any-extends (the garbage lanes are unspecified anyway).
21613 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
21614 SDValue Undef = DAG.getUNDEF(InVT);
21615 bool NeedZero = Opc == ISD::ZERO_EXTEND;
21616 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
21617 OpHi = DAG.getBitcast(HalfVT, OpHi);
21618
21619 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
21620}
21621
21622// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
21623static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
21624