1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Intrinsics.h"
54#include "llvm/MC/MCAsmInfo.h"
55#include "llvm/MC/MCContext.h"
56#include "llvm/MC/MCExpr.h"
57#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/Debug.h"
64#include <algorithm>
65#include <bitset>
66#include <cctype>
67#include <numeric>
68using namespace llvm;
69
70#define DEBUG_TYPE "x86-isel"
71
73 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
75 "Sets the preferable loop alignment for experiments (as log2 bytes) "
76 "for innermost loops only. If specified, this option overrides "
77 "alignment set by x86-experimental-pref-loop-alignment."),
79
81 "x86-br-merging-base-cost", cl::init(2),
83 "Sets the cost threshold for when multiple conditionals will be merged "
84 "into one branch versus be split in multiple branches. Merging "
85 "conditionals saves branches at the cost of additional instructions. "
86 "This value sets the instruction cost limit, below which conditionals "
87 "will be merged, and above which conditionals will be split. Set to -1 "
88 "to never merge branches."),
90
92 "x86-br-merging-likely-bias", cl::init(0),
93 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
94 "that all conditionals will be executed. For example for merging "
95 "the conditionals (a == b && c > d), if its known that a == b is "
96 "likely, then it is likely that if the conditionals are split "
97 "both sides will be executed, so it may be desirable to increase "
98 "the instruction cost threshold. Set to -1 to never merge likely "
99 "branches."),
100 cl::Hidden);
101
103 "x86-br-merging-unlikely-bias", cl::init(-1),
104 cl::desc(
105 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "unlikely, then it is unlikely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to decrease "
110 "the instruction cost threshold. Set to -1 to never merge unlikely "
111 "branches."),
112 cl::Hidden);
113
115 "mul-constant-optimization", cl::init(true),
116 cl::desc("Replace 'mul x, Const' with more effective instructions like "
117 "SHIFT, LEA, etc."),
118 cl::Hidden);
119
121 const X86Subtarget &STI)
122 : TargetLowering(TM), Subtarget(STI) {
123 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
124 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
125
126 // Set up the TargetLowering object.
127
128 // X86 is weird. It always uses i8 for shift amounts and setcc results.
130 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
132
133 // For 64-bit, since we have so many registers, use the ILP scheduler.
134 // For 32-bit, use the register pressure specific scheduling.
135 // For Atom, always use ILP scheduling.
136 if (Subtarget.isAtom())
138 else if (Subtarget.is64Bit())
140 else
142 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
144
145 // Bypass expensive divides and use cheaper ones.
146 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
147 if (Subtarget.hasSlowDivide32())
148 addBypassSlowDiv(32, 8);
149 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
150 addBypassSlowDiv(64, 32);
151 }
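  // addBypassSlowDiv(64, 32) requests a runtime bypass: a 64-bit divide whose
  // operands happen to fit in 32 bits is dispatched to the much cheaper
  // 32-bit DIV, roughly
  //   if (((a | b) >> 32) == 0) use 32-bit DIV; else use 64-bit DIV;
  // (sketch only; the actual check is emitted by the slow-division bypass
  // utility during codegen preparation).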
152
153 // Setup Windows compiler runtime calls.
154 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
155 static const struct {
156 const RTLIB::Libcall Op;
157 const char * const Name;
158 const CallingConv::ID CC;
159 } LibraryCalls[] = {
160 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
161 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
162 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
163 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
164 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
165 };
166
167 for (const auto &LC : LibraryCalls) {
168 setLibcallName(LC.Op, LC.Name);
169 setLibcallCallingConv(LC.Op, LC.CC);
170 }
171 }
172
173 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
174 // MSVCRT doesn't have powi; fall back to pow
175 setLibcallName(RTLIB::POWI_F32, nullptr);
176 setLibcallName(RTLIB::POWI_F64, nullptr);
177 }
178
179 if (Subtarget.canUseCMPXCHG16B())
181 else if (Subtarget.canUseCMPXCHG8B())
183 else
185
186 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
187
189
190 // Set up the register classes.
191 addRegisterClass(MVT::i8, &X86::GR8RegClass);
192 addRegisterClass(MVT::i16, &X86::GR16RegClass);
193 addRegisterClass(MVT::i32, &X86::GR32RegClass);
194 if (Subtarget.is64Bit())
195 addRegisterClass(MVT::i64, &X86::GR64RegClass);
196
197 for (MVT VT : MVT::integer_valuetypes())
199
200 // We don't accept any truncstore of integer registers.
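  // i.e. a truncating store (for example storing an i32 value into an i16
  // memory slot) is expanded into an explicit truncate of the value followed
  // by a plain narrow store.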
201 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
202 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
203 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
204 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
205 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
206 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
207
208 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
209
210 // SETOEQ and SETUNE require checking two conditions.
211 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
214 }
215
216 // Integer absolute.
217 if (Subtarget.canUseCMOV()) {
218 setOperationAction(ISD::ABS , MVT::i16 , Custom);
219 setOperationAction(ISD::ABS , MVT::i32 , Custom);
220 if (Subtarget.is64Bit())
221 setOperationAction(ISD::ABS , MVT::i64 , Custom);
222 }
223
224 // Absolute difference.
225 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
226 setOperationAction(Op , MVT::i8 , Custom);
227 setOperationAction(Op , MVT::i16 , Custom);
228 setOperationAction(Op , MVT::i32 , Custom);
229 if (Subtarget.is64Bit())
230 setOperationAction(Op , MVT::i64 , Custom);
231 }
232
233 // Signed saturation subtraction.
237 if (Subtarget.is64Bit())
239
240 // Funnel shifts.
241 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
242 // For slow shld targets we only lower for code size.
243 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
244
245 setOperationAction(ShiftOp , MVT::i8 , Custom);
246 setOperationAction(ShiftOp , MVT::i16 , Custom);
247 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
248 if (Subtarget.is64Bit())
249 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
250 }
251
252 if (!Subtarget.useSoftFloat()) {
253 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
254 // operation.
259 // We have an algorithm for SSE2, and we turn this into a 64-bit
260 // FILD or VCVTUSI2SS/SD for other targets.
263 // We have an algorithm for SSE2->double, and we turn this into a
264 // 64-bit FILD followed by conditional FADD for other targets.
267
268 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
269 // this operation.
272 // SSE has no i16 to fp conversion, only i32. We promote in the handler
273 // to allow f80 to use i16 and f64 to use i16 with SSE1 only.
276 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
279 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
280 // are Legal, f80 is custom lowered.
283
284 // Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
285 // this operation.
287 // FIXME: This doesn't generate invalid exception when it should. PR44019.
293 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
294 // are Legal, f80 is custom lowered.
297
298 // Handle FP_TO_UINT by promoting the destination to a larger signed
299 // conversion.
301 // FIXME: This doesn't generate invalid exception when it should. PR44019.
304 // FIXME: This doesn't generate invalid exception when it should. PR44019.
310
315
316 if (!Subtarget.is64Bit()) {
319 }
320 }
321
322 if (Subtarget.hasSSE2()) {
323 // Custom lowering for saturating float to int conversions.
324 // We handle promotion to larger result types manually.
325 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
328 }
329 if (Subtarget.is64Bit()) {
332 }
333 }
334
335 // Handle address space casts between mixed sized pointers.
338
339 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
340 if (!Subtarget.hasSSE2()) {
343 if (Subtarget.is64Bit()) {
345 // Without SSE, i64->f64 goes through memory.
347 }
348 } else if (!Subtarget.is64Bit())
350
351 // Scalar integer divide and remainder are lowered to use operations that
352 // produce two results, to match the available instructions. This exposes
353 // the two-result form to trivial CSE, which is able to combine x/y and x%y
354 // into a single instruction.
355 //
356 // Scalar integer multiply-high is also lowered to use two-result
357 // operations, to match the available instructions. However, plain multiply
358 // (low) operations are left as Legal, as there are single-result
359 // instructions for this in x86. Using the two-result multiply instructions
360 // when both high and low results are needed must be arranged by dagcombine.
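 // For example, with this setup
 //   %q = sdiv i32 %x, %y
 //   %r = srem i32 %x, %y
 // can be CSE'd into a single two-result divide node and selected as one
 // IDIV, which leaves the quotient in EAX and the remainder in EDX
 // (rough sketch of the intent described above).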
361 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
368 }
369
370 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
372 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
373 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
376 }
377 if (Subtarget.is64Bit())
382
383 setOperationAction(ISD::FREM , MVT::f32 , Expand);
384 setOperationAction(ISD::FREM , MVT::f64 , Expand);
385 setOperationAction(ISD::FREM , MVT::f80 , Expand);
386 setOperationAction(ISD::FREM , MVT::f128 , Expand);
387
388 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
394 }
395
396 // Promote the i8 variants and force them on up to i32 which has a shorter
397 // encoding.
398 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
400 // Promote i16 as well. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
401 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
402 // promote that too.
403 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
405
406 if (!Subtarget.hasBMI()) {
407 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
409 if (Subtarget.is64Bit()) {
410 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
412 }
413 }
414
415 if (Subtarget.hasLZCNT()) {
416 // When promoting the i8 variants, force them to i32 for a shorter
417 // encoding.
418 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
420 } else {
421 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
422 if (VT == MVT::i64 && !Subtarget.is64Bit())
423 continue;
426 }
427 }
428
431 // Special handling for half-precision floating point conversions.
432 // If we don't have F16C support, then lower half float conversions
433 // into library calls.
435 Op, MVT::f32,
436 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
437 // There's never any support for operations beyond MVT::f32.
438 setOperationAction(Op, MVT::f64, Expand);
439 setOperationAction(Op, MVT::f80, Expand);
440 setOperationAction(Op, MVT::f128, Expand);
441 }
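 // In other words: with F16C the f16<->f32 conversions can be selected to
 // VCVTPH2PS/VCVTPS2PH, otherwise they become the __extendhfsf2/__truncsfhf2
 // library calls whose names are registered further below.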
442
443 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
446 }
447
448 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
449 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
450 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
451 setTruncStoreAction(VT, MVT::f16, Expand);
452 setTruncStoreAction(VT, MVT::bf16, Expand);
453
456 }
457
461 if (Subtarget.is64Bit())
463 if (Subtarget.hasPOPCNT()) {
464 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
465 // popcntw is longer to encode than popcntl and also has a false dependency
466 // on the dest that popcntl hasn't had since Cannon Lake.
467 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
468 } else {
473 }
474
476
477 if (!Subtarget.hasMOVBE())
479
480 // X86 wants to expand cmov itself.
481 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
486 }
487 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
488 if (VT == MVT::i64 && !Subtarget.is64Bit())
489 continue;
492 }
493
494 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
497
499 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
500 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
504 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
505 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
506
507 // Darwin ABI issue.
508 for (auto VT : { MVT::i32, MVT::i64 }) {
509 if (VT == MVT::i64 && !Subtarget.is64Bit())
510 continue;
517 }
518
519 // 64-bit shl, sra, srl (iff 32-bit x86)
520 for (auto VT : { MVT::i32, MVT::i64 }) {
521 if (VT == MVT::i64 && !Subtarget.is64Bit())
522 continue;
526 }
527
528 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
530
532
533 // Expand certain atomics
534 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
542 }
543
544 if (!Subtarget.is64Bit())
546
547 if (Subtarget.canUseCMPXCHG16B())
549
550 // FIXME - use subtarget debug flags
551 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
552 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
553 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
555 }
556
559
562
563 setOperationAction(ISD::TRAP, MVT::Other, Legal);
565 if (Subtarget.isTargetPS())
567 else
569
570 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
572 setOperationAction(ISD::VAEND , MVT::Other, Expand);
573 bool Is64Bit = Subtarget.is64Bit();
574 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
575 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
576
579
581
582 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
585
587
588 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
589 setOperationAction(ISD::FABS, VT, Action);
590 setOperationAction(ISD::FNEG, VT, Action);
592 setOperationAction(ISD::FREM, VT, Action);
593 setOperationAction(ISD::FMA, VT, Action);
594 setOperationAction(ISD::FMINNUM, VT, Action);
595 setOperationAction(ISD::FMAXNUM, VT, Action);
598 setOperationAction(ISD::FSIN, VT, Action);
599 setOperationAction(ISD::FCOS, VT, Action);
600 setOperationAction(ISD::FSINCOS, VT, Action);
601 setOperationAction(ISD::FSQRT, VT, Action);
602 setOperationAction(ISD::FPOW, VT, Action);
603 setOperationAction(ISD::FLOG, VT, Action);
604 setOperationAction(ISD::FLOG2, VT, Action);
605 setOperationAction(ISD::FLOG10, VT, Action);
606 setOperationAction(ISD::FEXP, VT, Action);
607 setOperationAction(ISD::FEXP2, VT, Action);
608 setOperationAction(ISD::FEXP10, VT, Action);
609 setOperationAction(ISD::FCEIL, VT, Action);
610 setOperationAction(ISD::FFLOOR, VT, Action);
612 setOperationAction(ISD::FRINT, VT, Action);
613 setOperationAction(ISD::BR_CC, VT, Action);
614 setOperationAction(ISD::SETCC, VT, Action);
617 setOperationAction(ISD::FROUND, VT, Action);
619 setOperationAction(ISD::FTRUNC, VT, Action);
620 setOperationAction(ISD::FLDEXP, VT, Action);
621 };
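 // This helper is applied later in this constructor: Promote for scalar f16
 // (the operation is performed in f32 and the result converted back) and
 // Expand for the f16/bf16 vector types that have no native arithmetic.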
622
623 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
624 // f16, f32 and f64 use SSE.
625 // Set up the FP register classes.
626 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
627 : &X86::FR16RegClass);
628 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
629 : &X86::FR32RegClass);
630 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
631 : &X86::FR64RegClass);
632
633 // Disable f32->f64 extload as we can only generate this in one instruction
634 // under optsize. So it's easier to pattern match (fpext (load)) for that
635 // case instead of needing to emit 2 instructions for extload in the
636 // non-optsize case.
637 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
638
639 for (auto VT : { MVT::f32, MVT::f64 }) {
640 // Use ANDPD to simulate FABS.
642
643 // Use XORP to simulate FNEG.
645
646 // Use ANDPD and ORPD to simulate FCOPYSIGN.
648
649 // These might be better off as horizontal vector ops.
652
653 // We don't support sin/cos/fmod
657 }
658
659 // Half type will be promoted by default.
660 setF16Action(MVT::f16, Promote);
668
698
699 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
700 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
701
702 // Lower this to MOVMSK plus an AND.
705
706 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
707 (UseX87 || Is64Bit)) {
708 // Use SSE for f32, x87 for f64.
709 // Set up the FP register classes.
710 addRegisterClass(MVT::f32, &X86::FR32RegClass);
711 if (UseX87)
712 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
713
714 // Use ANDPS to simulate FABS.
716
717 // Use XORP to simulate FNEG.
719
720 if (UseX87)
722
723 // Use ANDPS and ORPS to simulate FCOPYSIGN.
724 if (UseX87)
727
728 // We don't support sin/cos/fmod
732
733 if (UseX87) {
734 // Always expand sin/cos functions even though x87 has an instruction.
738 }
739 } else if (UseX87) {
740 // f32 and f64 in x87.
741 // Set up the FP register classes.
742 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
743 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
744
745 for (auto VT : { MVT::f32, MVT::f64 }) {
748
749 // Always expand sin/cos functions even though x87 has an instruction.
753 }
754 }
755
756 // Expand FP32 immediates into loads from the stack, save special cases.
757 if (isTypeLegal(MVT::f32)) {
758 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
759 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
760 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
761 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
762 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
763 } else // SSE immediates.
764 addLegalFPImmediate(APFloat(+0.0f)); // xorps
765 }
766 // Expand FP64 immediates into loads from the stack, save special cases.
767 if (isTypeLegal(MVT::f64)) {
768 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
769 addLegalFPImmediate(APFloat(+0.0)); // FLD0
770 addLegalFPImmediate(APFloat(+1.0)); // FLD1
771 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
772 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
773 } else // SSE immediates.
774 addLegalFPImmediate(APFloat(+0.0)); // xorpd
775 }
776 // Support fp16 0 immediate.
777 if (isTypeLegal(MVT::f16))
778 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
779
780 // Handle constrained floating-point operations on scalars.
793
794 // We don't support FMA.
797
798 // f80 always uses X87.
799 if (UseX87) {
800 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
803 {
805 addLegalFPImmediate(TmpFlt); // FLD0
806 TmpFlt.changeSign();
807 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
808
809 bool ignored;
810 APFloat TmpFlt2(+1.0);
812 &ignored);
813 addLegalFPImmediate(TmpFlt2); // FLD1
814 TmpFlt2.changeSign();
815 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
816 }
817
818 // Always expand sin/cos functions even though x87 has an instruction.
822
834
835 // Handle constrained floating-point operations on scalars.
841 if (isTypeLegal(MVT::f16)) {
844 } else {
846 }
847 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
848 // as Custom.
850 }
851
852 // f128 uses xmm registers, but most operations require libcalls.
853 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
854 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
855 : &X86::VR128RegClass);
856
857 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
858
869
873
879 // No STRICT_FSINCOS
882
885 // We need to custom handle any FP_ROUND with an f128 input, but
886 // LegalizeDAG uses the result type to know when to run a custom handler.
887 // So we have to list all legal floating point result types here.
888 if (isTypeLegal(MVT::f32)) {
891 }
892 if (isTypeLegal(MVT::f64)) {
895 }
896 if (isTypeLegal(MVT::f80)) {
899 }
900
902
903 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
904 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
905 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
906 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
907 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
908 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
909 }
910
911 // Always use a library call for pow.
912 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
913 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
914 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
915 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
916
925
926 // Some FP actions are always expanded for vector types.
927 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
928 MVT::v4f32, MVT::v8f32, MVT::v16f32,
929 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
942 }
943
944 // First set operation action for all vector types to either promote
945 // (for widening) or expand (for scalarization). Then we will selectively
946 // turn on ones that can be effectively codegen'd.
986 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
987 setTruncStoreAction(InnerVT, VT, Expand);
988
991
992 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
993 // types, we have to deal with them whether we ask for Expansion or not.
994 // Setting Expand causes its own optimisation problems though, so leave
995 // them legal.
996 if (VT.getVectorElementType() == MVT::i1)
997 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
998
999 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1000 // split/scalarized right now.
1001 if (VT.getVectorElementType() == MVT::f16 ||
1002 VT.getVectorElementType() == MVT::bf16)
1003 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1004 }
1005 }
1006
1007 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1008 // with -msoft-float, disable use of MMX as well.
1009 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1010 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1011 // No operations on x86mmx supported, everything uses intrinsics.
1012 }
1013
1014 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1015 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1016 : &X86::VR128RegClass);
1017
1020
1021 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1022 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1029
1030 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1031 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1032
1038 }
1039
1040 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1041 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1042 : &X86::VR128RegClass);
1043
1044 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1045 // registers cannot be used even for integer operations.
1046 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1047 : &X86::VR128RegClass);
1048 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1049 : &X86::VR128RegClass);
1050 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1051 : &X86::VR128RegClass);
1052 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1053 : &X86::VR128RegClass);
1054 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1055 : &X86::VR128RegClass);
1056
1057 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1060 }
1061
1062 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1063 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1068 }
1069
1070 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1071 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1072 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1073
1074 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1075 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1076 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1077 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1078 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1079 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1080 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1081 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1082 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1083 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1086
1087 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1088 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1089 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1090
1091 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1092 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1094
1095 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1096
1097 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1098 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1099 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1100 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1101 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1102 }
1103
1104 setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
1105 setOperationAction(ISD::ABDS, MVT::v16i8, Custom);
1106 setOperationAction(ISD::ABDU, MVT::v8i16, Custom);
1107 setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
1108 setOperationAction(ISD::ABDU, MVT::v4i32, Custom);
1109 setOperationAction(ISD::ABDS, MVT::v4i32, Custom);
1110
1121
1126
1127 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1131
1132 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1133 // setcc all the way to isel and prefer SETGT in some isel patterns.
1136 }
1137
1138 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1139 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1144
1145 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1151 }
1152
1153 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1157
1158 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1159 continue;
1160
1163 }
1164 setF16Action(MVT::v8f16, Expand);
1165 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1166 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1167 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1168 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1169 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1170 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1172
1173 // Custom lower v2i64 and v2f64 selects.
1180
1187
1188 // Custom legalize these to avoid over promotion or custom promotion.
1189 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1194 }
1195
1200
1203
1206
1207 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1212
1217
1218 // We want to legalize this to an f64 load rather than an i64 load on
1219 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1220 // store.
1221 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1222 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1223 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1224 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1225 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1227
1228 // Add 32-bit vector stores to help vectorization opportunities.
1229 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1231
1235 if (!Subtarget.hasAVX512())
1237
1241
1243
1260
1261 // In the customized shift lowering, the legal v4i32/v2i64 cases
1262 // in AVX2 will be recognized.
1263 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1267 if (VT == MVT::v2i64) continue;
1272 }
1273
1279 }
1280
1281 if (Subtarget.hasGFNI()) {
1286 }
1287
1288 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1289 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1290 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1291 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1292
1293 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1296 }
1297
1298 // These might be better off as horizontal vector ops.
1303 }
1304
1305 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1306 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1309 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1313 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1319
1321 }
1322
1323 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1324 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1325 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1326 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1327 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1328 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1329 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1330 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1331
1332 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1335 }
1336
1340
1341 // FIXME: Do we need to handle scalar-to-vector here?
1342 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1343 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1344
1345 // We directly match byte blends in the backend as they match the VSELECT
1346 // condition form.
1348
1349 // SSE41 brings specific instructions for doing vector sign extend even in
1350 // cases where we don't have SRA.
1351 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1354 }
1355
1356 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1357 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1358 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1359 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1360 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1361 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1362 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1363 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1364 }
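 // e.g. a sign-extending load of eight i8 elements into a v8i16 result can
 // be selected directly to PMOVSXBW with a memory operand instead of a load
 // plus a separate extend.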
1365
1366 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1367 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1368 // do the pre and post work in the vector domain.
1371 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1372 // so that DAG combine doesn't try to turn it into uint_to_fp.
1375 }
1376 }
1377
1378 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1380 }
1381
1382 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1383 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1384 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1387 }
1388
1389 // XOP can efficiently perform BITREVERSE with VPPERM.
1390 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1392 }
1393
1394 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1395 bool HasInt256 = Subtarget.hasInt256();
1396
1397 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1398 : &X86::VR256RegClass);
1399 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1400 : &X86::VR256RegClass);
1401 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1402 : &X86::VR256RegClass);
1403 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1404 : &X86::VR256RegClass);
1405 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1406 : &X86::VR256RegClass);
1407 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1408 : &X86::VR256RegClass);
1409 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1410 : &X86::VR256RegClass);
1411
1412 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1425
1427
1431
1434 }
1435
1436 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1437 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1438
1439 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1440 // even though v8i16 is a legal type.
1441 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1442 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1443 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1444 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1448
1455
1467
1468 if (!Subtarget.hasAVX512())
1470
1471 // In the customized shift lowering, the legal v8i32/v4i64 cases
1472 // in AVX2 will be recognized.
1473 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1479 if (VT == MVT::v4i64) continue;
1484 }
1485
1486 // These types need custom splitting if their input is a 128-bit vector.
1491
1495 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1496 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1499
1500 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1504 }
1505
1510
1511 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1516
1517 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1518 // setcc all the way to isel and prefer SETGT in some isel patterns.
1521 }
1522
1523 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1524 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1529
1530 if (Subtarget.hasAnyFMA()) {
1531 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1532 MVT::v2f64, MVT::v4f64 }) {
1535 }
1536 }
1537
1538 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1539 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1540 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1541 }
1542
1543 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1544 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1545 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1546 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1547
1548 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1549 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1550 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1551 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1552 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1553 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1554 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1555 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1556
1557 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1558 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1559
1560 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1561 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1562 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1563 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1564 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1565
1566 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1567 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1568 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1569 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1570 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1571 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1572 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1573 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1578
1579 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1580 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1581 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1582 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1583 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1584 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1585 }
1586
1587 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1590 }
1591
1592 if (HasInt256) {
1593 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1594 // when we have a 256-bit-wide blend with immediate.
1597
1598 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1599 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1600 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1601 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1602 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1603 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1604 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1605 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1606 }
1607 }
1608
1609 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1610 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1611 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1613 }
1614
1615 // Extract subvector is special because the value type
1616 // (result) is 128-bit but the source is 256-bit wide.
1617 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1618 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1620 }
1621
1622 // Custom lower several nodes for 256-bit types.
1623 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1624 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1634 }
1635 setF16Action(MVT::v16f16, Expand);
1636 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1637 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1639 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1640 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1641 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1642 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1643
1644 if (HasInt256) {
1646
1647 // Custom legalize 2x32 to get a little better code.
1650
1651 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1652 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1654 }
1655 }
1656
1657 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1658 Subtarget.hasF16C()) {
1659 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1662 }
1663 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1666 }
1667 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1668 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1669 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1670 }
1671 }
1672
1673 // This block controls legalization of the mask vector sizes that are
1674 // available with AVX512. 512-bit vectors are in a separate block controlled
1675 // by useAVX512Regs.
1676 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1677 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1678 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1679 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1680 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1681 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1682
1686
1687 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1688 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1689 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1690 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1691 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1692 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1693 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1694 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1699
1700 // There is no byte sized k-register load or store without AVX512DQ.
1701 if (!Subtarget.hasDQI()) {
1702 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1703 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1704 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1705 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1706
1711 }
1712
1713 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1714 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1718 }
1719
1720 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1722
1723 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1727
1734 }
1735
1736 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1738 }
1739 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1740 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1743 }
1744 }
1745
1746 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1747 // elements. 512-bits can be disabled based on prefer-vector-width and
1748 // required-vector-width function attributes.
1749 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1750 bool HasBWI = Subtarget.hasBWI();
1751
1752 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1753 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1754 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1755 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1756 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1757 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1758 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1759
1760 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1761 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1762 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1763 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1764 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1765 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1766 if (HasBWI)
1767 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1768 }
1769
1770 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1778 }
1779 setOperationAction(ISD::LRINT, MVT::v16f32,
1780 Subtarget.hasDQI() ? Legal : Custom);
1781 setOperationAction(ISD::LRINT, MVT::v8f64,
1782 Subtarget.hasDQI() ? Legal : Custom);
1783 if (Subtarget.hasDQI())
1784 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1785
1786 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1791 }
1792
1793 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1798 }
1799
1806
1818
1819 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1820 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1821 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1822 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1823 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1824 if (HasBWI)
1825 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1826
1827 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1828 // to 512-bit rather than use the AVX2 instructions so that we can use
1829 // k-masks.
1830 if (!Subtarget.hasVLX()) {
1831 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1832 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1835 }
1836 }
1837
1839 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1840 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1850
1851 if (HasBWI) {
1852 // Extends from v64i1 masks to 512-bit vectors.
1856 }
1857
1858 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1871
1873 }
1874
1875 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1878 }
1879
1880 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1881 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1882 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1883 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1884
1885 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1886 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1887 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1888 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1889
1890 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1891 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1892 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1893 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1894 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1895 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1896 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1897 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1898
1899 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1900 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1901
1902 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1912
1913 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1914 // setcc all the way to isel and prefer SETGT in some isel patterns.
1917 }
1918
1919 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1920 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1925
1926 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1933 }
1934
1935 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1936 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1937 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1939 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1940 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1941 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1942 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1947 }
1948
1949 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1950 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1951 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1952 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1953 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1954 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1955
1956 if (Subtarget.hasDQI()) {
1960 setOperationAction(Opc, MVT::v8i64, Custom);
1961 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1962 }
1963
1964 if (Subtarget.hasCDI()) {
1965 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1966 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1968 }
1969 } // Subtarget.hasCDI()
1970
1971 if (Subtarget.hasVPOPCNTDQ()) {
1972 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1974 }
1975
1976 // Extract subvector is special because the value type
1977 // (result) is 256-bit but the source is 512-bit wide.
1978 // 128-bit was made Legal under AVX1.
1979 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1980 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1982
1983 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1984 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1994 }
1995 setF16Action(MVT::v32f16, Expand);
2000 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2001 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2002
2003 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2008 }
2009 if (HasBWI) {
2010 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2013 }
2014 } else {
2015 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2016 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2017 }
2018
2019 if (Subtarget.hasVBMI2()) {
2020 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2023 }
2024
2025 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2026 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2027 }
2028 }// useAVX512Regs
2029
2030 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2031 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2032 MVT::v4i64}) {
2035 }
2036 }
2037
2038 // This block controls legalization for operations that don't have
2039 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2040 // narrower widths.
2041 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2042 // These operations are handled on non-VLX by artificially widening in
2043 // isel patterns.
2044
2048
2049 if (Subtarget.hasDQI()) {
2050 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2051 // v2f32 UINT_TO_FP is already custom under SSE2.
2054 "Unexpected operation action!");
2055 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2060 }
2061
2062 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2068 }
2069
2070 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2073 }
2074
2075 // Custom legalize 2x32 to get a little better code.
2078
2079 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2080 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2082
2083 if (Subtarget.hasDQI()) {
2087 setOperationAction(Opc, MVT::v2i64, Custom);
2088 setOperationAction(Opc, MVT::v4i64, Custom);
2089 }
2090 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2091 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2092 }
2093
2094 if (Subtarget.hasCDI()) {
2095 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2097 }
2098 } // Subtarget.hasCDI()
2099
2100 if (Subtarget.hasVPOPCNTDQ()) {
2101 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2103 }
2104 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2105 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2107 }
2108
2109 // This block controls legalization of v32i1/v64i1, which are available with
2110 // AVX512BW.
2111 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2112 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2113 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2114
2115 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2126 }
2127
2128 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2130
2131 // Extends from v32i1 masks to 256-bit vectors.
2135
2136 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2137 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2138 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2139 }
2140
2141 // These operations are handled on non-VLX by artificially widening in
2142 // isel patterns.
2143 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2144
2145 if (Subtarget.hasBITALG()) {
2146 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2148 }
2149 }
2150
2151 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2152 auto setGroup = [&] (MVT VT) {
2163
2176
2178
2181
2187
2193
2197 };
2198
2199 // AVX512_FP16 scalar operations
2200 setGroup(MVT::f16);
2214
2217
2218 if (Subtarget.useAVX512Regs()) {
2219 setGroup(MVT::v32f16);
2225 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2232
2237 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2239 MVT::v32i16);
2240 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2242 MVT::v32i16);
2243 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2245 MVT::v32i16);
2246 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2248 MVT::v32i16);
2249
2253
2254 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2255 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2256 }
2257
2258 if (Subtarget.hasVLX()) {
2259 setGroup(MVT::v8f16);
2260 setGroup(MVT::v16f16);
2261
2272
2283
2284 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2287
2291
2292 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2293 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2294 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2295 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2296
2297 // Need to custom widen these to prevent scalarization.
2298 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2299 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2300 }
2301 }
2302
2303 if (!Subtarget.useSoftFloat() &&
2304 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2305 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2306 : &X86::VR128RegClass);
2307 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2308 : &X86::VR256RegClass);
2309 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2310 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2311 // Set the operation action Custom to do the customization later.
2314 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2315 setF16Action(VT, Expand);
2320 }
2321 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2322 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2323 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2324 }
2326 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2327 }
2328
2329 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2330 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2331 setF16Action(MVT::v32bf16, Expand);
2332 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2333 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2335 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2339 }
2340
2341 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2342 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2343 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2344 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2345 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2346 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2347
2348 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2349 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2350 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2351 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2352 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2353
2354 if (Subtarget.hasBWI()) {
2355 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2356 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2357 }
2358
2359 if (Subtarget.hasFP16()) {
2360 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2369 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2378 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2383 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2388 }
2389 }
2390
2391 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2392 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2393 }
2394
2395 // We want to custom lower some of our intrinsics.
2399 if (!Subtarget.is64Bit()) {
2401 }
2402
2403 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2404 // handle type legalization for these operations here.
2405 //
2406 // FIXME: We really should do custom legalization for addition and
2407 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2408 // than generic legalization for 64-bit multiplication-with-overflow, though.
2409 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2410 if (VT == MVT::i64 && !Subtarget.is64Bit())
2411 continue;
2412 // Add/Sub/Mul with overflow operations are custom lowered.
2419
2420 // Support carry in as value rather than glue.
2426 }
2427
2428 if (!Subtarget.is64Bit()) {
2429 // These libcalls are not available in 32-bit.
2430 setLibcallName(RTLIB::SHL_I128, nullptr);
2431 setLibcallName(RTLIB::SRL_I128, nullptr);
2432 setLibcallName(RTLIB::SRA_I128, nullptr);
2433 setLibcallName(RTLIB::MUL_I128, nullptr);
2434 // The MULO libcall is not part of libgcc, only compiler-rt.
2435 setLibcallName(RTLIB::MULO_I64, nullptr);
2436 }
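 // Clearing the names forces these i128 operations to be expanded inline on
 // 32-bit targets, since libgcc does not provide the corresponding 128-bit
 // helper routines there.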
2437 // The MULO libcall is not part of libgcc, only compiler-rt.
2438 setLibcallName(RTLIB::MULO_I128, nullptr);
2439
2440 // Combine sin / cos into _sincos_stret if it is available.
2441 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2442 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2445 }
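 // i.e. a sin and a cos of the same operand can be merged into a single
 // sincos-style call that returns both results at once when the runtime
 // provides the _stret entry points checked above.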
2446
2447 if (Subtarget.isTargetWin64()) {
2448 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2449 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2450 setOperationAction(ISD::SREM, MVT::i128, Custom);
2451 setOperationAction(ISD::UREM, MVT::i128, Custom);
2460 }
2461
2462 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2463 // is. We should promote the value to 64-bits to solve this.
2464 // This is what the CRT headers do - `fmodf` is an inline header
2465 // function casting to f64 and calling `fmod`.
2466 if (Subtarget.is32Bit() &&
2467 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2468 for (ISD::NodeType Op :
2478 if (isOperationExpand(Op, MVT::f32))
2479 setOperationAction(Op, MVT::f32, Promote);
2480
2481 // We have target-specific dag combine patterns for the following nodes:
2492 ISD::SHL,
2493 ISD::SRA,
2494 ISD::SRL,
2495 ISD::OR,
2496 ISD::AND,
2498 ISD::ADD,
2499 ISD::FADD,
2500 ISD::FSUB,
2501 ISD::FNEG,
2502 ISD::FMA,
2506 ISD::SUB,
2507 ISD::LOAD,
2508 ISD::LRINT,
2510 ISD::MLOAD,
2511 ISD::STORE,
2525 ISD::SETCC,
2526 ISD::MUL,
2527 ISD::XOR,
2535
2537
2538 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2540 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2542 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2544
2545 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2546 // that needs to be benchmarked and balanced with the potential use of vector
2547 // load/store types (PR33329, PR33914).
2550
2551 // Default loop alignment, which can be overridden by -align-loops.
2553
2554 // An out-of-order CPU can speculatively execute past a predictable branch,
2555 // but a conditional move could be stalled by an expensive earlier operation.
2556 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2557 EnableExtLdPromotion = true;
2559
2561
2562 // Default to having -disable-strictnode-mutation on
2563 IsStrictFPEnabled = true;
2564}
2565
2566// This has so far only been implemented for 64-bit MachO.
2568 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2569}
2570
2572 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2573 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2574}
2575
2577 const SDLoc &DL) const {
2578 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2579 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2580 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2581 return SDValue(Node, 0);
2582}
2583
2586 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2587 !Subtarget.hasBWI())
2588 return TypeSplitVector;
2589
2590 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2591 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2592 return TypeSplitVector;
2593
2594 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2595 VT.getVectorElementType() != MVT::i1)
2596 return TypeWidenVector;
2597
2599}
2600
2601FastISel *
2603 const TargetLibraryInfo *libInfo) const {
2604 return X86::createFastISel(funcInfo, libInfo);
2605}
2606
2607//===----------------------------------------------------------------------===//
2608// Other Lowering Hooks
2609//===----------------------------------------------------------------------===//
2610
2612 bool AssumeSingleUse) {
2613 if (!AssumeSingleUse && !Op.hasOneUse())
2614 return false;
2615 if (!ISD::isNormalLoad(Op.getNode()))
2616 return false;
2617
2618 // If this is an unaligned vector, make sure the target supports folding it.
2619 auto *Ld = cast<LoadSDNode>(Op.getNode());
2620 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2621 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2622 return false;
2623
2624 // TODO: If this is a non-temporal load and the target has an instruction
2625 // for it, it should not be folded. See "useNonTemporalLoad()".
2626
2627 return true;
2628}
2629
2631 const X86Subtarget &Subtarget,
2632 bool AssumeSingleUse) {
2633 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2634 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2635 return false;
2636
2637 // We can not replace a wide volatile load with a broadcast-from-memory,
2638 // because that would narrow the load, which isn't legal for volatiles.
2639 auto *Ld = cast<LoadSDNode>(Op.getNode());
2640 return !Ld->isVolatile() ||
2641 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2642}
2643
2645 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2646}
2647
2649 if (Op.hasOneUse()) {
2650 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
2651 return (ISD::ZERO_EXTEND == Opcode);
2652 }
2653 return false;
2654}
2655
2656static bool isLogicOp(unsigned Opcode) {
2657 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2658 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2659}
2660
2661static bool isTargetShuffle(unsigned Opcode) {
2662 switch(Opcode) {
2663 default: return false;
2664 case X86ISD::BLENDI:
2665 case X86ISD::PSHUFB:
2666 case X86ISD::PSHUFD:
2667 case X86ISD::PSHUFHW:
2668 case X86ISD::PSHUFLW:
2669 case X86ISD::SHUFP:
2670 case X86ISD::INSERTPS:
2671 case X86ISD::EXTRQI:
2672 case X86ISD::INSERTQI:
2673 case X86ISD::VALIGN:
2674 case X86ISD::PALIGNR:
2675 case X86ISD::VSHLDQ:
2676 case X86ISD::VSRLDQ:
2677 case X86ISD::MOVLHPS:
2678 case X86ISD::MOVHLPS:
2679 case X86ISD::MOVSHDUP:
2680 case X86ISD::MOVSLDUP:
2681 case X86ISD::MOVDDUP:
2682 case X86ISD::MOVSS:
2683 case X86ISD::MOVSD:
2684 case X86ISD::MOVSH:
2685 case X86ISD::UNPCKL:
2686 case X86ISD::UNPCKH:
2687 case X86ISD::VBROADCAST:
2688 case X86ISD::VPERMILPI:
2689 case X86ISD::VPERMILPV:
2690 case X86ISD::VPERM2X128:
2691 case X86ISD::SHUF128:
2692 case X86ISD::VPERMIL2:
2693 case X86ISD::VPERMI:
2694 case X86ISD::VPPERM:
2695 case X86ISD::VPERMV:
2696 case X86ISD::VPERMV3:
2697 case X86ISD::VZEXT_MOVL:
2698 return true;
2699 }
2700}
2701
2702static bool isTargetShuffleVariableMask(unsigned Opcode) {
2703 switch (Opcode) {
2704 default: return false;
2705 // Target Shuffles.
2706 case X86ISD::PSHUFB:
2707 case X86ISD::VPERMILPV:
2708 case X86ISD::VPERMIL2:
2709 case X86ISD::VPPERM:
2710 case X86ISD::VPERMV:
2711 case X86ISD::VPERMV3:
2712 return true;
2713 // 'Faux' Target Shuffles.
2714 case ISD::OR:
2715 case ISD::AND:
2716 case X86ISD::ANDNP:
2717 return true;
2718 }
2719}
2720
2723 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2725 int ReturnAddrIndex = FuncInfo->getRAIndex();
2726
2727 if (ReturnAddrIndex == 0) {
2728 // Set up a frame object for the return address.
2729 unsigned SlotSize = RegInfo->getSlotSize();
2730 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2731 -(int64_t)SlotSize,
2732 false);
2733 FuncInfo->setRAIndex(ReturnAddrIndex);
2734 }
2735
2736 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2737}
2738
2740 bool HasSymbolicDisplacement) {
2741 // The offset should fit into a 32-bit immediate field.
2742 if (!isInt<32>(Offset))
2743 return false;
2744
2745 // If we don't have a symbolic displacement - we don't have any extra
2746 // restrictions.
2747 if (!HasSymbolicDisplacement)
2748 return true;
2749
2750 // We can fold large offsets in the large code model because we always use
2751 // 64-bit offsets.
2752 if (CM == CodeModel::Large)
2753 return true;
2754
2755 // For the kernel code model we know that all objects reside in the negative
2756 // half of the 32-bit address space. We must not accept negative offsets, since
2757 // they may go out of range, but we may accept pretty large positive ones.
2758 if (CM == CodeModel::Kernel)
2759 return Offset >= 0;
2760
2761 // For the other non-large code models we assume that the last small object
2762 // ends at least 16MB before the 31-bit boundary. We may also accept pretty
2763 // large negative offsets, knowing that all objects are in the positive half
2764 // of the address space.
2765 return Offset < 16 * 1024 * 1024;
2766}
2767
2768 /// Return true if the condition is a signed comparison operation.
2769static bool isX86CCSigned(unsigned X86CC) {
2770 switch (X86CC) {
2771 default:
2772 llvm_unreachable("Invalid integer condition!");
2773 case X86::COND_E:
2774 case X86::COND_NE:
2775 case X86::COND_B:
2776 case X86::COND_A:
2777 case X86::COND_BE:
2778 case X86::COND_AE:
2779 return false;
2780 case X86::COND_G:
2781 case X86::COND_GE:
2782 case X86::COND_L:
2783 case X86::COND_LE:
2784 return true;
2785 }
2786}
2787
2789 switch (SetCCOpcode) {
2790 // clang-format off
2791 default: llvm_unreachable("Invalid integer condition!");
2792 case ISD::SETEQ: return X86::COND_E;
2793 case ISD::SETGT: return X86::COND_G;
2794 case ISD::SETGE: return X86::COND_GE;
2795 case ISD::SETLT: return X86::COND_L;
2796 case ISD::SETLE: return X86::COND_LE;
2797 case ISD::SETNE: return X86::COND_NE;
2798 case ISD::SETULT: return X86::COND_B;
2799 case ISD::SETUGT: return X86::COND_A;
2800 case ISD::SETULE: return X86::COND_BE;
2801 case ISD::SETUGE: return X86::COND_AE;
2802 // clang-format on
2803 }
2804}
2805
2806 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
2807/// condition code, returning the condition code and the LHS/RHS of the
2808/// comparison to make.
2810 bool isFP, SDValue &LHS, SDValue &RHS,
2811 SelectionDAG &DAG) {
2812 if (!isFP) {
2813 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2814 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2815 // X > -1 -> X == 0, jump !sign.
2816 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2817 return X86::COND_NS;
2818 }
2819 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2820 // X < 0 -> X == 0, jump on sign.
2821 return X86::COND_S;
2822 }
2823 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
2824 // X >= 0 -> X == 0, jump on !sign.
2825 return X86::COND_NS;
2826 }
2827 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
2828 // X < 1 -> X <= 0
2829 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2830 return X86::COND_LE;
2831 }
2832 }
2833
2834 return TranslateIntegerX86CC(SetCCOpcode);
2835 }
2836
2837 // First determine if it is required or is profitable to flip the operands.
2838
2839 // If LHS is a foldable load, but RHS is not, flip the condition.
2840 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
2841 !ISD::isNON_EXTLoad(RHS.getNode())) {
2842 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2843 std::swap(LHS, RHS);
2844 }
2845
2846 switch (SetCCOpcode) {
2847 default: break;
2848 case ISD::SETOLT:
2849 case ISD::SETOLE:
2850 case ISD::SETUGT:
2851 case ISD::SETUGE:
2852 std::swap(LHS, RHS);
2853 break;
2854 }
2855
2856 // On a floating point condition, the flags are set as follows:
2857 // ZF PF CF op
2858 // 0 | 0 | 0 | X > Y
2859 // 0 | 0 | 1 | X < Y
2860 // 1 | 0 | 0 | X == Y
2861 // 1 | 1 | 1 | unordered
2862 switch (SetCCOpcode) {
2863 // clang-format off
2864 default: llvm_unreachable("Condcode should be pre-legalized away");
2865 case ISD::SETUEQ:
2866 case ISD::SETEQ: return X86::COND_E;
2867 case ISD::SETOLT: // flipped
2868 case ISD::SETOGT:
2869 case ISD::SETGT: return X86::COND_A;
2870 case ISD::SETOLE: // flipped
2871 case ISD::SETOGE:
2872 case ISD::SETGE: return X86::COND_AE;
2873 case ISD::SETUGT: // flipped
2874 case ISD::SETULT:
2875 case ISD::SETLT: return X86::COND_B;
2876 case ISD::SETUGE: // flipped
2877 case ISD::SETULE:
2878 case ISD::SETLE: return X86::COND_BE;
2879 case ISD::SETONE:
2880 case ISD::SETNE: return X86::COND_NE;
2881 case ISD::SETUO: return X86::COND_P;
2882 case ISD::SETO: return X86::COND_NP;
2883 case ISD::SETOEQ:
2884 case ISD::SETUNE: return X86::COND_INVALID;
2885 // clang-format on
2886 }
2887}
2888
2889/// Is there a floating point cmov for the specific X86 condition code?
2890 /// The current x86 ISA includes the following FP cmov instructions:
2891 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2892static bool hasFPCMov(unsigned X86CC) {
2893 switch (X86CC) {
2894 default:
2895 return false;
2896 case X86::COND_B:
2897 case X86::COND_BE:
2898 case X86::COND_E:
2899 case X86::COND_P:
2900 case X86::COND_A:
2901 case X86::COND_AE:
2902 case X86::COND_NE:
2903 case X86::COND_NP:
2904 return true;
2905 }
2906}
2907
2908static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
2909 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
2910 VT.is512BitVector();
2911}
2912
2914 const CallInst &I,
2915 MachineFunction &MF,
2916 unsigned Intrinsic) const {
2918 Info.offset = 0;
2919
2920 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
2921 if (!IntrData) {
2922 switch (Intrinsic) {
2923 case Intrinsic::x86_aesenc128kl:
2924 case Intrinsic::x86_aesdec128kl:
2926 Info.ptrVal = I.getArgOperand(1);
2927 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2928 Info.align = Align(1);
2930 return true;
2931 case Intrinsic::x86_aesenc256kl:
2932 case Intrinsic::x86_aesdec256kl:
2934 Info.ptrVal = I.getArgOperand(1);
2935 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2936 Info.align = Align(1);
2938 return true;
2939 case Intrinsic::x86_aesencwide128kl:
2940 case Intrinsic::x86_aesdecwide128kl:
2942 Info.ptrVal = I.getArgOperand(0);
2943 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2944 Info.align = Align(1);
2946 return true;
2947 case Intrinsic::x86_aesencwide256kl:
2948 case Intrinsic::x86_aesdecwide256kl:
2950 Info.ptrVal = I.getArgOperand(0);
2951 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2952 Info.align = Align(1);
2954 return true;
2955 case Intrinsic::x86_cmpccxadd32:
2956 case Intrinsic::x86_cmpccxadd64:
2957 case Intrinsic::x86_atomic_bts:
2958 case Intrinsic::x86_atomic_btc:
2959 case Intrinsic::x86_atomic_btr: {
2961 Info.ptrVal = I.getArgOperand(0);
2962 unsigned Size = I.getType()->getScalarSizeInBits();
2963 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2964 Info.align = Align(Size);
2967 return true;
2968 }
2969 case Intrinsic::x86_atomic_bts_rm:
2970 case Intrinsic::x86_atomic_btc_rm:
2971 case Intrinsic::x86_atomic_btr_rm: {
2973 Info.ptrVal = I.getArgOperand(0);
2974 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
2975 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2976 Info.align = Align(Size);
2979 return true;
2980 }
2981 case Intrinsic::x86_aadd32:
2982 case Intrinsic::x86_aadd64:
2983 case Intrinsic::x86_aand32:
2984 case Intrinsic::x86_aand64:
2985 case Intrinsic::x86_aor32:
2986 case Intrinsic::x86_aor64:
2987 case Intrinsic::x86_axor32:
2988 case Intrinsic::x86_axor64:
2989 case Intrinsic::x86_atomic_add_cc:
2990 case Intrinsic::x86_atomic_sub_cc:
2991 case Intrinsic::x86_atomic_or_cc:
2992 case Intrinsic::x86_atomic_and_cc:
2993 case Intrinsic::x86_atomic_xor_cc: {
2995 Info.ptrVal = I.getArgOperand(0);
2996 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
2997 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2998 Info.align = Align(Size);
3001 return true;
3002 }
3003 }
3004 return false;
3005 }
3006
3007 switch (IntrData->Type) {
3010 case TRUNCATE_TO_MEM_VI32: {
3012 Info.ptrVal = I.getArgOperand(0);
3013 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3015 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3016 ScalarVT = MVT::i8;
3017 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3018 ScalarVT = MVT::i16;
3019 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3020 ScalarVT = MVT::i32;
3021
3022 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3023 Info.align = Align(1);
3025 break;
3026 }
3027 case GATHER:
3028 case GATHER_AVX2: {
3030 Info.ptrVal = nullptr;
3031 MVT DataVT = MVT::getVT(I.getType());
3032 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3033 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3034 IndexVT.getVectorNumElements());
3035 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3036 Info.align = Align(1);
3038 break;
3039 }
3040 case SCATTER: {
3042 Info.ptrVal = nullptr;
3043 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3044 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3045 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3046 IndexVT.getVectorNumElements());
3047 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3048 Info.align = Align(1);
3050 break;
3051 }
3052 default:
3053 return false;
3054 }
3055
3056 return true;
3057}
3058
3059/// Returns true if the target can instruction select the
3060/// specified FP immediate natively. If false, the legalizer will
3061/// materialize the FP immediate as a load from a constant pool.
3063 bool ForCodeSize) const {
3064 for (const APFloat &FPImm : LegalFPImmediates)
3065 if (Imm.bitwiseIsEqual(FPImm))
3066 return true;
3067 return false;
3068}
3069
3071 ISD::LoadExtType ExtTy,
3072 EVT NewVT) const {
3073 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3074
3075 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3076 // relocation target a movq or addq instruction: don't let the load shrink.
3077 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3078 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3079 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3080 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3081
3082 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
3083 // those uses are extracted directly into a store, then the extract + store
3084 // can be store-folded. Therefore, it's probably not worth splitting the load.
3085 EVT VT = Load->getValueType(0);
3086 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3087 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
3088 // Skip uses of the chain value. Result 0 of the node is the load value.
3089 if (UI.getUse().getResNo() != 0)
3090 continue;
3091
3092 // If this use is not an extract + store, it's probably worth splitting.
3093 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
3094 UI->use_begin()->getOpcode() != ISD::STORE)
3095 return true;
3096 }
3097 // All non-chain uses are extract + store.
3098 return false;
3099 }
3100
3101 return true;
3102}
3103
3104/// Returns true if it is beneficial to convert a load of a constant
3105/// to just the constant itself.
3107 Type *Ty) const {
3108 assert(Ty->isIntegerTy());
3109
3110 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3111 if (BitSize == 0 || BitSize > 64)
3112 return false;
3113 return true;
3114}
3115
3117 // If we are using XMM registers in the ABI and the condition of the select is
3118 // a floating-point compare and we have blendv or conditional move, then it is
3119 // cheaper to select instead of doing a cross-register move and creating a
3120 // load that depends on the compare result.
3121 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3122 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3123}
3124
3126 // TODO: It might be a win to ease or lift this restriction, but the generic
3127 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3128 if (VT.isVector() && Subtarget.hasAVX512())
3129 return false;
3130
3131 return true;
3132}
3133
3135 SDValue C) const {
3136 // TODO: We handle scalars using custom code, but generic combining could make
3137 // that unnecessary.
3138 APInt MulC;
3139 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3140 return false;
3141
3142 // Find the type this will be legalized to. Otherwise we might prematurely
3143 // convert this to shl+add/sub and then still have to type legalize those ops.
3144 // Another choice would be to defer the decision for illegal types until
3145 // after type legalization. But constant splat vectors of i64 can't make it
3146 // through type legalization on 32-bit targets so we would need to special
3147 // case vXi64.
3148 while (getTypeAction(Context, VT) != TypeLegal)
3149 VT = getTypeToTransformTo(Context, VT);
3150
3151 // If vector multiply is legal, assume that's faster than shl + add/sub.
3152 // Multiply is a complex op with higher latency and lower throughput in
3153 // most implementations; sub-vXi32 vector multiplies are always fast,
3154 // vXi32 must not have a slow PMULLD implementation, and anything larger
3155 // (vXi64) is always going to be slow.
3156 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3157 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3158 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3159 return false;
3160
3161 // shl+add, shl+sub, shl+add+neg
3162 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3163 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3164}
3165
3167 unsigned Index) const {
3169 return false;
3170
3171 // Mask vectors support all subregister combinations and operations that
3172 // extract half of vector.
3173 if (ResVT.getVectorElementType() == MVT::i1)
3174 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3175 (Index == ResVT.getVectorNumElements()));
3176
3177 return (Index % ResVT.getVectorNumElements()) == 0;
3178}
3179
3181 unsigned Opc = VecOp.getOpcode();
3182
3183 // Assume target opcodes can't be scalarized.
3184 // TODO - do we have any exceptions?
3185 if (Opc >= ISD::BUILTIN_OP_END)
3186 return false;
3187
3188 // If the vector op is not supported, try to convert to scalar.
3189 EVT VecVT = VecOp.getValueType();
3190 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
3191 return true;
3192
3193 // If the vector op is supported, but the scalar op is not, the transform may
3194 // not be worthwhile.
3195 EVT ScalarVT = VecVT.getScalarType();
3196 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3197}
3198
3200 bool) const {
3201 // TODO: Allow vectors?
3202 if (VT.isVector())
3203 return false;
3204 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3205}
3206
3208 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
3209 return Subtarget.hasBMI() ||
3210 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
3211}
3212
3214 // Speculate ctlz only if we can directly use LZCNT.
3215 return Subtarget.hasLZCNT();
3216}
3217
3219 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3220 // expensive than a straight movsd. On the other hand, it's important to
3221 // shrink long double fp constant since fldt is very slow.
3222 return !Subtarget.hasSSE2() || VT == MVT::f80;
3223}
3224
3226 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3227 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3228}
3229
3231 const SelectionDAG &DAG,
3232 const MachineMemOperand &MMO) const {
3233 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3234 BitcastVT.getVectorElementType() == MVT::i1)
3235 return false;
3236
3237 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3238 return false;
3239
3240 // If both types are legal vectors, it's always ok to convert them.
3241 if (LoadVT.isVector() && BitcastVT.isVector() &&
3242 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3243 return true;
3244
3245 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3246}
3247
3249 const MachineFunction &MF) const {
3250 // Do not merge to float value size (128 bytes) if no implicit
3251 // float attribute is set.
3252 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3253
3254 if (NoFloat) {
3255 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3256 return (MemVT.getSizeInBits() <= MaxIntSize);
3257 }
3258 // Make sure we don't merge greater than our preferred vector
3259 // width.
3260 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3261 return false;
3262
3263 return true;
3264}
3265
3267 return Subtarget.hasFastLZCNT();
3268}
3269
3271 const Instruction &AndI) const {
3272 return true;
3273}
3274
3276 EVT VT = Y.getValueType();
3277
3278 if (VT.isVector())
3279 return false;
3280
3281 if (!Subtarget.hasBMI())
3282 return false;
3283
3284 // There are only 32-bit and 64-bit forms for 'andn'.
3285 if (VT != MVT::i32 && VT != MVT::i64)
3286 return false;
3287
3288 return !isa<ConstantSDNode>(Y);
3289}
3290
3292 EVT VT = Y.getValueType();
3293
3294 if (!VT.isVector())
3295 return hasAndNotCompare(Y);
3296
3297 // Vector.
3298
3299 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3300 return false;
3301
3302 if (VT == MVT::v4i32)
3303 return true;
3304
3305 return Subtarget.hasSSE2();
3306}
3307
3309 return X.getValueType().isScalarInteger(); // 'bt'
3310}
3311
3315 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3316 SelectionDAG &DAG) const {
3317 // Does baseline recommend not to perform the fold by default?
3319 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3320 return false;
3321 // For scalars this transform is always beneficial.
3322 if (X.getValueType().isScalarInteger())
3323 return true;
3324 // If all the shift amounts are identical, then transform is beneficial even
3325 // with rudimentary SSE2 shifts.
3326 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3327 return true;
3328 // If we have AVX2 with its powerful shift operations, then it's also good.
3329 if (Subtarget.hasAVX2())
3330 return true;
3331 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3332 return NewShiftOpcode == ISD::SHL;
3333}
3334
3336 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3337 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3338 if (!VT.isInteger())
3339 return ShiftOpc;
3340
3341 bool PreferRotate = false;
3342 if (VT.isVector()) {
3343 // For vectors, if we have rotate instruction support, then it's definitely
3344 // best. Otherwise it's not clear what's best, so just don't make changes.
3345 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3346 VT.getScalarType() == MVT::i64);
3347 } else {
3348 // For scalars, if we have BMI2 prefer rotate so we can use rorx. Otherwise
3349 // prefer rotate unless we have a zext mask+shr.
3350 PreferRotate = Subtarget.hasBMI2();
3351 if (!PreferRotate) {
3352 unsigned MaskBits =
3353 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3354 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3355 }
3356 }
3357
3358 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3359 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3360
3361 if (PreferRotate && MayTransformRotate)
3362 return ISD::ROTL;
3363
3364 // For vectors we don't really gain much by swapping the constants around.
3365 // Maybe in the future we could check whether the DAG already has the
3366 // flipped node.
3367 if (VT.isVector())
3368 return ShiftOpc;
3369
3370 // See if it is beneficial to swap the shift type.
3371 if (ShiftOpc == ISD::SHL) {
3372 // If the current setup has an imm64 mask, then the inverse will have
3373 // at least an imm32 mask (or be a zext i32 -> i64).
3374 if (VT == MVT::i64)
3375 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3376 : ShiftOpc;
3377
3378 // We can only benefit if the mask requires at least 7 bits. We
3379 // don't want to replace a shl by 1, 2 or 3, as those can be implemented
3380 // with lea/add.
3381 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3382 }
3383
3384 if (VT == MVT::i64)
3385 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3386 // extremely efficient.
3387 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3388
3389 // Keep small shifts as shl so we can generate add/lea.
3390 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3391 }
3392
3393 // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3394 // (PreferRotate will be set in the latter case).
3395 if (PreferRotate || VT.isVector())
3396 return ShiftOpc;
3397
3398 // Non-vector type and we have a zext mask with SRL.
3399 return ISD::SRL;
3400}
3401
3404 const Value *Lhs,
3405 const Value *Rhs) const {
3406 using namespace llvm::PatternMatch;
3407 int BaseCost = BrMergingBaseCostThresh.getValue();
3408 // a == b && a == c is a fast pattern on x86.
3410 if (BaseCost >= 0 && Opc == Instruction::And &&
3411 match(Lhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3412 Pred == ICmpInst::ICMP_EQ &&
3413 match(Rhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3414 Pred == ICmpInst::ICMP_EQ)
3415 BaseCost += 1;
3416 return {BaseCost, BrMergingLikelyBias.getValue(),
3417 BrMergingUnlikelyBias.getValue()};
3418}
3419
3421 return N->getOpcode() != ISD::FP_EXTEND;
3422}
3423
3425 const SDNode *N, CombineLevel Level) const {
3426 assert(((N->getOpcode() == ISD::SHL &&
3427 N->getOperand(0).getOpcode() == ISD::SRL) ||
3428 (N->getOpcode() == ISD::SRL &&
3429 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3430 "Expected shift-shift mask");
3431 // TODO: Should we always create i64 masks? Or only folded immediates?
3432 EVT VT = N->getValueType(0);
3433 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3434 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3435 // Only fold if the shift values are equal - so it folds to AND.
3436 // TODO - we should fold if either is a non-uniform vector but we don't do
3437 // the fold for non-splats yet.
3438 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3439 }
3441}
3442
3444 EVT VT = Y.getValueType();
3445
3446 // For vectors, we don't have a preference, but we probably want a mask.
3447 if (VT.isVector())
3448 return false;
3449
3450 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3451 if (VT == MVT::i64 && !Subtarget.is64Bit())
3452 return false;
3453
3454 return true;
3455}
3456
3459 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3461 !Subtarget.isOSWindows())
3464 ExpansionFactor);
3465}
3466
3468 // Any legal vector type can be splatted more efficiently than
3469 // loading/spilling from memory.
3470 return isTypeLegal(VT);
3471}
3472
3474 MVT VT = MVT::getIntegerVT(NumBits);
3475 if (isTypeLegal(VT))
3476 return VT;
3477
3478 // PMOVMSKB can handle this.
3479 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3480 return MVT::v16i8;
3481
3482 // VPMOVMSKB can handle this.
3483 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3484 return MVT::v32i8;
3485
3486 // TODO: Allow 64-bit type for 32-bit target.
3487 // TODO: 512-bit types should be allowed, but make sure that those
3488 // cases are handled in combineVectorSizedSetCCEquality().
3489
3491}
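// Example: a 16-byte memcmp-style equality test can be expanded with a v16i8
// compare whose result is collapsed via PMOVMSKB, and a 32-byte test with
// v32i8 and VPMOVMSKB on AVX2, rather than a chain of scalar compares.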
3492
3493/// Val is the undef sentinel value or equal to the specified value.
3494static bool isUndefOrEqual(int Val, int CmpVal) {
3495 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3496}
3497
3498/// Return true if every element in Mask is the undef sentinel value or equal to
3499/// the specified value.
3500static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3501 return llvm::all_of(Mask, [CmpVal](int M) {
3502 return (M == SM_SentinelUndef) || (M == CmpVal);
3503 });
3504}
3505
3506/// Return true if every element in Mask, beginning from position Pos and ending
3507/// in Pos+Size is the undef sentinel value or equal to the specified value.
3508static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3509 unsigned Size) {
3510 return llvm::all_of(Mask.slice(Pos, Size),
3511 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3512}
3513
3514/// Val is either the undef or zero sentinel value.
3515static bool isUndefOrZero(int Val) {
3516 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3517}
3518
3519/// Return true if every element in Mask, beginning from position Pos and ending
3520/// in Pos+Size is the undef sentinel value.
3521static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3522 return llvm::all_of(Mask.slice(Pos, Size),
3523 [](int M) { return M == SM_SentinelUndef; });
3524}
3525
3526/// Return true if the mask creates a vector whose lower half is undefined.
3528 unsigned NumElts = Mask.size();
3529 return isUndefInRange(Mask, 0, NumElts / 2);
3530}
3531
3532/// Return true if the mask creates a vector whose upper half is undefined.
3534 unsigned NumElts = Mask.size();
3535 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3536}
3537
3539 /// Return true if Val falls within the specified range [Low, Hi).
3539static bool isInRange(int Val, int Low, int Hi) {
3540 return (Val >= Low && Val < Hi);
3541}
3542
3543/// Return true if the value of any element in Mask falls within the specified
3544 /// range [Low, Hi).
3545static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3546 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3547}
3548
3549/// Return true if the value of any element in Mask is the zero sentinel value.
3550static bool isAnyZero(ArrayRef<int> Mask) {
3551 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3552}
3553
3554/// Return true if Val is undef or if its value falls within the
3555 /// specified range [Low, Hi).
3556static bool isUndefOrInRange(int Val, int Low, int Hi) {
3557 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3558}
3559
3560/// Return true if every element in Mask is undef or if its value
3561 /// falls within the specified range [Low, Hi).
3562static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3563 return llvm::all_of(
3564 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3565}
3566
3567/// Return true if Val is undef, zero or if its value falls within the
3568 /// specified range [Low, Hi).
3569static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3570 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3571}
3572
3573/// Return true if every element in Mask is undef, zero or if its value
3574 /// falls within the specified range [Low, Hi).
3575static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3576 return llvm::all_of(
3577 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3578}
3579
3580/// Return true if every element in Mask, beginning
3581/// from position Pos and ending in Pos + Size, falls within the specified
3582/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3583static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3584 unsigned Size, int Low, int Step = 1) {
3585 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3586 if (!isUndefOrEqual(Mask[i], Low))
3587 return false;
3588 return true;
3589}
3590
3591/// Return true if every element in Mask, beginning
3592/// from position Pos and ending in Pos+Size, falls within the specified
3593 /// sequential range [Low, Low+Size), or is undef or is zero.
3595 unsigned Size, int Low,
3596 int Step = 1) {
3597 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3598 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3599 return false;
3600 return true;
3601}
3602
3603/// Return true if every element in Mask, beginning
3604/// from position Pos and ending in Pos+Size is undef or is zero.
3605static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3606 unsigned Size) {
3607 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3608}
3609
3610/// Return true if every element of a single input is referenced by the shuffle
3611/// mask. i.e. it just permutes them all.
3613 unsigned NumElts = Mask.size();
3614 APInt DemandedElts = APInt::getZero(NumElts);
3615 for (int M : Mask)
3616 if (isInRange(M, 0, NumElts))
3617 DemandedElts.setBit(M);
3618 return DemandedElts.isAllOnes();
3619}
3620
3621/// Helper function to test whether a shuffle mask could be
3622/// simplified by widening the elements being shuffled.
3623///
3624/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3625/// leaves it in an unspecified state.
3626///
3627/// NOTE: This must handle normal vector shuffle masks and *target* vector
3628/// shuffle masks. The latter have the special property of a '-2' representing
3629/// a zero-ed lane of a vector.
3631 SmallVectorImpl<int> &WidenedMask) {
3632 WidenedMask.assign(Mask.size() / 2, 0);
3633 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3634 int M0 = Mask[i];
3635 int M1 = Mask[i + 1];
3636
3637 // If both elements are undef, its trivial.
3638 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3639 WidenedMask[i / 2] = SM_SentinelUndef;
3640 continue;
3641 }
3642
3643 // Check for an undef mask and a mask value properly aligned to fit with
3644 // a pair of values. If we find such a case, use the non-undef mask's value.
3645 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3646 WidenedMask[i / 2] = M1 / 2;
3647 continue;
3648 }
3649 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3650 WidenedMask[i / 2] = M0 / 2;
3651 continue;
3652 }
3653
3654 // When zeroing, we need to spread the zeroing across both lanes to widen.
3655 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3656 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3658 WidenedMask[i / 2] = SM_SentinelZero;
3659 continue;
3660 }
3661 return false;
3662 }
3663
3664 // Finally check if the two mask values are adjacent and aligned with
3665 // a pair.
3666 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3667 WidenedMask[i / 2] = M0 / 2;
3668 continue;
3669 }
3670
3671 // Otherwise we can't safely widen the elements used in this shuffle.
3672 return false;
3673 }
3674 assert(WidenedMask.size() == Mask.size() / 2 &&
3675 "Incorrect size of mask after widening the elements!");
3676
3677 return true;
3678}
3679
3681 const APInt &Zeroable,
3682 bool V2IsZero,
3683 SmallVectorImpl<int> &WidenedMask) {
3684 // Create an alternative mask with info about zeroable elements.
3685 // Here we do not set undef elements as zeroable.
3686 SmallVector<int, 64> ZeroableMask(Mask);
3687 if (V2IsZero) {
3688 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3689 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3690 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3691 ZeroableMask[i] = SM_SentinelZero;
3692 }
3693 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3694}
3695
3697 SmallVector<int, 32> WidenedMask;
3698 return canWidenShuffleElements(Mask, WidenedMask);
3699}
3700
3701// Attempt to narrow/widen shuffle mask until it matches the target number of
3702// elements.
3703static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3704 SmallVectorImpl<int> &ScaledMask) {
3705 unsigned NumSrcElts = Mask.size();
3706 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3707 "Illegal shuffle scale factor");
3708
3709 // Narrowing is guaranteed to work.
3710 if (NumDstElts >= NumSrcElts) {
3711 int Scale = NumDstElts / NumSrcElts;
3712 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3713 return true;
3714 }
3715
3716 // We have to repeat the widening until we reach the target size, but we can
3717 // split out the first widening as it sets up ScaledMask for us.
3718 if (canWidenShuffleElements(Mask, ScaledMask)) {
3719 while (ScaledMask.size() > NumDstElts) {
3720 SmallVector<int, 16> WidenedMask;
3721 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3722 return false;
3723 ScaledMask = std::move(WidenedMask);
3724 }
3725 return true;
3726 }
3727
3728 return false;
3729}
3730
3731/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3733 return isNullConstant(Elt) || isNullFPConstant(Elt);
3734}
3735
3736// Build a vector of constants.
3737// Use an UNDEF node if MaskElt == -1.
3738// Split 64-bit constants in the 32-bit mode.
3740 const SDLoc &dl, bool IsMask = false) {
3741
3743 bool Split = false;
3744
3745 MVT ConstVecVT = VT;
3746 unsigned NumElts = VT.getVectorNumElements();
3747 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3748 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3749 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3750 Split = true;
3751 }
3752
3753 MVT EltVT = ConstVecVT.getVectorElementType();
3754 for (unsigned i = 0; i < NumElts; ++i) {
3755 bool IsUndef = Values[i] < 0 && IsMask;
3756 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
3757 DAG.getConstant(Values[i], dl, EltVT);
3758 Ops.push_back(OpNode);
3759 if (Split)
3760 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
3761 DAG.getConstant(0, dl, EltVT));
3762 }
3763 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3764 if (Split)
3765 ConstsNode = DAG.getBitcast(VT, ConstsNode);
3766 return ConstsNode;
3767}
3768
3769static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
3770 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
3771 assert(Bits.size() == Undefs.getBitWidth() &&
3772 "Unequal constant and undef arrays");
3774 bool Split = false;
3775
3776 MVT ConstVecVT = VT;
3777 unsigned NumElts = VT.getVectorNumElements();
3778 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3779 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3780 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3781 Split = true;
3782 }
3783
3784 MVT EltVT = ConstVecVT.getVectorElementType();
3785 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
3786 if (Undefs[i]) {
3787 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
3788 continue;
3789 }
3790 const APInt &V = Bits[i];
3791 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
3792 if (Split) {
3793 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
3794 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
3795 } else if (EltVT == MVT::f32) {
3797 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3798 } else if (EltVT == MVT::f64) {
3800 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3801 } else {
3802 Ops.push_back(DAG.getConstant(V, dl, EltVT));
3803 }
3804 }
3805
3806 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3807 return DAG.getBitcast(VT, ConstsNode);
3808}
3809
3811 SelectionDAG &DAG, const SDLoc &dl) {
3812 APInt Undefs = APInt::getZero(Bits.size());
3813 return getConstVector(Bits, Undefs, VT, DAG, dl);
3814}
3815
3816/// Returns a vector of specified type with all zero elements.
3817static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
3818 SelectionDAG &DAG, const SDLoc &dl) {
3819 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
3820 VT.getVectorElementType() == MVT::i1) &&
3821 "Unexpected vector type");
3822
3823 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
3824 // type. This ensures they get CSE'd. But if the integer type is not
3825 // available, use a floating-point +0.0 instead.
3826 SDValue Vec;
3827 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3828 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
3829 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
3830 } else if (VT.isFloatingPoint() &&
3832 Vec = DAG.getConstantFP(+0.0, dl, VT);
3833 } else if (VT.getVectorElementType() == MVT::i1) {
3834 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
3835 "Unexpected vector type");
3836 Vec = DAG.getConstant(0, dl, VT);
3837 } else {
3838 unsigned Num32BitElts = VT.getSizeInBits() / 32;
3839 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
3840 }
3841 return DAG.getBitcast(VT, Vec);
3842}
3843
3844 // Helper to determine if the ops are all extracted subvectors that come from a
3845 // single source. If we allow commuting, they don't have to be in order (Lo/Hi).
3846static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
3847 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3848 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3849 LHS.getValueType() != RHS.getValueType() ||
3850 LHS.getOperand(0) != RHS.getOperand(0))
3851 return SDValue();
3852
3853 SDValue Src = LHS.getOperand(0);
3854 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
3855 return SDValue();
3856
3857 unsigned NumElts = LHS.getValueType().getVectorNumElements();
3858 if ((LHS.getConstantOperandAPInt(1) == 0 &&
3859 RHS.getConstantOperandAPInt(1) == NumElts) ||
3860 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
3861 LHS.getConstantOperandAPInt(1) == NumElts))
3862 return Src;
3863
3864 return SDValue();
3865}
3866
3867static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
3868 const SDLoc &dl, unsigned vectorWidth) {
3869 EVT VT = Vec.getValueType();
3870 EVT ElVT = VT.getVectorElementType();
3871 unsigned Factor = VT.getSizeInBits() / vectorWidth;
3872 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
3873 VT.getVectorNumElements() / Factor);
3874
3875 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
3876 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
3877 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3878
3879 // This is the index of the first element of the vectorWidth-bit chunk
3880 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
3881 IdxVal &= ~(ElemsPerChunk - 1);
3882
3883 // If the input is a buildvector just emit a smaller one.
3884 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
3885 return DAG.getBuildVector(ResultVT, dl,
3886 Vec->ops().slice(IdxVal, ElemsPerChunk));
3887
3888 // Check if we're extracting the upper undef of a widening pattern.
3889 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
3890 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
3891 isNullConstant(Vec.getOperand(2)))
3892 return DAG.getUNDEF(ResultVT);
3893
3894 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3895 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
3896}
3897
3898/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
3899/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
3900/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
3901/// instructions or a simple subregister reference. Idx is an index in the
3902/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
3903/// lowering EXTRACT_VECTOR_ELT operations easier.
3904static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
3905 SelectionDAG &DAG, const SDLoc &dl) {
3907 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
3908 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
3909}
3910
3911/// Generate a DAG to grab 256-bits from a 512-bit vector.
3912static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
3913 SelectionDAG &DAG, const SDLoc &dl) {
3914 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
3915 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
3916}
3917
3918static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
3919 SelectionDAG &DAG, const SDLoc &dl,
3920 unsigned vectorWidth) {
3921 assert((vectorWidth == 128 || vectorWidth == 256) &&
3922 "Unsupported vector width");
3923 // Inserting UNDEF just yields Result.
3924 if (Vec.isUndef())
3925 return Result;
3926 EVT VT = Vec.getValueType();
3927 EVT ElVT = VT.getVectorElementType();
3928 EVT ResultVT = Result.getValueType();
3929
3930 // Insert the relevant vectorWidth bits.
3931 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
3932 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3933
3934 // This is the index of the first element of the vectorWidth-bit chunk
3935 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
3936 IdxVal &= ~(ElemsPerChunk - 1);
3937
3938 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3939 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
3940}
3941
3942/// Generate a DAG to put 128-bits into a vector > 128 bits. This
3943/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
3944/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
3945/// simple superregister reference. Idx is an index in the 128 bits
3946/// we want. It need not be aligned to a 128-bit boundary. That makes
3947/// lowering INSERT_VECTOR_ELT operations easier.
3948static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
3949 SelectionDAG &DAG, const SDLoc &dl) {
3950 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
3951 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
3952}
3953
3954/// Widen a vector to a larger size with the same scalar type, with the new
3955/// elements either zero or undef.
3956static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
3957 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3958 const SDLoc &dl) {
3960 Vec.getValueType().getScalarType() == VT.getScalarType() &&
3961 "Unsupported vector widening type");
3962 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
3963 : DAG.getUNDEF(VT);
3964 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
3965 DAG.getIntPtrConstant(0, dl));
3966}
3967
3968/// Widen a vector to a larger size with the same scalar type, with the new
3969/// elements either zero or undef.
3970static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
3971 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3972 const SDLoc &dl, unsigned WideSizeInBits) {
3973 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
3974 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
3975 "Unsupported vector widening type");
3976 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
3977 MVT SVT = Vec.getSimpleValueType().getScalarType();
3978 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
3979 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
3980}
3981
3982/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
3983/// and bitcast with integer types.
3984static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
3985 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
3986 unsigned NumElts = VT.getVectorNumElements();
3987 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
3988 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
3989 return VT;
3990}
3991
3992/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
3993/// bitcast with integer types.
3994static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
3995 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3996 const SDLoc &dl) {
3997 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
3998 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
3999}
4000
4001// Helper function to collect subvector ops that are concatenated together,
4002 // either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4003// The subvectors in Ops are guaranteed to be the same type.
4005 SelectionDAG &DAG) {
4006 assert(Ops.empty() && "Expected an empty ops vector");
4007
4008 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4009 Ops.append(N->op_begin(), N->op_end());
4010 return true;
4011 }
4012
4013 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4014 SDValue Src = N->getOperand(0);
4015 SDValue Sub = N->getOperand(1);
4016 const APInt &Idx = N->getConstantOperandAPInt(2);
4017 EVT VT = Src.getValueType();
4018 EVT SubVT = Sub.getValueType();
4019
4020 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4021 // insert_subvector(undef, x, lo)
4022 if (Idx == 0 && Src.isUndef()) {
4023 Ops.push_back(Sub);
4024 Ops.push_back(DAG.getUNDEF(SubVT));
4025 return true;
4026 }
4027 if (Idx == (VT.getVectorNumElements() / 2)) {
4028 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4029 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4030 Src.getOperand(1).getValueType() == SubVT &&
4031 isNullConstant(Src.getOperand(2))) {
4032 // Attempt to recurse into inner (matching) concats.
4033 SDValue Lo = Src.getOperand(1);
4034 SDValue Hi = Sub;
4035 SmallVector<SDValue, 2> LoOps, HiOps;
4036 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4037 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4038 LoOps.size() == HiOps.size()) {
4039 Ops.append(LoOps);
4040 Ops.append(HiOps);
4041 return true;
4042 }
4043 Ops.push_back(Lo);
4044 Ops.push_back(Hi);
4045 return true;
4046 }
4047 // insert_subvector(x, extract_subvector(x, lo), hi)
4048 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4049 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4050 Ops.append(2, Sub);
4051 return true;
4052 }
4053 // insert_subvector(undef, x, hi)
4054 if (Src.isUndef()) {
4055 Ops.push_back(DAG.getUNDEF(SubVT));
4056 Ops.push_back(Sub);
4057 return true;
4058 }
4059 }
4060 }
4061 }
4062
4063 return false;
4064}
4065
4066 // Helper to check if \p V can be split into subvectors with all of the upper
4067 // subvectors undef; in that case, return the lower half of \p V.
4069 SelectionDAG &DAG) {
4070 SmallVector<SDValue> SubOps;
4071 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4072 return SDValue();
4073
4074 unsigned NumSubOps = SubOps.size();
4075 unsigned HalfNumSubOps = NumSubOps / 2;
4076 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4077
4078 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4079 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4080 return SDValue();
4081
4082 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4083 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4084 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4085}
4086
4087// Helper to check if we can access all the constituent subvectors without any
4088// extract ops.
4091 return collectConcatOps(N, Ops, DAG);
4092}
4093
4094static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4095 const SDLoc &dl) {
4096 EVT VT = Op.getValueType();
4097 unsigned NumElems = VT.getVectorNumElements();
4098 unsigned SizeInBits = VT.getSizeInBits();
4099 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4100 "Can't split odd sized vector");
4101
4102 // If this is a splat value (with no-undefs) then use the lower subvector,
4103 // which should be a free extraction.
4104 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4105 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4106 return std::make_pair(Lo, Lo);
4107
4108 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4109 return std::make_pair(Lo, Hi);
4110}
4111
4112/// Break an operation into 2 half sized ops and then concatenate the results.
4114 unsigned NumOps = Op.getNumOperands();
4115 EVT VT = Op.getValueType();
4116
4117 // Split each vector operand into its Lo/Hi halves.
4118 SmallVector<SDValue> LoOps(NumOps, SDValue());
4119 SmallVector<SDValue> HiOps(NumOps, SDValue());
4120 for (unsigned I = 0; I != NumOps; ++I) {
4121 SDValue SrcOp = Op.getOperand(I);
4122 if (!SrcOp.getValueType().isVector()) {
4123 LoOps[I] = HiOps[I] = SrcOp;
4124 continue;
4125 }
4126 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4127 }
4128
4129 EVT LoVT, HiVT;
4130 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4131 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4132 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4133 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4134}
4135
4136 /// Break a unary integer operation into 2 half sized ops and then
4137/// concatenate the result back.
4139 const SDLoc &dl) {
4140 // Make sure we only try to split 256/512-bit types to avoid creating
4141 // narrow vectors.
4142 EVT VT = Op.getValueType();
4143 (void)VT;
4144 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4145 Op.getOperand(0).getValueType().is512BitVector()) &&
4146 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4147 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4148 VT.getVectorNumElements() &&
4149 "Unexpected VTs!");
4150 return splitVectorOp(Op, DAG, dl);
4151}
4152
4153/// Break a binary integer operation into 2 half sized ops and then
4154/// concatenate the result back.
4156 const SDLoc &dl) {
4157 // Assert that all the types match.
4158 EVT VT = Op.getValueType();
4159 (void)VT;
4160 assert(Op.getOperand(0).getValueType() == VT &&
4161 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4162 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4163 return splitVectorOp(Op, DAG, dl);
4164}
4165
4166// Helper for splitting operands of an operation to legal target size and
4167// apply a function on each part.
4168// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4169// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4170// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4171// The argument Builder is a function that will be applied on each split part:
4172// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4173template <typename F>
4175 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4176 F Builder, bool CheckBWI = true) {
4177 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4178 unsigned NumSubs = 1;
4179 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4180 (!CheckBWI && Subtarget.useAVX512Regs())) {
4181 if (VT.getSizeInBits() > 512) {
4182 NumSubs = VT.getSizeInBits() / 512;
4183 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4184 }
4185 } else if (Subtarget.hasAVX2()) {
4186 if (VT.getSizeInBits() > 256) {
4187 NumSubs = VT.getSizeInBits() / 256;
4188 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4189 }
4190 } else {
4191 if (VT.getSizeInBits() > 128) {
4192 NumSubs = VT.getSizeInBits() / 128;
4193 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4194 }
4195 }
4196
4197 if (NumSubs == 1)
4198 return Builder(DAG, DL, Ops);
4199
4199
4200 SmallVector<SDValue, 4> Subs;
4201 for (unsigned i = 0; i != NumSubs; ++i) {
4202 SmallVector<SDValue, 2> SubOps;
4203 for (SDValue Op : Ops) {
4204 EVT OpVT = Op.getValueType();
4205 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4206 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4207 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4208 }
4209 Subs.push_back(Builder(DAG, DL, SubOps));
4210 }
4211 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4212}
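// Illustrative example of SplitOpsAndApply: for VT = v64i8 on an AVX2-only
// target (no BWI), NumSubs == 2, so each 512-bit operand is split into two
// 256-bit subvectors, Builder is invoked once per half, and the two partial
// results are concatenated back into a v64i8 value.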
4213
4214// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4215// targets.
4216static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4217 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4218 const X86Subtarget &Subtarget) {
4219 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4220 MVT SVT = VT.getScalarType();
4221
4222 // If we have a 32/64-bit splatted constant, splat it to DstTy to
4223 // encourage a foldable broadcast'd operand.
4224 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4225 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4226 // AVX512 broadcasts 32/64-bit operands.
4227 // TODO: Support float once getAVX512Node is used by fp-ops.
4228 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4229 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
4230 return SDValue();
4231 // If we're not widening, don't bother if we're not bitcasting.
4232 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4233 return SDValue();
4234 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4235 APInt SplatValue, SplatUndef;
4236 unsigned SplatBitSize;
4237 bool HasAnyUndefs;
4238 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4239 HasAnyUndefs, OpEltSizeInBits) &&
4240 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4241 return DAG.getConstant(SplatValue, DL, DstVT);
4242 }
4243 return SDValue();
4244 };
4245
4246 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4247
4248 MVT DstVT = VT;
4249 if (Widen)
4250 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4251
4252 // Canonicalize src operands.
4253 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
4254 for (SDValue &Op : SrcOps) {
4255 MVT OpVT = Op.getSimpleValueType();
4256 // Just pass through scalar operands.
4257 if (!OpVT.isVector())
4258 continue;
4259 assert(OpVT == VT && "Vector type mismatch");
4260
4261 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4262 Op = BroadcastOp;
4263 continue;
4264 }
4265
4266 // Just widen the subvector by inserting into an undef wide vector.
4267 if (Widen)
4268 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4269 }
4270
4271 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4272
4273 // Perform the 512-bit op then extract the bottom subvector.
4274 if (Widen)
4275 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4276 return Res;
4277}
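// Illustrative example of getAVX512Node: requesting a v4i64 node on an
// AVX512F target without VLX sets Widen, so the operands are widened to
// v8i64 (splatted constants become broadcastable v8i64 constants), the
// 512-bit node is created, and the low 256 bits are extracted as the result.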
4278
4279/// Insert i1-subvector to i1-vector.
4280static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4281 const X86Subtarget &Subtarget) {
4282
4283 SDLoc dl(Op);
4284 SDValue Vec = Op.getOperand(0);
4285 SDValue SubVec = Op.getOperand(1);
4286 SDValue Idx = Op.getOperand(2);
4287 unsigned IdxVal = Op.getConstantOperandVal(2);
4288
4289 // Inserting undef is a nop. We can just return the original vector.
4290 if (SubVec.isUndef())
4291 return Vec;
4292
4293 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4294 return Op;
4295
4296 MVT OpVT = Op.getSimpleValueType();
4297 unsigned NumElems = OpVT.getVectorNumElements();
4298 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
4299
4300 // Extend to natively supported kshift.
4301 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4302
4303 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4304 // if necessary.
4305 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4306 // May need to promote to a legal type.
4307 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4308 DAG.getConstant(0, dl, WideOpVT),
4309 SubVec, Idx);
4310 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4311 }
4312
4313 MVT SubVecVT = SubVec.getSimpleValueType();
4314 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4315 assert(IdxVal + SubVecNumElems <= NumElems &&
4316 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4317 "Unexpected index value in INSERT_SUBVECTOR");
4318
4319 SDValue Undef = DAG.getUNDEF(WideOpVT);
4320
4321 if (IdxVal == 0) {
4322 // Zero lower bits of the Vec
4323 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4324 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4325 ZeroIdx);
4326 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4327 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4328 // Merge them together, SubVec should be zero extended.
4329 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4330 DAG.getConstant(0, dl, WideOpVT),
4331 SubVec, ZeroIdx);
4332 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4333 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4334 }
4335
4336 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4337 Undef, SubVec, ZeroIdx);
4338
4339 if (Vec.isUndef()) {
4340 assert(IdxVal != 0 && "Unexpected index");
4341 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4342 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4343 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4344 }
4345
4346 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4347 assert(IdxVal != 0 && "Unexpected index");
4348 // If upper elements of Vec are known undef, then just shift into place.
4349 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4350 [](SDValue V) { return V.isUndef(); })) {
4351 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4352 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4353 } else {
4354 NumElems = WideOpVT.getVectorNumElements();
4355 unsigned ShiftLeft = NumElems - SubVecNumElems;
4356 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4357 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4358 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4359 if (ShiftRight != 0)
4360 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4361 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4362 }
4363 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4364 }
4365
4366 // Simple case when we put the subvector in the upper part.
4367 if (IdxVal + SubVecNumElems == NumElems) {
4368 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4369 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4370 if (SubVecNumElems * 2 == NumElems) {
4371 // Special case, use legal zero extending insert_subvector. This allows
4372 // isel to optimize when bits are known zero.
4373 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4374 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4375 DAG.getConstant(0, dl, WideOpVT),
4376 Vec, ZeroIdx);
4377 } else {
4378 // Otherwise use explicit shifts to zero the bits.
4379 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4380 Undef, Vec, ZeroIdx);
4381 NumElems = WideOpVT.getVectorNumElements();
4382 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4383 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4384 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4385 }
4386 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4387 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4388 }
4389
4390 // Inserting into the middle is more complicated.
4391
4392 NumElems = WideOpVT.getVectorNumElements();
4393
4394 // Widen the vector if needed.
4395 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4396
4397 unsigned ShiftLeft = NumElems - SubVecNumElems;
4398 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4399
4400 // Do an optimization for the most frequently used types.
4401 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4402 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4403 Mask0.flipAllBits();
4404 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4405 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4406 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4407 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4408 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4409 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4410 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4411 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4412
4413 // Reduce to original width if needed.
4414 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4415 }
4416
4417 // Clear the upper bits of the subvector and move it to its insert position.
4418 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4419 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4420 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4421 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4422
4423 // Isolate the bits below the insertion point.
4424 unsigned LowShift = NumElems - IdxVal;
4425 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4426 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4427 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4428 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4429
4430 // Isolate the bits after the last inserted bit.
4431 unsigned HighShift = IdxVal + SubVecNumElems;
4432 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4433 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4434 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4435 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4436
4437 // Now OR all 3 pieces together.
4438 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4439 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4440
4441 // Reduce to original width if needed.
4442 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4443}
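// Illustrative example of insert1BitVector: inserting a v8i1 subvector into
// the upper half of a v16i1 vector (IdxVal == 8) takes the "upper part" path
// above - the subvector is shifted into place with KSHIFTL, the original
// vector is reduced to its low half via a zero-extending insert_subvector,
// and the two pieces are ORed together.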
4444
4445static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4446 const SDLoc &dl) {
4447 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4448 EVT SubVT = V1.getValueType();
4449 EVT SubSVT = SubVT.getScalarType();
4450 unsigned SubNumElts = SubVT.getVectorNumElements();
4451 unsigned SubVectorWidth = SubVT.getSizeInBits();
4452 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4453 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4454 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4455}
4456
4457/// Returns a vector of specified type with all bits set.
4458/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4459/// Then bitcast to their original type, ensuring they get CSE'd.
4460static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4461 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4462 "Expected a 128/256/512-bit vector type");
4463 unsigned NumElts = VT.getSizeInBits() / 32;
4464 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4465 return DAG.getBitcast(VT, Vec);
4466}
4467
4468static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4469 SDValue In, SelectionDAG &DAG) {
4470 EVT InVT = In.getValueType();
4471 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4472 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
4473 ISD::ZERO_EXTEND == Opcode) &&
4474 "Unknown extension opcode");
4475
4476 // For 256-bit vectors, we only need the lower (128-bit) input half.
4477 // For 512-bit vectors, we only need the lower input half or quarter.
4478 if (InVT.getSizeInBits() > 128) {
4479 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4480 "Expected VTs to be the same size!");
4481 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4482 In = extractSubVector(In, 0, DAG, DL,
4483 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4484 InVT = In.getValueType();
4485 }
4486
4487 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4488 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4489
4490 return DAG.getNode(Opcode, DL, VT, In);
4491}
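// Illustrative example of getEXTEND_VECTOR_INREG: zero-extending a 256-bit
// v32i8 input to a v8i32 result only needs the low 128 bits, so the input is
// first narrowed to v16i8; since the element counts then differ, the opcode
// becomes the corresponding ZERO_EXTEND_VECTOR_INREG node.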
4492
4493// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4494static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4495 SDValue Mask, SelectionDAG &DAG) {
4496 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4497 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4498 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4499}
4500
4501static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4502 bool Lo, bool Unary) {
4503 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4504 "Illegal vector type to unpack");
4505 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4506 int NumElts = VT.getVectorNumElements();
4507 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4508 for (int i = 0; i < NumElts; ++i) {
4509 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4510 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4511 Pos += (Unary ? 0 : NumElts * (i % 2));
4512 Pos += (Lo ? 0 : NumEltsInLane / 2);
4513 Mask.push_back(Pos);
4514 }
4515}
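// Illustrative example of createUnpackShuffleMask: for v8i16 with Lo = true
// and Unary = false this produces the PUNPCKLWD mask <0,8,1,9,2,10,3,11>;
// with Lo = false it produces the PUNPCKHWD mask <4,12,5,13,6,14,7,15>.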
4516
4517/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4518/// imposed by AVX and specific to the unary pattern. Example:
4519/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4520/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4521static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4522 bool Lo) {
4523 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4524 int NumElts = VT.getVectorNumElements();
4525 for (int i = 0; i < NumElts; ++i) {
4526 int Pos = i / 2;
4527 Pos += (Lo ? 0 : NumElts / 2);
4528 Mask.push_back(Pos);
4529 }
4530}
4531
4532// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4533static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4534 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4535 if (ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) &&
4536 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4537 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4538 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4539 int M = Mask[I];
4540 if (M < 0)
4541 continue;
4542 SDValue V = (M < NumElts) ? V1 : V2;
4543 if (V.isUndef())
4544 continue;
4545 Ops[I] = V.getOperand(M % NumElts);
4546 }
4547 return DAG.getBuildVector(VT, dl, Ops);
4548 }
4549
4550 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4551}
4552
4553/// Returns a vector_shuffle node for an unpackl operation.
4554static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4555 SDValue V1, SDValue V2) {
4556 SmallVector<int, 8> Mask;
4557 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4558 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4559}
4560
4561/// Returns a vector_shuffle node for an unpackh operation.
4562static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4563 SDValue V1, SDValue V2) {
4564 SmallVector<int, 8> Mask;
4565 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4566 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4567}
4568
4569/// Returns a node that packs the LHS + RHS nodes together at half width.
4570/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4571/// TODO: Add subvector splitting if/when we have a need for it.
4572static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4573 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4574 bool PackHiHalf = false) {
4575 MVT OpVT = LHS.getSimpleValueType();
4576 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4577 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4578 assert(OpVT == RHS.getSimpleValueType() &&
4579 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4580 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4581 "Unexpected PACK operand types");
4582 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4583 "Unexpected PACK result type");
4584
4585 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4586 if (EltSizeInBits == 32) {
4587 SmallVector<int> PackMask;
4588 int Offset = PackHiHalf ? 1 : 0;
4589 int NumElts = VT.getVectorNumElements();
4590 for (int I = 0; I != NumElts; I += 4) {
4591 PackMask.push_back(I + Offset);
4592 PackMask.push_back(I + Offset + 2);
4593 PackMask.push_back(I + Offset + NumElts);
4594 PackMask.push_back(I + Offset + NumElts + 2);
4595 }
4596 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4597 DAG.getBitcast(VT, RHS), PackMask);
4598 }
4599
4600 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4601 if (!PackHiHalf) {
4602 if (UsePackUS &&
4603 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4604 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4605 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4606
4607 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4608 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4609 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4610 }
4611
4612 // Fallback to sign/zero extending the requested half and pack.
4613 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4614 if (UsePackUS) {
4615 if (PackHiHalf) {
4616 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4617 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4618 } else {
4619 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4620 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4621 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4622 }
4623 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4624 }
4625
4626 if (!PackHiHalf) {
4627 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4628 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4629 }
4630 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4631 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4632 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4633}
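// Illustrative example of getPack: packing two v4i64 operands into a v8i32
// result takes the EltSizeInBits == 32 path above and emits a shuffle of the
// bitcast inputs with mask <0,2,8,10,4,6,12,14> (the low i32 of each i64),
// or <1,3,9,11,5,7,13,15> when PackHiHalf is set.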
4634
4635/// Return a vector_shuffle of the specified vector and a zero or undef vector.
4636/// This produces a shuffle where the low element of V2 is swizzled into the
4637/// zero/undef vector, landing at element Idx.
4638/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4639static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4640 bool IsZero,
4641 const X86Subtarget &Subtarget,
4642 SelectionDAG &DAG) {
4643 MVT VT = V2.getSimpleValueType();
4644 SDValue V1 = IsZero
4645 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4646 int NumElems = VT.getVectorNumElements();
4647 SmallVector<int, 16> MaskVec(NumElems);
4648 for (int i = 0; i != NumElems; ++i)
4649 // If this is the insertion idx, put the low elt of V2 here.
4650 MaskVec[i] = (i == Idx) ? NumElems : i;
4651 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4652}
4653
4654static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4655 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4656 Ptr.getOpcode() == X86ISD::WrapperRIP)
4657 Ptr = Ptr.getOperand(0);
4658 return dyn_cast<ConstantPoolSDNode>(Ptr);
4659}
4660
4661// TODO: Add support for non-zero offsets.
4662static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4663 ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4664 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4665 return nullptr;
4666 return CNode->getConstVal();
4667}
4668
4669static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4670 if (!Load || !ISD::isNormalLoad(Load))
4671 return nullptr;
4672 return getTargetConstantFromBasePtr(Load->getBasePtr());
4673}
4674
4675static const Constant *getTargetConstantFromNode(SDValue Op) {
4676 Op = peekThroughBitcasts(Op);
4677 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4678}
4679
4680const Constant *
4681X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4682 assert(LD && "Unexpected null LoadSDNode");
4683 return getTargetConstantFromNode(LD);
4684}
4685
4686// Extract raw constant bits from constant pools.
4687static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4688 APInt &UndefElts,
4689 SmallVectorImpl<APInt> &EltBits,
4690 bool AllowWholeUndefs = true,
4691 bool AllowPartialUndefs = false) {
4692 assert(EltBits.empty() && "Expected an empty EltBits vector");
4693
4694 Op = peekThroughBitcasts(Op);
4695
4696 EVT VT = Op.getValueType();
4697 unsigned SizeInBits = VT.getSizeInBits();
4698 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
4699 unsigned NumElts = SizeInBits / EltSizeInBits;
4700
4701 // Bitcast a source array of element bits to the target size.
4702 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
4703 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
4704 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
4705 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
4706 "Constant bit sizes don't match");
4707
4708 // Don't split if we don't allow undef bits.
4709 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
4710 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
4711 return false;
4712
4713 // If we're already the right size, don't bother bitcasting.
4714 if (NumSrcElts == NumElts) {
4715 UndefElts = UndefSrcElts;
4716 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
4717 return true;
4718 }
4719
4720 // Extract all the undef/constant element data and pack into single bitsets.
4721 APInt UndefBits(SizeInBits, 0);
4722 APInt MaskBits(SizeInBits, 0);
4723
4724 for (unsigned i = 0; i != NumSrcElts; ++i) {
4725 unsigned BitOffset = i * SrcEltSizeInBits;
4726 if (UndefSrcElts[i])
4727 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
4728 MaskBits.insertBits(SrcEltBits[i], BitOffset);
4729 }
4730
4731 // Split the undef/constant single bitset data into the target elements.
4732 UndefElts = APInt(NumElts, 0);
4733 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
4734
4735 for (unsigned i = 0; i != NumElts; ++i) {
4736 unsigned BitOffset = i * EltSizeInBits;
4737 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
4738
4739 // Only treat an element as UNDEF if all bits are UNDEF.
4740 if (UndefEltBits.isAllOnes()) {
4741 if (!AllowWholeUndefs)
4742 return false;
4743 UndefElts.setBit(i);
4744 continue;
4745 }
4746
4747 // If only some bits are UNDEF then treat them as zero (or bail if not
4748 // supported).
4749 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
4750 return false;
4751
4752 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
4753 }
4754 return true;
4755 };
4756
4757 // Collect constant bits and insert into mask/undef bit masks.
4758 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
4759 unsigned UndefBitIndex) {
4760 if (!Cst)
4761 return false;
4762 if (isa<UndefValue>(Cst)) {
4763 Undefs.setBit(UndefBitIndex);
4764 return true;
4765 }
4766 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
4767 Mask = CInt->getValue();
4768 return true;
4769 }
4770 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
4771 Mask = CFP->getValueAPF().bitcastToAPInt();
4772 return true;
4773 }
4774 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
4775 Type *Ty = CDS->getType();
4776 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
4777 Type *EltTy = CDS->getElementType();
4778 bool IsInteger = EltTy->isIntegerTy();
4779 bool IsFP =
4780 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4781 if (!IsInteger && !IsFP)
4782 return false;
4783 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4784 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4785 if (IsInteger)
4786 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4787 else
4788 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
4789 I * EltBits);
4790 return true;
4791 }
4792 return false;
4793 };
4794
4795 // Handle UNDEFs.
4796 if (Op.isUndef()) {
4797 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
4798 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
4799 return CastBitData(UndefSrcElts, SrcEltBits);
4800 }
4801
4802 // Extract scalar constant bits.
4803 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
4804 APInt UndefSrcElts = APInt::getZero(1);
4805 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
4806 return CastBitData(UndefSrcElts, SrcEltBits);
4807 }
4808 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
4809 APInt UndefSrcElts = APInt::getZero(1);
4810 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
4811 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
4812 return CastBitData(UndefSrcElts, SrcEltBits);
4813 }
4814
4815 // Extract constant bits from build vector.
4816 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
4817 BitVector Undefs;
4818 SmallVector<APInt> SrcEltBits;
4819 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4820 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
4821 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
4822 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
4823 if (Undefs[I])
4824 UndefSrcElts.setBit(I);
4825 return CastBitData(UndefSrcElts, SrcEltBits);
4826 }
4827 }
4828
4829 // Extract constant bits from constant pool vector.
4830 if (auto *Cst = getTargetConstantFromNode(Op)) {
4831 Type *CstTy = Cst->getType();
4832 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4833 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
4834 return false;
4835
4836 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
4837 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4838 if ((SizeInBits % SrcEltSizeInBits) != 0)
4839 return false;
4840
4841 APInt UndefSrcElts(NumSrcElts, 0);
4842 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
4843 for (unsigned i = 0; i != NumSrcElts; ++i)
4844 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
4845 UndefSrcElts, i))
4846 return false;
4847
4848 return CastBitData(UndefSrcElts, SrcEltBits);
4849 }
4850
4851 // Extract constant bits from a broadcasted constant pool scalar.
4852 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
4853 EltSizeInBits <= VT.getScalarSizeInBits()) {
4854 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4855 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
4856 return false;
4857
4858 SDValue Ptr = MemIntr->getBasePtr();
4859 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
4860 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4861 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4862
4863 APInt UndefSrcElts(NumSrcElts, 0);
4864 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
4865 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
4866 if (UndefSrcElts[0])
4867 UndefSrcElts.setBits(0, NumSrcElts);
4868 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
4869 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
4870 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
4871 return CastBitData(UndefSrcElts, SrcEltBits);
4872 }
4873 }
4874 }
4875
4876 // Extract constant bits from a subvector broadcast.
4877 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
4878 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4879 SDValue Ptr = MemIntr->getBasePtr();
4880 // The source constant may be larger than the subvector broadcast,
4881 // ensure we extract the correct subvector constants.
4882 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
4883 Type *CstTy = Cst->getType();
4884 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4885 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
4886 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
4887 (SizeInBits % SubVecSizeInBits) != 0)
4888 return false;
4889 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
4890 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
4891 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
4892 APInt UndefSubElts(NumSubElts, 0);
4893 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
4894 APInt(CstEltSizeInBits, 0));
4895 for (unsigned i = 0; i != NumSubElts; ++i) {
4896 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
4897 UndefSubElts, i))
4898 return false;
4899 for (unsigned j = 1; j != NumSubVecs; ++j)
4900 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
4901 }
4902 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
4903 UndefSubElts);
4904 return CastBitData(UndefSubElts, SubEltBits);
4905 }
4906 }
4907
4908 // Extract a rematerialized scalar constant insertion.
4909 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
4910 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
4911 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
4912 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4913 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4914
4915 APInt UndefSrcElts(NumSrcElts, 0);
4916 SmallVector<APInt, 64> SrcEltBits;
4917 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
4918 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
4919 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
4920 return CastBitData(UndefSrcElts, SrcEltBits);
4921 }
4922
4923 // Insert constant bits from a base and sub vector sources.
4924 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
4925 // If this bitcasts to larger elements we might lose track of undefs - don't
4926 // allow any, to be safe.
4927 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4928 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
4929
4930 APInt UndefSrcElts, UndefSubElts;
4931 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
4932 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
4933 UndefSubElts, EltSubBits,
4934 AllowWholeUndefs && AllowUndefs,
4935 AllowPartialUndefs && AllowUndefs) &&
4936 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
4937 UndefSrcElts, EltSrcBits,
4938 AllowWholeUndefs && AllowUndefs,
4939 AllowPartialUndefs && AllowUndefs)) {
4940 unsigned BaseIdx = Op.getConstantOperandVal(2);
4941 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
4942 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
4943 EltSrcBits[BaseIdx + i] = EltSubBits[i];
4944 return CastBitData(UndefSrcElts, EltSrcBits);
4945 }
4946 }
4947
4948 // Extract constant bits from a subvector's source.
4949 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4950 // TODO - support extract_subvector through bitcasts.
4951 if (EltSizeInBits != VT.getScalarSizeInBits())
4952 return false;
4953
4954 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
4955 UndefElts, EltBits, AllowWholeUndefs,
4956 AllowPartialUndefs)) {
4957 EVT SrcVT = Op.getOperand(0).getValueType();
4958 unsigned NumSrcElts = SrcVT.getVectorNumElements();
4959 unsigned NumSubElts = VT.getVectorNumElements();
4960 unsigned BaseIdx = Op.getConstantOperandVal(1);
4961 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
4962 if ((BaseIdx + NumSubElts) != NumSrcElts)
4963 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
4964 if (BaseIdx != 0)
4965 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
4966 return true;
4967 }
4968 }
4969
4970 // Extract constant bits from shuffle node sources.
4971 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
4972 // TODO - support shuffle through bitcasts.
4973 if (EltSizeInBits != VT.getScalarSizeInBits())
4974 return false;
4975
4976 ArrayRef<int> Mask = SVN->getMask();
4977 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
4978 llvm::any_of(Mask, [](int M) { return M < 0; }))
4979 return false;
4980
4981 APInt UndefElts0, UndefElts1;
4982 SmallVector<APInt, 32> EltBits0, EltBits1;
4983 if (isAnyInRange(Mask, 0, NumElts) &&
4984 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
4985 UndefElts0, EltBits0, AllowWholeUndefs,
4986 AllowPartialUndefs))
4987 return false;
4988 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
4989 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
4990 UndefElts1, EltBits1, AllowWholeUndefs,
4991 AllowPartialUndefs))
4992 return false;
4993
4994 UndefElts = APInt::getZero(NumElts);
4995 for (int i = 0; i != (int)NumElts; ++i) {
4996 int M = Mask[i];
4997 if (M < 0) {
4998 UndefElts.setBit(i);
4999 EltBits.push_back(APInt::getZero(EltSizeInBits));
5000 } else if (M < (int)NumElts) {
5001 if (UndefElts0[M])
5002 UndefElts.setBit(i);
5003 EltBits.push_back(EltBits0[M]);
5004 } else {
5005 if (UndefElts1[M - NumElts])
5006 UndefElts.setBit(i);
5007 EltBits.push_back(EltBits1[M - NumElts]);
5008 }
5009 }
5010 return true;
5011 }
5012
5013 return false;
5014}
5015
5016namespace llvm {
5017namespace X86 {
5018bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5019 APInt UndefElts;
5020 SmallVector<APInt, 16> EltBits;
5021 if (getTargetConstantBitsFromNode(
5022 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5023 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5024 int SplatIndex = -1;
5025 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5026 if (UndefElts[i])
5027 continue;
5028 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5029 SplatIndex = -1;
5030 break;
5031 }
5032 SplatIndex = i;
5033 }
5034 if (0 <= SplatIndex) {
5035 SplatVal = EltBits[SplatIndex];
5036 return true;
5037 }
5038 }
5039
5040 return false;
5041}
5042} // namespace X86
5043} // namespace llvm
5044
5045static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5046 unsigned MaskEltSizeInBits,
5047 SmallVectorImpl<uint64_t> &RawMask,
5048 APInt &UndefElts) {
5049 // Extract the raw target constant bits.
5050 SmallVector<APInt, 64> EltBits;
5051 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5052 EltBits, /* AllowWholeUndefs */ true,
5053 /* AllowPartialUndefs */ false))
5054 return false;
5055
5056 // Insert the extracted elements into the mask.
5057 for (const APInt &Elt : EltBits)
5058 RawMask.push_back(Elt.getZExtValue());
5059
5060 return true;
5061}
5062
5063// Match not(xor X, -1) -> X.
5064// Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5065// Match not(extract_subvector(xor X, -1)) -> extract_subvector(X).
5066// Match not(concat_vectors(xor X, -1, xor Y, -1)) -> concat_vectors(X, Y).
5067static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5068 V = peekThroughBitcasts(V);
5069 if (V.getOpcode() == ISD::XOR &&
5070 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5071 isAllOnesConstant(V.getOperand(1))))
5072 return V.getOperand(0);
5073 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5074 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5075 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5076 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5077 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
5078 Not, V.getOperand(1));
5079 }
5080 }
5081 if (V.getOpcode() == X86ISD::PCMPGT &&
5082 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5083 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5084 V.getOperand(0).hasOneUse()) {
5085 APInt UndefElts;
5086 SmallVector<APInt> EltBits;
5087 if (getTargetConstantBitsFromNode(V.getOperand(0),
5088 V.getScalarValueSizeInBits(), UndefElts,
5089 EltBits)) {
5090 // Don't fold min_signed_value -> (min_signed_value - 1)
5091 bool MinSigned = false;
5092 for (APInt &Elt : EltBits) {
5093 MinSigned |= Elt.isMinSignedValue();
5094 Elt -= 1;
5095 }
5096 if (!MinSigned) {
5097 SDLoc DL(V);
5098 MVT VT = V.getSimpleValueType();
5099 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5100 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5101 }
5102 }
5103 }
5104 SmallVector<SDValue, 2> CatOps;
5105 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5106 for (SDValue &CatOp : CatOps) {
5107 SDValue NotCat = IsNOT(CatOp, DAG);
5108 if (!NotCat) return SDValue();
5109 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5110 }
5111 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
5112 }
5113 return SDValue();
5114}
5115
5116/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5117/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5118/// Note: This ignores saturation, so inputs must be checked first.
5119static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5120 bool Unary, unsigned NumStages = 1) {
5121 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5122 unsigned NumElts = VT.getVectorNumElements();
5123 unsigned NumLanes = VT.getSizeInBits() / 128;
5124 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5125 unsigned Offset = Unary ? 0 : NumElts;
5126 unsigned Repetitions = 1u << (NumStages - 1);
5127 unsigned Increment = 1u << NumStages;
5128 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5129
5130 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5131 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5132 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5133 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5134 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5135 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5136 }
5137 }
5138}
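// Illustrative example of createPackShuffleMask: a single-stage unary pack of
// v16i8 yields <0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14>, while the binary form
// replaces the repeated half with <16,18,20,22,24,26,28,30>.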
5139
5140// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5141static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5142 APInt &DemandedLHS, APInt &DemandedRHS) {
5143 int NumLanes = VT.getSizeInBits() / 128;
5144 int NumElts = DemandedElts.getBitWidth();
5145 int NumInnerElts = NumElts / 2;
5146 int NumEltsPerLane = NumElts / NumLanes;
5147 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5148
5149 DemandedLHS = APInt::getZero(NumInnerElts);
5150 DemandedRHS = APInt::getZero(NumInnerElts);
5151
5152 // Map DemandedElts to the packed operands.
5153 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5154 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5155 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5156 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5157 if (DemandedElts[OuterIdx])
5158 DemandedLHS.setBit(InnerIdx);
5159 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5160 DemandedRHS.setBit(InnerIdx);
5161 }
5162 }
5163}
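// Illustrative example of getPackDemandedElts: for a 128-bit PACK producing
// v16i8, demanded result elements 3 and 12 map to element 3 of the LHS
// (v8i16) operand and element 4 of the RHS operand respectively.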
5164
5165// Split the demanded elts of a HADD/HSUB node between its operands.
5166static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5167 APInt &DemandedLHS, APInt &DemandedRHS) {
5168 int NumLanes = VT.getSizeInBits() / 128;
5169 int NumElts = DemandedElts.getBitWidth();
5170 int NumEltsPerLane = NumElts / NumLanes;
5171 int HalfEltsPerLane = NumEltsPerLane / 2;
5172
5173 DemandedLHS = APInt::getZero(NumElts);
5174 DemandedRHS = APInt::getZero(NumElts);
5175
5176 // Map DemandedElts to the horizontal operands.
5177 for (int Idx = 0; Idx != NumElts; ++Idx) {
5178 if (!DemandedElts[Idx])
5179 continue;
5180 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
5181 int LocalIdx = Idx % NumEltsPerLane;
5182 if (LocalIdx < HalfEltsPerLane) {
5183 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
5184 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
5185 } else {
5186 LocalIdx -= HalfEltsPerLane;
5187 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
5188 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
5189 }
5190 }
5191}
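// Illustrative example of getHorizDemandedElts: for a v8i32 HADD, demanding
// result element 5 (lane 1, local index 1) demands elements 6 and 7 of the
// LHS operand, since each result element combines an adjacent pair.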
5192
5193/// Calculates the shuffle mask corresponding to the target-specific opcode.
5194/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5195/// operands in \p Ops, and returns true.
5196/// Sets \p IsUnary to true if only one source is used. Note that this will set
5197/// IsUnary for shuffles which use a single input multiple times, and in those
5198/// cases it will adjust the mask to only have indices within that single input.
5199/// It is an error to call this with non-empty Mask/Ops vectors.
5200static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5201 SmallVectorImpl<SDValue> &Ops,
5202 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5203 if (!isTargetShuffle(N.getOpcode()))
5204 return false;
5205
5206 MVT VT = N.getSimpleValueType();
5207 unsigned NumElems = VT.getVectorNumElements();
5208 unsigned MaskEltSize = VT.getScalarSizeInBits();
5209 SmallVector<uint64_t, 32> RawMask;
5210 APInt RawUndefs;
5211 uint64_t ImmN;
5212
5213 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5214 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5215
5216 IsUnary = false;
5217 bool IsFakeUnary = false;
5218 switch (N.getOpcode()) {
5219 case X86ISD::BLENDI:
5220 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5221 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5222 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5223 DecodeBLENDMask(NumElems, ImmN, Mask);
5224 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5225 break;
5226 case X86ISD::SHUFP:
5227 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5228 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5229 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5230 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5231 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5232 break;
5233 case X86ISD::INSERTPS:
5234 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5235 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5236 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5237 DecodeINSERTPSMask(ImmN, Mask);
5238 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5239 break;
5240 case X86ISD::EXTRQI:
5241 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5242 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5243 isa<ConstantSDNode>(N.getOperand(2))) {
5244 int BitLen = N.getConstantOperandVal(1);
5245 int BitIdx = N.getConstantOperandVal(2);
5246 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5247 IsUnary = true;
5248 }
5249 break;
5250 case X86ISD::INSERTQI:
5251 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5252 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5253 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5254 isa<ConstantSDNode>(N.getOperand(3))) {
5255 int BitLen = N.getConstantOperandVal(2);
5256 int BitIdx = N.getConstantOperandVal(3);
5257 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5258 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5259 }
5260 break;
5261 case X86ISD::UNPCKH:
5262 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5263 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5264 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5265 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5266 break;
5267 case X86ISD::UNPCKL:
5268 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5269 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5270 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5271 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5272 break;
5273 case X86ISD::MOVHLPS:
5274 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5275 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5276 DecodeMOVHLPSMask(NumElems, Mask);
5277 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5278 break;
5279 case X86ISD::MOVLHPS:
5280 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5281 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5282 DecodeMOVLHPSMask(NumElems, Mask);
5283 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5284 break;
5285 case X86ISD::VALIGN:
5286 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5287 "Only 32-bit and 64-bit elements are supported!");
5288 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5289 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5290 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5291 DecodeVALIGNMask(NumElems, ImmN, Mask);
5292 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5293 Ops.push_back(N.getOperand(1));
5294 Ops.push_back(N.getOperand(0));
5295 break;
5296 case X86ISD::PALIGNR:
5297 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5298 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5299 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5300 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5301 DecodePALIGNRMask(NumElems, ImmN, Mask);
5302 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5303 Ops.push_back(N.getOperand(1));
5304 Ops.push_back(N.getOperand(0));
5305 break;
5306 case X86ISD::VSHLDQ:
5307 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5308 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5309 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5310 DecodePSLLDQMask(NumElems, ImmN, Mask);
5311 IsUnary = true;
5312 break;
5313 case X86ISD::VSRLDQ:
5314 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5315 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5316 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5317 DecodePSRLDQMask(NumElems, ImmN, Mask);
5318 IsUnary = true;
5319 break;
5320 case X86ISD::PSHUFD:
5321 case X86ISD::VPERMILPI:
5322 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5323 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5324 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5325 IsUnary = true;
5326 break;
5327 case X86ISD::PSHUFHW:
5328 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5329 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5330 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5331 IsUnary = true;
5332 break;
5333 case X86ISD::PSHUFLW:
5334 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5335 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5336 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5337 IsUnary = true;
5338 break;
5339 case X86ISD::VZEXT_MOVL:
5340 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5341 DecodeZeroMoveLowMask(NumElems, Mask);
5342 IsUnary = true;
5343 break;
5344 case X86ISD::VBROADCAST:
5345 // We only decode broadcasts of same-sized vectors, as peeking through to
5346 // extracted subvectors is likely to cause hasOneUse issues with
5347 // SimplifyDemandedBits etc.
5348 if (N.getOperand(0).getValueType() == VT) {
5349 DecodeVectorBroadcast(NumElems, Mask);
5350 IsUnary = true;
5351 break;
5352 }
5353 return false;
5354 case X86ISD::VPERMILPV: {
5355 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5356 IsUnary = true;
5357 SDValue MaskNode = N.getOperand(1);
5358 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5359 RawUndefs)) {
5360 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5361 break;
5362 }
5363 return false;
5364 }
5365 case X86ISD::PSHUFB: {
5366 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5367 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5368 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5369 IsUnary = true;
5370 SDValue MaskNode = N.getOperand(1);
5371 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5372 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5373 break;
5374 }
5375 return false;
5376 }
5377 case X86ISD::VPERMI:
5378 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5379 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5380 DecodeVPERMMask(NumElems, ImmN, Mask);
5381 IsUnary = true;
5382 break;
5383 case X86ISD::MOVSS:
5384 case X86ISD::MOVSD:
5385 case X86ISD::MOVSH:
5386 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5387 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5388 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5389 break;
5390 case X86ISD::VPERM2X128:
5391 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5392 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5393 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5394 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5395 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5396 break;
5397 case X86ISD::SHUF128:
5398 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5399 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5400 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5401 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5402 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5403 break;
5404 case X86ISD::MOVSLDUP:
5405 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5406 DecodeMOVSLDUPMask(NumElems, Mask);
5407 IsUnary = true;
5408 break;
5409 case X86ISD::MOVSHDUP:
5410 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5411 DecodeMOVSHDUPMask(NumElems, Mask);
5412 IsUnary = true;
5413 break;
5414 case X86ISD::MOVDDUP:
5415 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5416 DecodeMOVDDUPMask(NumElems, Mask);
5417 IsUnary = true;
5418 break;
5419 case X86ISD::VPERMIL2: {
5420 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5421 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5422 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5423 SDValue MaskNode = N.getOperand(2);
5424 SDValue CtrlNode = N.getOperand(3);
5425 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5426 unsigned CtrlImm = CtrlOp->getZExtValue();
5427 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5428 RawUndefs)) {
5429 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5430 Mask);
5431 break;
5432 }
5433 }
5434 return false;
5435 }
5436 case X86ISD::VPPERM: {
5437 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5438 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5439 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5440 SDValue MaskNode = N.getOperand(2);
5441 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5442 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5443 break;
5444 }
5445 return false;
5446 }
5447 case X86ISD::VPERMV: {
5448 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5449 IsUnary = true;
5450 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5451 Ops.push_back(N.getOperand(1));
5452 SDValue MaskNode = N.getOperand(0);
5453 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5454 RawUndefs)) {
5455 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5456 break;
5457 }
5458 return false;
5459 }
5460 case X86ISD::VPERMV3: {
5461 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5462 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5463 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5464 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5465 Ops.push_back(N.getOperand(0));
5466 Ops.push_back(N.getOperand(2));
5467 SDValue MaskNode = N.getOperand(1);
5468 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5469 RawUndefs)) {
5470 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5471 break;
5472 }
5473 return false;
5474 }
5475 default:
5476 llvm_unreachable("unknown target shuffle node");
5477 }
5478
5479 // Empty mask indicates the decode failed.
5480 if (Mask.empty())
5481 return false;
5482
5483 // Check if we're getting a shuffle mask with zero'd elements.
5484 if (!AllowSentinelZero && isAnyZero(Mask))
5485 return false;
5486
5487 // If we have a fake unary shuffle, the shuffle mask is spread across two
5488 // inputs that are actually the same node. Re-map the mask to always point
5489 // into the first input.
5490 if (IsFakeUnary)
5491 for (int &M : Mask)
5492 if (M >= (int)Mask.size())
5493 M -= Mask.size();
5494
5495 // If we didn't already add operands in the opcode-specific code, default to
5496 // adding 1 or 2 operands starting at 0.
5497 if (Ops.empty()) {
5498 Ops.push_back(N.getOperand(0));
5499 if (!IsUnary || IsFakeUnary)
5500 Ops.push_back(N.getOperand(1));
5501 }
5502
5503 return true;
5504}
5505
5506// Wrapper for getTargetShuffleMask that ignores the IsUnary result.
5507static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5508 SmallVectorImpl<SDValue> &Ops,
5509 SmallVectorImpl<int> &Mask) {
5510 bool IsUnary;
5511 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5512}
5513
5514/// Compute whether each element of a shuffle is zeroable.
5515///
5516/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5517/// Either it is an undef element in the shuffle mask, the element of the input
5518/// referenced is undef, or the element of the input referenced is known to be
5519/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5520/// as many lanes with this technique as possible to simplify the remaining
5521/// shuffle.
5522static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5523 SDValue V1, SDValue V2,
5524 APInt &KnownUndef, APInt &KnownZero) {
5525 int Size = Mask.size();
5526 KnownUndef = KnownZero = APInt::getZero(Size);
5527
5528 V1 = peekThroughBitcasts(V1);
5529 V2 = peekThroughBitcasts(V2);
5530
5531 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5532 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5533
5534 int VectorSizeInBits = V1.getValueSizeInBits();
5535 int ScalarSizeInBits = VectorSizeInBits / Size;
5536 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5537
5538 for (int i = 0; i < Size; ++i) {
5539 int M = Mask[i];
5540 // Handle the easy cases.
5541 if (M < 0) {
5542 KnownUndef.setBit(i);
5543 continue;
5544 }
5545 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5546 KnownZero.setBit(i);
5547 continue;
5548 }
5549
5550 // Determine shuffle input and normalize the mask.
5551 SDValue V = M < Size ? V1 : V2;
5552 M %= Size;
5553
5554 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5555 if (V.getOpcode() != ISD::BUILD_VECTOR)
5556 continue;
5557
5558 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5559 // the (larger) source element must be UNDEF/ZERO.
5560 if ((Size % V.getNumOperands()) == 0) {
5561 int Scale = Size / V->getNumOperands();
5562 SDValue Op = V.getOperand(M / Scale);
5563 if (Op.isUndef())
5564 KnownUndef.setBit(i);
5565 if (X86::isZeroNode(Op))
5566 KnownZero.setBit(i);
5567 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5568 APInt Val = Cst->getAPIntValue();
5569 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5570 if (Val == 0)
5571 KnownZero.setBit(i);
5572 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5573 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5574 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5575 if (Val == 0)
5576 KnownZero.setBit(i);
5577 }
5578 continue;
5579 }
5580
5581 // If the BUILD_VECTOR has more elements, then all the (smaller) source
5582 // elements must be UNDEF or ZERO.
5583 if ((V.getNumOperands() % Size) == 0) {
5584 int Scale = V->getNumOperands() / Size;
5585 bool AllUndef = true;
5586 bool AllZero = true;
5587 for (int j = 0; j < Scale; ++j) {
5588 SDValue Op = V.getOperand((M * Scale) + j);
5589 AllUndef &= Op.isUndef();
5590 AllZero &= X86::isZeroNode(Op);
5591 }
5592 if (AllUndef)
5593 KnownUndef.setBit(i);
5594 if (AllZero)
5595 KnownZero.setBit(i);
5596 continue;
5597 }
5598 }
5599}
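// Illustrative example of computeZeroableShuffleElements: shuffling a v4i32
// V1 with an all-zeros V2 using mask <0, 4, 1, 5> marks result elements 1 and
// 3 as KnownZero, since they only reference the zero vector.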
5600
5601/// Decode a target shuffle mask and inputs and see if any values are
5602/// known to be undef or zero from their inputs.
5603/// Returns true if the target shuffle mask was decoded.
5604/// FIXME: Merge this with computeZeroableShuffleElements?
5605static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5606 SmallVectorImpl<SDValue> &Ops,
5607 APInt &KnownUndef, APInt &KnownZero) {
5608 bool IsUnary;
5609 if (!isTargetShuffle(N.getOpcode()))
5610 return false;
5611
5612 MVT VT = N.getSimpleValueType();
5613 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5614 return false;
5615
5616 int Size = Mask.size();
5617 SDValue V1 = Ops[0];
5618 SDValue V2 = IsUnary ? V1 : Ops[1];
5619 KnownUndef = KnownZero = APInt::getZero(Size);
5620
5621 V1 = peekThroughBitcasts(V1);
5622 V2 = peekThroughBitcasts(V2);
5623
5624 assert((VT.getSizeInBits() % Size) == 0 &&
5625 "Illegal split of shuffle value type");
5626 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5627
5628 // Extract known constant input data.
5629 APInt UndefSrcElts[2];
5630 SmallVector<APInt, 32> SrcEltBits[2];
5631 bool IsSrcConstant[2] = {
5632 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5633 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5634 /*AllowPartialUndefs*/ false),
5635 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5636 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5637 /*AllowPartialUndefs*/ false)};
5638
5639 for (int i = 0; i < Size; ++i) {
5640 int M = Mask[i];
5641
5642 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5643 if (M < 0) {
5644 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5645 if (SM_SentinelUndef == M)
5646 KnownUndef.setBit(i);
5647 if (SM_SentinelZero == M)
5648 KnownZero.setBit(i);
5649 continue;
5650 }
5651
5652 // Determine shuffle input and normalize the mask.
5653 unsigned SrcIdx = M / Size;
5654 SDValue V = M < Size ? V1 : V2;
5655 M %= Size;
5656
5657 // We are referencing an UNDEF input.
5658 if (V.isUndef()) {
5659 KnownUndef.setBit(i);
5660 continue;
5661 }
5662
5663 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5664 // TODO: We currently only set UNDEF for integer types - floats use the same
5665 // registers as vectors and many of the scalar folded loads rely on the
5666 // SCALAR_TO_VECTOR pattern.
5667 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5668 (Size % V.getValueType().getVectorNumElements()) == 0) {
5669 int Scale = Size / V.getValueType().getVectorNumElements();
5670 int Idx = M / Scale;
5671 if (Idx != 0 && !VT.isFloatingPoint())
5672 KnownUndef.setBit(i);
5673 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5674 KnownZero.setBit(i);
5675 continue;
5676 }
5677
5678 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
5679 // base vectors.
5680 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
5681 SDValue Vec = V.getOperand(0);
5682 int NumVecElts = Vec.getValueType().getVectorNumElements();
5683 if (Vec.isUndef() && Size == NumVecElts) {
5684 int Idx = V.getConstantOperandVal(2);
5685 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
5686 if (M < Idx || (Idx + NumSubElts) <= M)
5687 KnownUndef.setBit(i);
5688 }
5689 continue;
5690 }
5691
5692 // Attempt to extract from the source's constant bits.
5693 if (IsSrcConstant[SrcIdx]) {
5694 if (UndefSrcElts[SrcIdx][M])
5695 KnownUndef.setBit(i);
5696 else if (SrcEltBits[SrcIdx][M] == 0)
5697 KnownZero.setBit(i);
5698 }
5699 }
5700
5701 assert(VT.getVectorNumElements() == (unsigned)Size &&
5702 "Different mask size from vector size!");
5703 return true;
5704}
5705
5706// Replace target shuffle mask elements with known undef/zero sentinels.
5707 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
5708 const APInt &KnownUndef,
5709 const APInt &KnownZero,
5710 bool ResolveKnownZeros = true) {
5711 unsigned NumElts = Mask.size();
5712 assert(KnownUndef.getBitWidth() == NumElts &&
5713 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
5714
5715 for (unsigned i = 0; i != NumElts; ++i) {
5716 if (KnownUndef[i])
5717 Mask[i] = SM_SentinelUndef;
5718 else if (ResolveKnownZeros && KnownZero[i])
5719 Mask[i] = SM_SentinelZero;
5720 }
5721}
5722
5723// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
5724 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
5725 APInt &KnownUndef,
5726 APInt &KnownZero) {
5727 unsigned NumElts = Mask.size();
5728 KnownUndef = KnownZero = APInt::getZero(NumElts);
5729
5730 for (unsigned i = 0; i != NumElts; ++i) {
5731 int M = Mask[i];
5732 if (SM_SentinelUndef == M)
5733 KnownUndef.setBit(i);
5734 if (SM_SentinelZero == M)
5735 KnownZero.setBit(i);
5736 }
5737}
5738
5739// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
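// For example (illustrative): a VSELECT with constant condition <-1,0,-1,0>
// on v4i32 operands yields the blend mask <0,5,2,7>; elements whose condition
// is all-ones come from the first value operand, the rest from the second.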
5740 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
5741 SDValue Cond, bool IsBLENDV = false) {
5742 EVT CondVT = Cond.getValueType();
5743 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
5744 unsigned NumElts = CondVT.getVectorNumElements();
5745
5746 APInt UndefElts;
5747 SmallVector<APInt, 32> EltBits;
5748 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
5749 /*AllowWholeUndefs*/ true,
5750 /*AllowPartialUndefs*/ false))
5751 return false;
5752
5753 Mask.resize(NumElts, SM_SentinelUndef);
5754
5755 for (int i = 0; i != (int)NumElts; ++i) {
5756 Mask[i] = i;
5757 // Arbitrarily choose from the 2nd operand if the select condition element
5758 // is undef.
5759 // TODO: Can we do better by matching patterns such as even/odd?
5760 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
5761 (IsBLENDV && EltBits[i].isNonNegative()))
5762 Mask[i] += NumElts;
5763 }
5764
5765 return true;
5766}
5767
5768// Forward declaration (for getFauxShuffleMask recursive check).
5769static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
5770 SmallVectorImpl<SDValue> &Inputs,
5771 SmallVectorImpl<int> &Mask,
5772 const SelectionDAG &DAG, unsigned Depth,
5773 bool ResolveKnownElts);
5774
5775// Attempt to decode ops that could be represented as a shuffle mask.
5776// The decoded shuffle mask may contain a different number of elements to the
5777// destination value type.
5778// TODO: Merge into getTargetShuffleInputs()
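// For example (illustrative): a constant X86ISD::VSHLI of v2i64 by 8 bits is
// decoded below as the 16 x i8 byte mask
// <Z,0,1,2,3,4,5,6,Z,8,9,10,11,12,13,14> (Z = SM_SentinelZero), i.e. more
// mask elements than the two destination elements.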
5779static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
5780 SmallVectorImpl<int> &Mask,
5781 SmallVectorImpl<SDValue> &Ops,
5782 const SelectionDAG &DAG, unsigned Depth,
5783 bool ResolveKnownElts) {
5784 Mask.clear();
5785 Ops.clear();
5786
5787 MVT VT = N.getSimpleValueType();
5788 unsigned NumElts = VT.getVectorNumElements();
5789 unsigned NumSizeInBits = VT.getSizeInBits();
5790 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5791 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
5792 return false;
5793 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
5794 unsigned NumSizeInBytes = NumSizeInBits / 8;
5795 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5796
5797 unsigned Opcode = N.getOpcode();
5798 switch (Opcode) {
5799 case ISD::VECTOR_SHUFFLE: {
5800 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
5801 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
5802 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
5803 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
5804 Ops.push_back(N.getOperand(0));
5805 Ops.push_back(N.getOperand(1));
5806 return true;
5807 }
5808 return false;
5809 }
5810 case ISD::AND:
5811 case X86ISD::ANDNP: {
5812 // Attempt to decode as a per-byte mask.
5813 APInt UndefElts;
5814 SmallVector<APInt, 32> EltBits;
5815 SDValue N0 = N.getOperand(0);
5816 SDValue N1 = N.getOperand(1);
5817 bool IsAndN = (X86ISD::ANDNP == Opcode);
5818 uint64_t ZeroMask = IsAndN ? 255 : 0;
5819 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
5820 /*AllowWholeUndefs*/ false,
5821 /*AllowPartialUndefs*/ false))
5822 return false;
5823 // We can't assume an undef src element gives an undef dst - the other src
5824 // might be zero.
5825 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
5826 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5827 const APInt &ByteBits = EltBits[i];
5828 if (ByteBits != 0 && ByteBits != 255)
5829 return false;
5830 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5831 }
5832 Ops.push_back(IsAndN ? N1 : N0);
5833 return true;
5834 }
5835 case ISD::OR: {
5836 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
5837 // is a valid shuffle index.
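// For example (illustrative): if the sources decode to Mask0 = <0,Z,2,Z> and
// Mask1 = <Z,1,Z,3> (Z = SM_SentinelZero), the OR is recognised as the blend
// mask <0,5,2,7> of the two inputs.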
5838 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
5839 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
5840 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
5841 return false;
5842
5843 SmallVector<int, 64> SrcMask0, SrcMask1;
5844 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
5845 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
5846 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
5847 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
5848 Depth + 1, true) ||
5849 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
5850 Depth + 1, true))
5851 return false;
5852
5853 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
5854 SmallVector<int, 64> Mask0, Mask1;
5855 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
5856 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
5857 for (int i = 0; i != (int)MaskSize; ++i) {
5858 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
5859 // loops converting between OR and BLEND shuffles due to
5860 // canWidenShuffleElements merging away undef elements, meaning we
5861 // fail to recognise the OR as the undef element isn't known zero.
5862 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
5863 Mask.push_back(SM_SentinelZero);
5864 else if (Mask1[i] == SM_SentinelZero)
5865 Mask.push_back(i);
5866 else if (Mask0[i] == SM_SentinelZero)
5867 Mask.push_back(i + MaskSize);
5868 else
5869 return false;
5870 }
5871 Ops.push_back(N0);
5872 Ops.push_back(N1);
5873 return true;
5874 }
5875 case ISD::INSERT_SUBVECTOR: {
5876 SDValue Src = N.getOperand(0);
5877 SDValue Sub = N.getOperand(1);
5878 EVT SubVT = Sub.getValueType();
5879 unsigned NumSubElts = SubVT.getVectorNumElements();
5880 if (!N->isOnlyUserOf(Sub.getNode()))
5881 return false;
5882 SDValue SubBC = peekThroughBitcasts(Sub);
5883 uint64_t InsertIdx = N.getConstantOperandVal(2);
5884 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
5885 if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5886 SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
5887 uint64_t ExtractIdx = SubBC.getConstantOperandVal(1);
5888 SDValue SubBCSrc = SubBC.getOperand(0);
5889 unsigned NumSubSrcBCElts = SubBCSrc.getValueType().getVectorNumElements();
5890 unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts);
5891 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 &&
5892 "Subvector valuetype mismatch");
5893 InsertIdx *= (MaxElts / NumElts);
5894 ExtractIdx *= (MaxElts / NumSubSrcBCElts);
5895 NumSubElts *= (MaxElts / NumElts);
5896 bool SrcIsUndef = Src.isUndef();
5897 for (int i = 0; i != (int)MaxElts; ++i)
5898 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
5899 for (int i = 0; i != (int)NumSubElts; ++i)
5900 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
5901 if (!SrcIsUndef)
5902 Ops.push_back(Src);
5903 Ops.push_back(SubBCSrc);
5904 return true;
5905 }
5906 // Handle CONCAT(SUB0, SUB1).
5907 // Limit this to vXi64 512-bit vector cases to make the most of AVX512
5908 // cross lane shuffles.
5909 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
5910 NumBitsPerElt == 64 && NumSizeInBits == 512 &&
5911 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
5912 Src.getOperand(0).isUndef() &&
5913 Src.getOperand(1).getValueType() == SubVT &&
5914 Src.getConstantOperandVal(2) == 0) {
5915 for (int i = 0; i != (int)NumSubElts; ++i)
5916 Mask.push_back(i);
5917 for (int i = 0; i != (int)NumSubElts; ++i)
5918 Mask.push_back(i + NumElts);
5919 Ops.push_back(Src.getOperand(1));
5920 Ops.push_back(Sub);
5921 return true;
5922 }
5923 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
5924 SmallVector<int, 64> SubMask;
5925 SmallVector<SDValue, 2> SubInputs;
5926 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
5927 EVT SubSrcVT = SubSrc.getValueType();
5928 if (!SubSrcVT.isVector())
5929 return false;
5930
5931 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
5932 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
5933 Depth + 1, ResolveKnownElts))
5934 return false;
5935
5936 // Subvector shuffle inputs must not be larger than the subvector.
5937 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
5938 return SubVT.getFixedSizeInBits() <
5939 SubInput.getValueSizeInBits().getFixedValue();
5940 }))
5941 return false;
5942
5943 if (SubMask.size() != NumSubElts) {
5944 assert(((SubMask.size() % NumSubElts) == 0 ||
5945 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
5946 if ((NumSubElts % SubMask.size()) == 0) {
5947 int Scale = NumSubElts / SubMask.size();
5948 SmallVector<int,64> ScaledSubMask;
5949 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
5950 SubMask = ScaledSubMask;
5951 } else {
5952 int Scale = SubMask.size() / NumSubElts;
5953 NumSubElts = SubMask.size();
5954 NumElts *= Scale;
5955 InsertIdx *= Scale;
5956 }
5957 }
5958 Ops.push_back(Src);
5959 Ops.append(SubInputs.begin(), SubInputs.end());
5960 if (ISD::isBuildVectorAllZeros(Src.getNode()))
5961 Mask.append(NumElts, SM_SentinelZero);
5962 else
5963 for (int i = 0; i != (int)NumElts; ++i)
5964 Mask.push_back(i);
5965 for (int i = 0; i != (int)NumSubElts; ++i) {
5966 int M = SubMask[i];
5967 if (0 <= M) {
5968 int InputIdx = M / NumSubElts;
5969 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
5970 }
5971 Mask[i + InsertIdx] = M;
5972 }
5973 return true;
5974 }
5975 case X86ISD::PINSRB:
5976 case X86ISD::PINSRW:
5977 case ISD::SCALAR_TO_VECTOR:
5978 case ISD::INSERT_VECTOR_ELT: {
5979 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
5980 // vector, for matching src/dst vector types.
5981 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
5982
5983 unsigned DstIdx = 0;
5984 if (Opcode != ISD::SCALAR_TO_VECTOR) {
5985 // Check we have an in-range constant insertion index.
5986 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
5987 N.getConstantOperandAPInt(2).uge(NumElts))
5988 return false;
5989 DstIdx = N.getConstantOperandVal(2);
5990
5991 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
5992 if (X86::isZeroNode(Scl)) {
5993 Ops.push_back(N.getOperand(0));
5994 for (unsigned i = 0; i != NumElts; ++i)
5995 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
5996 return true;
5997 }
5998 }
5999
6000 // Peek through trunc/aext/zext/bitcast.
6001 // TODO: aext shouldn't require SM_SentinelZero padding.
6002 // TODO: handle shift of scalars.
6003 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6004 while (Scl.getOpcode() == ISD::TRUNCATE ||
6005 Scl.getOpcode() == ISD::ANY_EXTEND ||
6006 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6007 (Scl.getOpcode() == ISD::BITCAST &&
6008 Scl.getScalarValueSizeInBits() ==
6009 Scl.getOperand(0).getScalarValueSizeInBits())) {
6010 Scl = Scl.getOperand(0);
6011 MinBitsPerElt =
6012 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6013 }
6014 if ((MinBitsPerElt % 8) != 0)
6015 return false;
6016
6017 // Attempt to find the source vector the scalar was extracted from.
6018 SDValue SrcExtract;
6019 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6020 Scl.getOpcode() == X86ISD::PEXTRW ||
6021 Scl.getOpcode() == X86ISD::PEXTRB) &&
6022 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6023 SrcExtract = Scl;
6024 }
6025 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6026 return false;
6027
6028 SDValue SrcVec = SrcExtract.getOperand(0);
6029 EVT SrcVT = SrcVec.getValueType();
6030 if (!SrcVT.getScalarType().isByteSized())
6031 return false;
6032 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6033 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6034 unsigned DstByte = DstIdx * NumBytesPerElt;
6035 MinBitsPerElt =
6036 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6037
6038 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6039 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6040 Ops.push_back(SrcVec);
6041 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6042 } else {
6043 Ops.push_back(SrcVec);
6044 Ops.push_back(N.getOperand(0));
6045 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6046 Mask.push_back(NumSizeInBytes + i);
6047 }
6048
6049 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6050 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6051 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6052 Mask[DstByte + i] = SrcByte + i;
6053 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6054 Mask[DstByte + i] = SM_SentinelZero;
6055 return true;
6056 }
6057 case X86ISD::PACKSS:
6058 case X86ISD::PACKUS: {
6059 SDValue N0 = N.getOperand(0);
6060 SDValue N1 = N.getOperand(1);
6061 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6062 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6063 "Unexpected input value type");
6064
6065 APInt EltsLHS, EltsRHS;
6066 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6067
6068 // If we know input saturation won't happen (or we don't care for particular
6069 // lanes), we can treat this as a truncation shuffle.
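// For example (illustrative): a 128-bit PACKUSWB of two v8i16 inputs whose
// upper bytes are known zero behaves as the v16i8 truncation shuffle
// <0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30> of the bitcast inputs.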
6070 bool Offset0 = false, Offset1 = false;
6071 if (Opcode == X86ISD::PACKSS) {
6072 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6073 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6074 (!(N1.isUndef() || EltsRHS.isZero()) &&
6075 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6076 return false;
6077 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6078 // PACKSS then it was likely being used for sign-extension for a
6079 // truncation, so just peek through and adjust the mask accordingly.
6080 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6081 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6082 Offset0 = true;
6083 N0 = N0.getOperand(0);
6084 }
6085 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6086 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6087 Offset1 = true;
6088 N1 = N1.getOperand(0);
6089 }
6090 } else {
6091 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6092 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6093 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6094 (!(N1.isUndef() || EltsRHS.isZero()) &&
6095 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6096 return false;
6097 }
6098
6099 bool IsUnary = (N0 == N1);
6100
6101 Ops.push_back(N0);
6102 if (!IsUnary)
6103 Ops.push_back(N1);
6104
6105 createPackShuffleMask(VT, Mask, IsUnary);
6106
6107 if (Offset0 || Offset1) {
6108 for (int &M : Mask)
6109 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6110 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6111 ++M;
6112 }
6113 return true;
6114 }
6115 case ISD::VSELECT:
6116 case X86ISD::BLENDV: {
6117 SDValue Cond = N.getOperand(0);
6118 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6119 Ops.push_back(N.getOperand(1));
6120 Ops.push_back(N.getOperand(2));
6121 return true;
6122 }
6123 return false;
6124 }
6125 case X86ISD::VTRUNC: {
6126 SDValue Src = N.getOperand(0);
6127 EVT SrcVT = Src.getValueType();
6128 // Truncated source must be a simple vector.
6129 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6130 (SrcVT.getScalarSizeInBits() % 8) != 0)
6131 return false;
6132 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6133 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6134 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6135 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6136 for (unsigned i = 0; i != NumSrcElts; ++i)
6137 Mask.push_back(i * Scale);
6138 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6139 Ops.push_back(Src);
6140 return true;
6141 }
6142 case X86ISD::VSHLI:
6143 case X86ISD::VSRLI: {
6144 uint64_t ShiftVal = N.getConstantOperandVal(1);
6145 // Out of range bit shifts are guaranteed to be zero.
6146 if (NumBitsPerElt <= ShiftVal) {
6147 Mask.append(NumElts, SM_SentinelZero);
6148 return true;
6149 }
6150
6151 // We can only decode 'whole byte' bit shifts as shuffles.
6152 if ((ShiftVal % 8) != 0)
6153 break;
6154
6155 uint64_t ByteShift = ShiftVal / 8;
6156 Ops.push_back(N.getOperand(0));
6157
6158 // Clear mask to all zeros and insert the shifted byte indices.
6159 Mask.append(NumSizeInBytes, SM_SentinelZero);
6160
6161 if (X86ISD::VSHLI == Opcode) {
6162 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6163 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6164 Mask[i + j] = i + j - ByteShift;
6165 } else {
6166 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6167 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6168 Mask[i + j - ByteShift] = i + j;
6169 }
6170 return true;
6171 }
6172 case X86ISD::VROTLI:
6173 case X86ISD::VROTRI: {
6174 // We can only decode 'whole byte' bit rotates as shuffles.
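// For example (illustrative): X86ISD::VROTLI of v4i32 by 8 bits becomes the
// per-element byte mask <3,0,1,2>, repeated for every 32-bit element.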
6175 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6176 if ((RotateVal % 8) != 0)
6177 return false;
6178 Ops.push_back(N.getOperand(0));
6179 int Offset = RotateVal / 8;
6180 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6181 for (int i = 0; i != (int)NumElts; ++i) {
6182 int BaseIdx = i * NumBytesPerElt;
6183 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6184 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6185 }
6186 }
6187 return true;
6188 }
6189 case X86ISD::VBROADCAST: {
6190 SDValue Src = N.getOperand(0);
6191 if (!Src.getSimpleValueType().isVector()) {
6192 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6193 !isNullConstant(Src.getOperand(1)) ||
6194 Src.getOperand(0).getValueType().getScalarType() !=
6195 VT.getScalarType())
6196 return false;
6197 Src = Src.getOperand(0);
6198 }
6199 Ops.push_back(Src);
6200 Mask.append(NumElts, 0);
6201 return true;
6202 }
6203 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6204 SDValue Src = N.getOperand(0);
6205 EVT SrcVT = Src.getValueType();
6206 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6207
6208 // Extended source must be a simple vector.
6209 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6210 (NumBitsPerSrcElt % 8) != 0)
6211 return false;
6212
6213 // We can only handle all-signbits extensions.
6214 APInt DemandedSrcElts =
6215 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6216 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6217 return false;
6218
6219 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6220 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6221 for (unsigned I = 0; I != NumElts; ++I)
6222 Mask.append(Scale, I);
6223 Ops.push_back(Src);
6224 return true;
6225 }
6226 case ISD::ZERO_EXTEND:
6227 case ISD::ANY_EXTEND:
6228 case ISD::ZERO_EXTEND_VECTOR_INREG:
6229 case ISD::ANY_EXTEND_VECTOR_INREG: {
6230 SDValue Src = N.getOperand(0);
6231 EVT SrcVT = Src.getValueType();
6232
6233 // Extended source must be a simple vector.
6234 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6235 (SrcVT.getScalarSizeInBits() % 8) != 0)
6236 return false;
6237
6238 bool IsAnyExtend =
6239 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6240 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6241 IsAnyExtend, Mask);
6242 Ops.push_back(Src);
6243 return true;
6244 }
6245 }
6246
6247 return false;
6248}
6249
6250/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
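/// For example (illustrative): Inputs = [A, A] with Mask = <0,1,4,5> is
/// canonicalized to the single input [A] with Mask = <0,1,0,1>.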
6251 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6252 SmallVectorImpl<int> &Mask) {
6253 int MaskWidth = Mask.size();
6254 SmallVector<SDValue, 16> UsedInputs;
6255 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6256 int lo = UsedInputs.size() * MaskWidth;
6257 int hi = lo + MaskWidth;
6258
6259 // Strip UNDEF input usage.
6260 if (Inputs[i].isUndef())
6261 for (int &M : Mask)
6262 if ((lo <= M) && (M < hi))
6263 M = SM_SentinelUndef;
6264
6265 // Check for unused inputs.
6266 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6267 for (int &M : Mask)
6268 if (lo <= M)
6269 M -= MaskWidth;
6270 continue;
6271 }
6272
6273 // Check for repeated inputs.
6274 bool IsRepeat = false;
6275 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6276 if (UsedInputs[j] != Inputs[i])
6277 continue;
6278 for (int &M : Mask)
6279 if (lo <= M)
6280 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6281 IsRepeat = true;
6282 break;
6283 }
6284 if (IsRepeat)
6285 continue;
6286
6287 UsedInputs.push_back(Inputs[i]);
6288 }
6289 Inputs = UsedInputs;
6290}
6291
6292/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6293/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6294/// Returns true if the target shuffle mask was decoded.
6295static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6296 SmallVectorImpl<SDValue> &Inputs,
6297 SmallVectorImpl<int> &Mask,
6298 APInt &KnownUndef, APInt &KnownZero,
6299 const SelectionDAG &DAG, unsigned Depth,
6300 bool ResolveKnownElts) {
6301 if (Depth >= SelectionDAG::MaxRecursionDepth)
6302 return false; // Limit search depth.
6303
6304 EVT VT = Op.getValueType();
6305 if (!VT.isSimple() || !VT.isVector())
6306 return false;
6307
6308 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6309 if (ResolveKnownElts)
6310 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6311 return true;
6312 }
6313 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6314 ResolveKnownElts)) {
6315 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6316 return true;
6317 }
6318 return false;
6319}
6320
6321static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6322 SmallVectorImpl<SDValue> &Inputs,
6323 SmallVectorImpl<int> &Mask,
6324 const SelectionDAG &DAG, unsigned Depth,
6325 bool ResolveKnownElts) {
6326 APInt KnownUndef, KnownZero;
6327 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6328 KnownZero, DAG, Depth, ResolveKnownElts);
6329}
6330
6331 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6332 SmallVectorImpl<int> &Mask,
6333 const SelectionDAG &DAG, unsigned Depth = 0,
6334 bool ResolveKnownElts = true) {
6335 EVT VT = Op.getValueType();
6336 if (!VT.isSimple() || !VT.isVector())
6337 return false;
6338
6339 unsigned NumElts = Op.getValueType().getVectorNumElements();
6340 APInt DemandedElts = APInt::getAllOnes(NumElts);
6341 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6342 ResolveKnownElts);
6343}
6344
6345// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6346static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6347 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6348 SelectionDAG &DAG) {
6349 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6350 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6351 "Unknown broadcast load type");
6352
6353 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6354 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6355 return SDValue();
6356
6357 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6358 TypeSize::getFixed(Offset), DL);
6359 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6360 SDValue Ops[] = {Mem->getChain(), Ptr};
6361 SDValue BcstLd = DAG.getMemIntrinsicNode(
6362 Opcode, DL, Tys, Ops, MemVT,
6363 DAG.getMachineFunction().getMachineMemOperand(
6364 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6365 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6366 return BcstLd;
6367}
6368
6369/// Returns the scalar element that will make up the i'th
6370/// element of the result of the vector shuffle.
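/// For example (illustrative): querying index 2 of
/// shufflevector(A, B, <0,1,4,5>) recurses into B at index 0.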
6371 static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6372 SelectionDAG &DAG, unsigned Depth) {
6373 if (Depth >= SelectionDAG::MaxRecursionDepth)
6374 return SDValue(); // Limit search depth.
6375
6376 EVT VT = Op.getValueType();
6377 unsigned Opcode = Op.getOpcode();
6378 unsigned NumElems = VT.getVectorNumElements();
6379
6380 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6381 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6382 int Elt = SV->getMaskElt(Index);
6383
6384 if (Elt < 0)
6385 return DAG.getUNDEF(VT.getVectorElementType());
6386
6387 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6388 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6389 }
6390
6391 // Recurse into target specific vector shuffles to find scalars.
6392 if (isTargetShuffle(Opcode)) {
6393 MVT ShufVT = VT.getSimpleVT();
6394 MVT ShufSVT = ShufVT.getVectorElementType();
6395 int NumElems = (int)ShufVT.getVectorNumElements();
6396 SmallVector<int, 16> ShuffleMask;
6397 SmallVector<SDValue, 16> ShuffleOps;
6398 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6399 return SDValue();
6400
6401 int Elt = ShuffleMask[Index];
6402 if (Elt == SM_SentinelZero)
6403 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6404 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6405 if (Elt == SM_SentinelUndef)
6406 return DAG.getUNDEF(ShufSVT);
6407
6408 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6409 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6410 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6411 }
6412
6413 // Recurse into insert_subvector base/sub vector to find scalars.
6414 if (Opcode == ISD::INSERT_SUBVECTOR) {
6415 SDValue Vec = Op.getOperand(0);
6416 SDValue Sub = Op.getOperand(1);
6417 uint64_t SubIdx = Op.getConstantOperandVal(2);
6418 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6419
6420 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6421 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6422 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6423 }
6424
6425 // Recurse into concat_vectors sub vector to find scalars.
6426 if (Opcode == ISD::CONCAT_VECTORS) {
6427 EVT SubVT = Op.getOperand(0).getValueType();
6428 unsigned NumSubElts = SubVT.getVectorNumElements();
6429 uint64_t SubIdx = Index / NumSubElts;
6430 uint64_t SubElt = Index % NumSubElts;
6431 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6432 }
6433
6434 // Recurse into extract_subvector src vector to find scalars.
6435 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6436 SDValue Src = Op.getOperand(0);
6437 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6438 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6439 }
6440
6441 // We only peek through bitcasts of the same vector width.
6442 if (Opcode == ISD::BITCAST) {
6443 SDValue Src = Op.getOperand(0);
6444 EVT SrcVT = Src.getValueType();
6445 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6446 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6447 return SDValue();
6448 }
6449
6450 // Actual nodes that may contain scalar elements
6451
6452 // For insert_vector_elt - either return the index matching scalar or recurse
6453 // into the base vector.
6454 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6455 isa<ConstantSDNode>(Op.getOperand(2))) {
6456 if (Op.getConstantOperandAPInt(2) == Index)
6457 return Op.getOperand(1);
6458 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6459 }
6460
6461 if (Opcode == ISD::SCALAR_TO_VECTOR)
6462 return (Index == 0) ? Op.getOperand(0)
6463 : DAG.getUNDEF(VT.getVectorElementType());
6464
6465 if (Opcode == ISD::BUILD_VECTOR)
6466 return Op.getOperand(Index);
6467
6468 return SDValue();
6469}
6470
6471// Use PINSRB/PINSRW/PINSRD to create a build vector.
6472 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6473 const APInt &NonZeroMask,
6474 unsigned NumNonZero, unsigned NumZero,
6475 SelectionDAG &DAG,
6476 const X86Subtarget &Subtarget) {
6477 MVT VT = Op.getSimpleValueType();
6478 unsigned NumElts = VT.getVectorNumElements();
6479 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6480 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6481 "Illegal vector insertion");
6482
6483 SDValue V;
6484 bool First = true;
6485
6486 for (unsigned i = 0; i < NumElts; ++i) {
6487 bool IsNonZero = NonZeroMask[i];
6488 if (!IsNonZero)
6489 continue;
6490
6491 // If the build vector contains zeros or our first insertion is not the
6492 // first index, then insert into a zero vector to break any register
6493 // dependency; else use SCALAR_TO_VECTOR.
6494 if (First) {
6495 First = false;
6496 if (NumZero || 0 != i)
6497 V = getZeroVector(VT, Subtarget, DAG, DL);
6498 else {
6499 assert(0 == i && "Expected insertion into zero-index");
6500 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6501 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6502 V = DAG.getBitcast(VT, V);
6503 continue;
6504 }
6505 }
6506 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6507 DAG.getIntPtrConstant(i, DL));
6508 }
6509
6510 return V;
6511}
6512
6513/// Custom lower build_vector of v16i8.
6514 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6515 const APInt &NonZeroMask,
6516 unsigned NumNonZero, unsigned NumZero,
6517 SelectionDAG &DAG,
6518 const X86Subtarget &Subtarget) {
6519 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6520 return SDValue();
6521
6522 // SSE4.1 - use PINSRB to insert each byte directly.
6523 if (Subtarget.hasSSE41())
6524 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6525 DAG, Subtarget);
6526
6527 SDValue V;
6528
6529 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6530 // If both of the lowest 16-bit words are non-zero, then convert to MOVD.
6531 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6532 !NonZeroMask.extractBits(2, 2).isZero()) {
6533 for (unsigned I = 0; I != 4; ++I) {
6534 if (!NonZeroMask[I])
6535 continue;
6536 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6537 if (I != 0)
6538 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6539 DAG.getConstant(I * 8, DL, MVT::i8));
6540 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6541 }
6542 assert(V && "Failed to fold v16i8 vector to zero");
6543 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6544 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6545 V = DAG.getBitcast(MVT::v8i16, V);
6546 }
6547 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6548 bool ThisIsNonZero = NonZeroMask[i];
6549 bool NextIsNonZero = NonZeroMask[i + 1];
6550 if (!ThisIsNonZero && !NextIsNonZero)
6551 continue;
6552
6553 SDValue Elt;
6554 if (ThisIsNonZero) {
6555 if (NumZero || NextIsNonZero)
6556 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6557 else
6558 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6559 }
6560
6561 if (NextIsNonZero) {
6562 SDValue NextElt = Op.getOperand(i + 1);
6563 if (i == 0 && NumZero)
6564 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6565 else
6566 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6567 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6568 DAG.getConstant(8, DL, MVT::i8));
6569 if (ThisIsNonZero)
6570 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6571 else
6572 Elt = NextElt;
6573 }
6574
6575 // If our first insertion is not the first index or zeros are needed, then
6576 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6577 // elements undefined).
6578 if (!V) {
6579 if (i != 0 || NumZero)
6580 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6581 else {
6582 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6583 V = DAG.getBitcast(MVT::v8i16, V);
6584 continue;
6585 }
6586 }
6587 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6588 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6589 DAG.getIntPtrConstant(i / 2, DL));
6590 }
6591
6592 return DAG.getBitcast(MVT::v16i8, V);
6593}
6594
6595/// Custom lower build_vector of v8i16.
6596 static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6597 const APInt &NonZeroMask,
6598 unsigned NumNonZero, unsigned NumZero,
6599 SelectionDAG &DAG,
6600 const X86Subtarget &Subtarget) {
6601 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6602 return SDValue();
6603
6604 // Use PINSRW to insert each element directly.
6605 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6606 Subtarget);
6607}
6608
6609/// Custom lower build_vector of v4i32 or v4f32.
6610 static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
6611 SelectionDAG &DAG,
6612 const X86Subtarget &Subtarget) {
6613 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6614 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6615 // Because we're creating a less complicated build vector here, we may enable
6616 // further folding of the MOVDDUP via shuffle transforms.
6617 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6618 Op.getOperand(0) == Op.getOperand(2) &&
6619 Op.getOperand(1) == Op.getOperand(3) &&
6620 Op.getOperand(0) != Op.getOperand(1)) {
6621 MVT VT = Op.getSimpleValueType();
6622 MVT EltVT = VT.getVectorElementType();
6623 // Create a new build vector with the first 2 elements followed by undef
6624 // padding, bitcast to v2f64, duplicate, and bitcast back.
6625 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
6626 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
6627 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
6628 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
6629 return DAG.getBitcast(VT, Dup);
6630 }
6631
6632 // Find all zeroable elements.
6633 std::bitset<4> Zeroable, Undefs;
6634 for (int i = 0; i < 4; ++i) {
6635 SDValue Elt = Op.getOperand(i);
6636 Undefs[i] = Elt.isUndef();
6637 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6638 }
6639 assert(Zeroable.size() - Zeroable.count() > 1 &&
6640 "We expect at least two non-zero elements!");
6641
6642 // We only know how to deal with build_vector nodes where elements are either
6643 // zeroable or extract_vector_elt with constant index.
6644 SDValue FirstNonZero;
6645 unsigned FirstNonZeroIdx;
6646 for (unsigned i = 0; i < 4; ++i) {
6647 if (Zeroable[i])
6648 continue;
6649 SDValue Elt = Op.getOperand(i);
6650 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6651 !isa<ConstantSDNode>(Elt.getOperand(1)))
6652 return SDValue();
6653 // Make sure that this node is extracting from a 128-bit vector.
6654 MVT VT = Elt.getOperand(0).getSimpleValueType();
6655 if (!VT.is128BitVector())
6656 return SDValue();
6657 if (!FirstNonZero.getNode()) {
6658 FirstNonZero = Elt;
6659 FirstNonZeroIdx = i;
6660 }
6661 }
6662
6663 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6664 SDValue V1 = FirstNonZero.getOperand(0);
6665 MVT VT = V1.getSimpleValueType();
6666
6667 // See if this build_vector can be lowered as a blend with zero.
6668 SDValue Elt;
6669 unsigned EltMaskIdx, EltIdx;
6670 int Mask[4];
6671 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6672 if (Zeroable[EltIdx]) {
6673 // The zero vector will be on the right hand side.
6674 Mask[EltIdx] = EltIdx+4;
6675 continue;
6676 }
6677
6678 Elt = Op->getOperand(EltIdx);
6679 // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
6680 EltMaskIdx = Elt.getConstantOperandVal(1);
6681 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6682 break;
6683 Mask[EltIdx] = EltIdx;
6684 }
6685
6686 if (EltIdx == 4) {
6687 // Let the shuffle legalizer deal with blend operations.
6688 SDValue VZeroOrUndef = (Zeroable == Undefs)
6689 ? DAG.getUNDEF(VT)
6690 : getZeroVector(VT, Subtarget, DAG, DL);
6691 if (V1.getSimpleValueType() != VT)
6692 V1 = DAG.getBitcast(VT, V1);
6693 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
6694 }
6695
6696 // See if we can lower this build_vector to a INSERTPS.
6697 if (!Subtarget.hasSSE41())
6698 return SDValue();
6699
6700 SDValue V2 = Elt.getOperand(0);
6701 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6702 V1 = SDValue();
6703
6704 bool CanFold = true;
6705 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6706 if (Zeroable[i])
6707 continue;
6708
6709 SDValue Current = Op->getOperand(i);
6710 SDValue SrcVector = Current->getOperand(0);
6711 if (!V1.getNode())
6712 V1 = SrcVector;
6713 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
6714 }
6715
6716 if (!CanFold)
6717 return SDValue();
6718
6719 assert(V1.getNode() && "Expected at least two non-zero elements!");
6720 if (V1.getSimpleValueType() != MVT::v4f32)
6721 V1 = DAG.getBitcast(MVT::v4f32, V1);
6722 if (V2.getSimpleValueType() != MVT::v4f32)
6723 V2 = DAG.getBitcast(MVT::v4f32, V2);
6724
6725 // Ok, we can emit an INSERTPS instruction.
6726 unsigned ZMask = Zeroable.to_ulong();
6727
6728 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6729 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6730 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6731 DAG.getIntPtrConstant(InsertPSMask, DL, true));
6732 return DAG.getBitcast(VT, Result);
6733}
6734
6735/// Return a vector logical shift node.
6736static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6737 SelectionDAG &DAG, const TargetLowering &TLI,
6738 const SDLoc &dl) {
6739 assert(VT.is128BitVector() && "Unknown type for VShift");
6740 MVT ShVT = MVT::v16i8;
6741 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6742 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6743 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6744 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
6745 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6746}
6747
6749 SelectionDAG &DAG) {
6750
6751 // Check if the scalar load can be widened into a vector load. And if
6752 // the address is "base + cst" see if the cst can be "absorbed" into
6753 // the shuffle mask.
6754 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6755 SDValue Ptr = LD->getBasePtr();
6756 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
6757 return SDValue();
6758 EVT PVT = LD->getValueType(0);
6759 if (PVT != MVT::i32 && PVT != MVT::f32)
6760 return SDValue();
6761
6762 int FI = -1;
6763 int64_t Offset = 0;
6764 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6765 FI = FINode->getIndex();
6766 Offset = 0;
6767 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6768 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6769 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6770 Offset = Ptr.getConstantOperandVal(1);
6771 Ptr = Ptr.getOperand(0);
6772 } else {
6773 return SDValue();
6774 }
6775
6776 // FIXME: 256-bit vector instructions don't require a strict alignment,
6777 // improve this code to support it better.
6778 Align RequiredAlign(VT.getSizeInBits() / 8);
6779 SDValue Chain = LD->getChain();
6780 // Make sure the stack object alignment is at least 16 or 32.
6781 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6782 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
6783 if (!InferredAlign || *InferredAlign < RequiredAlign) {
6784 if (MFI.isFixedObjectIndex(FI)) {
6785 // Can't change the alignment. FIXME: It's possible to compute
6786 // the exact stack offset and reference FI + adjust offset instead,
6787 // if someone *really* cares about this; that's the way to implement it.
6788 return SDValue();
6789 } else {
6790 MFI.setObjectAlignment(FI, RequiredAlign);
6791 }
6792 }
6793
6794 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6795 // Ptr + (Offset & ~15).
6796 if (Offset < 0)
6797 return SDValue();
6798 if ((Offset % RequiredAlign.value()) & 3)
6799 return SDValue();
6800 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
6801 if (StartOffset) {
6802 SDLoc DL(Ptr);
6803 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6804 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6805 }
6806
6807 int EltNo = (Offset - StartOffset) >> 2;
6808 unsigned NumElems = VT.getVectorNumElements();
6809
6810 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6811 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6812 LD->getPointerInfo().getWithOffset(StartOffset));
6813
6814 SmallVector<int, 8> Mask(NumElems, EltNo);
6815
6816 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6817 }
6818
6819 return SDValue();
6820}
6821
6822 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
6823static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
6824 if (ISD::isNON_EXTLoad(Elt.getNode())) {
6825 auto *BaseLd = cast<LoadSDNode>(Elt);
6826 if (!BaseLd->isSimple())
6827 return false;
6828 Ld = BaseLd;
6829 ByteOffset = 0;
6830 return true;
6831 }
6832
6833 switch (Elt.getOpcode()) {
6834 case ISD::BITCAST:
6835 case ISD::TRUNCATE:
6836 case ISD::SCALAR_TO_VECTOR:
6837 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
6838 case ISD::SRL:
6839 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6840 uint64_t Amt = AmtC->getZExtValue();
6841 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
6842 ByteOffset += Amt / 8;
6843 return true;
6844 }
6845 }
6846 break;
6847 case ISD::EXTRACT_VECTOR_ELT:
6848 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6849 SDValue Src = Elt.getOperand(0);
6850 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
6851 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
6852 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
6853 findEltLoadSrc(Src, Ld, ByteOffset)) {
6854 uint64_t Idx = IdxC->getZExtValue();
6855 ByteOffset += Idx * (SrcSizeInBits / 8);
6856 return true;
6857 }
6858 }
6859 break;
6860 }
6861
6862 return false;
6863}
6864
6865/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6866/// elements can be replaced by a single large load which has the same value as
6867/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6868///
6869/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6870 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6871 const SDLoc &DL, SelectionDAG &DAG,
6872 const X86Subtarget &Subtarget,
6873 bool IsAfterLegalize) {
6874 if ((VT.getScalarSizeInBits() % 8) != 0)
6875 return SDValue();
6876
6877 unsigned NumElems = Elts.size();
6878
6879 int LastLoadedElt = -1;
6880 APInt LoadMask = APInt::getZero(NumElems);
6881 APInt ZeroMask = APInt::getZero(NumElems);
6882 APInt UndefMask = APInt::getZero(NumElems);
6883
6884 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
6885 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
6886
6887 // For each element in the initializer, see if we've found a load, zero or an
6888 // undef.
6889 for (unsigned i = 0; i < NumElems; ++i) {
6890 SDValue Elt = peekThroughBitcasts(Elts[i]);
6891 if (!Elt.getNode())
6892 return SDValue();
6893 if (Elt.isUndef()) {
6894 UndefMask.setBit(i);
6895 continue;
6896 }
6897 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
6898 ZeroMask.setBit(i);
6899 continue;
6900 }
6901
6902 // Each loaded element must be the correct fractional portion of the
6903 // requested vector load.
6904 unsigned EltSizeInBits = Elt.getValueSizeInBits();
6905 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
6906 return SDValue();
6907
6908 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
6909 return SDValue();
6910 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
6911 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
6912 return SDValue();
6913
6914 LoadMask.setBit(i);
6915 LastLoadedElt = i;
6916 }
6917 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
6918 NumElems &&
6919 "Incomplete element masks");
6920
6921 // Handle Special Cases - all undef or undef/zero.
6922 if (UndefMask.popcount() == NumElems)
6923 return DAG.getUNDEF(VT);
6924 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
6925 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6926 : DAG.getConstantFP(0.0, DL, VT);
6927
6928 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6929 int FirstLoadedElt = LoadMask.countr_zero();
6930 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6931 EVT EltBaseVT = EltBase.getValueType();
6932 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
6933 "Register/Memory size mismatch");
6934 LoadSDNode *LDBase = Loads[FirstLoadedElt];
6935 assert(LDBase && "Did not find base load for merging consecutive loads");
6936 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
6937 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
6938 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
6939 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
6940 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
6941
6942 // TODO: Support offsetting the base load.
6943 if (ByteOffsets[FirstLoadedElt] != 0)
6944 return SDValue();
6945
6946 // Check to see if the element's load is consecutive to the base load
6947 // or offset from a previous (already checked) load.
6948 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
6949 LoadSDNode *Ld = Loads[EltIdx];
6950 int64_t ByteOffset = ByteOffsets[EltIdx];
6951 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
6952 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
6953 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
6954 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
6955 }
6956 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
6957 EltIdx - FirstLoadedElt);
6958 };
6959
6960 // Consecutive loads can contain UNDEFS but not ZERO elements.
6961 // Consecutive loads with UNDEF and ZERO elements require an
6962 // additional shuffle stage to clear the ZERO elements.
6963 bool IsConsecutiveLoad = true;
6964 bool IsConsecutiveLoadWithZeros = true;
6965 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6966 if (LoadMask[i]) {
6967 if (!CheckConsecutiveLoad(LDBase, i)) {
6968 IsConsecutiveLoad = false;
6969 IsConsecutiveLoadWithZeros = false;
6970 break;
6971 }
6972 } else if (ZeroMask[i]) {
6973 IsConsecutiveLoad = false;
6974 }
6975 }
6976
6977 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6978 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6979 assert(LDBase->isSimple() &&
6980 "Cannot merge volatile or atomic loads.");
6981 SDValue NewLd =
6982 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6983 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
6984 MMOFlags);
6985 for (auto *LD : Loads)
6986 if (LD)
6987 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6988 return NewLd;
6989 };
6990
6991 // Check if the base load is entirely dereferenceable.
6992 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
6993 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
6994
6995 // LOAD - all consecutive load/undefs (must start/end with a load or be
6996 // entirely dereferenceable). If we have found an entire vector of loads and
6997 // undefs, then return a large load of the entire vector width starting at the
6998 // base pointer. If the vector contains zeros, then attempt to shuffle those
6999 // elements.
7000 if (FirstLoadedElt == 0 &&
7001 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7002 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7003 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7004 return SDValue();
7005
7006 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7007 // will lower to regular temporal loads and use the cache.
7008 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7009 VT.is256BitVector() && !Subtarget.hasInt256())
7010 return SDValue();
7011
7012 if (NumElems == 1)
7013 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7014
7015 if (!ZeroMask)
7016 return CreateLoad(VT, LDBase);
7017
7018 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7019 // vector and a zero vector to clear out the zero elements.
7020 if (!IsAfterLegalize && VT.isVector()) {
7021 unsigned NumMaskElts = VT.getVectorNumElements();
7022 if ((NumMaskElts % NumElems) == 0) {
7023 unsigned Scale = NumMaskElts / NumElems;
7024 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7025 for (unsigned i = 0; i < NumElems; ++i) {
7026 if (UndefMask[i])
7027 continue;
7028 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7029 for (unsigned j = 0; j != Scale; ++j)
7030 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7031 }
7032 SDValue V = CreateLoad(VT, LDBase);
7033 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7034 : DAG.getConstantFP(0.0, DL, VT);
7035 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7036 }
7037 }
7038 }
7039
7040 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7041 if (VT.is256BitVector() || VT.is512BitVector()) {
7042 unsigned HalfNumElems = NumElems / 2;
7043 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7044 EVT HalfVT =
7045 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7046 SDValue HalfLD =
7047 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7048 DAG, Subtarget, IsAfterLegalize);
7049 if (HalfLD)
7050 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7051 HalfLD, DAG.getIntPtrConstant(0, DL));
7052 }
7053 }
7054
7055 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7056 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7057 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7058 LoadSizeInBits == 64) &&
7059 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7060 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7061 : MVT::getIntegerVT(LoadSizeInBits);
7062 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7063 // Allow v4f32 on SSE1 only targets.
7064 // FIXME: Add more isel patterns so we can just use VT directly.
7065 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7066 VecVT = MVT::v4f32;
7067 if (TLI.isTypeLegal(VecVT)) {
7068 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7069 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7070 SDValue ResNode = DAG.getMemIntrinsicNode(
7071 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7072 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
7073 for (auto *LD : Loads)
7074 if (LD)
7075 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7076 return DAG.getBitcast(VT, ResNode);
7077 }
7078 }
7079
7080 // BROADCAST - match the smallest possible repetition pattern, load that
7081 // scalar/subvector element and then broadcast to the entire vector.
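// For example (illustrative): a v4i32 build vector of loads
// <a[0],a[1],a[0],a[1]> repeats every two elements, so it can be lowered as a
// single 64-bit load of a[0..1] broadcast to both halves of the vector.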
7082 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7083 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7084 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7085 unsigned RepeatSize = SubElems * BaseSizeInBits;
7086 unsigned ScalarSize = std::min(RepeatSize, 64u);
7087 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7088 continue;
7089
7090 // Don't attempt a 1:N subvector broadcast - it should be caught by
7091 // combineConcatVectorOps, else it will cause infinite loops.
7092 if (RepeatSize > ScalarSize && SubElems == 1)
7093 continue;
7094
7095 bool Match = true;
7096 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7097 for (unsigned i = 0; i != NumElems && Match; ++i) {
7098 if (!LoadMask[i])
7099 continue;
7100 SDValue Elt = peekThroughBitcasts(Elts[i]);
7101 if (RepeatedLoads[i % SubElems].isUndef())
7102 RepeatedLoads[i % SubElems] = Elt;
7103 else
7104 Match &= (RepeatedLoads[i % SubElems] == Elt);
7105 }
7106
7107 // We must have loads at both ends of the repetition.
7108 Match &= !RepeatedLoads.front().isUndef();
7109 Match &= !RepeatedLoads.back().isUndef();
7110 if (!Match)
7111 continue;
7112
7113 EVT RepeatVT =
7114 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7115 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7116 : EVT::getFloatingPointVT(ScalarSize);
7117 if (RepeatSize > ScalarSize)
7118 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7119 RepeatSize / ScalarSize);
7120 EVT BroadcastVT =
7121 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7122 VT.getSizeInBits() / ScalarSize);
7123 if (TLI.isTypeLegal(BroadcastVT)) {
7124 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7125 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7126 SDValue Broadcast = RepeatLoad;
7127 if (RepeatSize > ScalarSize) {
7128 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7129 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7130 } else {
7131 if (!Subtarget.hasAVX2() &&
7132 !X86::mayFoldLoadIntoBroadcastFromMem(
7133 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7134 Subtarget,
7135 /*AssumeSingleUse=*/true))
7136 return SDValue();
7137 Broadcast =
7138 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7139 }
7140 return DAG.getBitcast(VT, Broadcast);
7141 }
7142 }
7143 }
7144 }
7145
7146 return SDValue();
7147}
7148
7149 // Combine vector ops (shuffles etc.) that are equal to build_vector load1,
7150// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7151// are consecutive, non-overlapping, and in the right order.
7152 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7153 SelectionDAG &DAG,
7154 const X86Subtarget &Subtarget,
7155 bool IsAfterLegalize) {
7156 SmallVector<SDValue, 64> Elts;
7157 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7158 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7159 Elts.push_back(Elt);
7160 continue;
7161 }
7162 return SDValue();
7163 }
7164 assert(Elts.size() == VT.getVectorNumElements());
7165 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7166 IsAfterLegalize);
7167}
7168
7169 static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7170 const APInt &Undefs, LLVMContext &C) {
7171 unsigned ScalarSize = VT.getScalarSizeInBits();
7172 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7173
7174 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7175 if (VT.isFloatingPoint()) {
7176 if (ScalarSize == 16)
7177 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7178 if (ScalarSize == 32)
7179 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7180 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7181 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7182 }
7183 return Constant::getIntegerValue(Ty, Val);
7184 };
7185
7186 SmallVector<Constant *, 32> ConstantVec;
7187 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7188 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7189 : getConstantScalar(Bits[I]));
7190
7191 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7192}
7193
7194static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7195 unsigned SplatBitSize, LLVMContext &C) {
7196 unsigned ScalarSize = VT.getScalarSizeInBits();
7197
7198 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7199 if (VT.isFloatingPoint()) {
7200 if (ScalarSize == 16)
7201 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7202 if (ScalarSize == 32)
7203 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7204 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7205 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7206 }
7207 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7208 };
7209
7210 if (ScalarSize == SplatBitSize)
7211 return getConstantScalar(SplatValue);
7212
7213 unsigned NumElm = SplatBitSize / ScalarSize;
7214 SmallVector<Constant *, 32> ConstantVec;
7215 for (unsigned I = 0; I != NumElm; ++I) {
7216 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7217 ConstantVec.push_back(getConstantScalar(Val));
7218 }
7219 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7220}
7221
7222 static bool isFoldableUseOfShuffle(SDNode *N) {
7223 for (auto *U : N->uses()) {
7224 unsigned Opc = U->getOpcode();
7225 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7226 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7227 return false;
7228 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7229 return false;
7230 if (isTargetShuffle(Opc))
7231 return true;
7232 if (Opc == ISD::BITCAST) // Ignore bitcasts
7233 return isFoldableUseOfShuffle(U);
7234 if (N->hasOneUse()) {
7235 // TODO: there may be some general way to know if an SDNode can
7236 // be folded. We now only know whether an MI is foldable.
7237 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7238 return false;
7239 return true;
7240 }
7241 }
7242 return false;
7243}
7244
7245/// Attempt to use the vbroadcast instruction to generate a splat value
7246/// from a splat BUILD_VECTOR which uses:
7247/// a. A single scalar load, or a constant.
7248/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7249///
7250/// The VBROADCAST node is returned when a pattern is found,
7251/// or SDValue() otherwise.
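/// For example (illustrative sketch), with AVX:
///   t = load f32, ptr %p
///   v8f32 build_vector t, t, t, t, t, t, t, t
/// becomes a single X86ISD::VBROADCAST_LOAD (vbroadcastss) from %p.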
7252static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7253 const SDLoc &dl,
7254 const X86Subtarget &Subtarget,
7255 SelectionDAG &DAG) {
7256 // VBROADCAST requires AVX.
7257 // TODO: Splats could be generated for non-AVX CPUs using SSE
7258 // instructions, but there's less potential gain for only 128-bit vectors.
7259 if (!Subtarget.hasAVX())
7260 return SDValue();
7261
7262 MVT VT = BVOp->getSimpleValueType(0);
7263 unsigned NumElts = VT.getVectorNumElements();
7264 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7265 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7266 "Unsupported vector type for broadcast.");
7267
7268 // See if the build vector is a repeating sequence of scalars (inc. splat).
7269 SDValue Ld;
7270 BitVector UndefElements;
7271 SmallVector<SDValue, 16> Sequence;
7272 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7273 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7274 if (Sequence.size() == 1)
7275 Ld = Sequence[0];
7276 }
7277
7278 // Attempt to use VBROADCASTM
7279 // From this pattern:
7280 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7281 // b. t1 = (build_vector t0 t0)
7282 //
7283 // Create (VBROADCASTM v2i1 X)
7284 if (!Sequence.empty() && Subtarget.hasCDI()) {
7285 // If not a splat, are the upper sequence values zeroable?
7286 unsigned SeqLen = Sequence.size();
7287 bool UpperZeroOrUndef =
7288 SeqLen == 1 ||
7289 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
7290 return !V || V.isUndef() || isNullConstant(V);
7291 });
7292 SDValue Op0 = Sequence[0];
7293 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7294 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7295 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7296 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7297 ? Op0.getOperand(0)
7298 : Op0.getOperand(0).getOperand(0);
7299 MVT MaskVT = BOperand.getSimpleValueType();
7300 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7301 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7302 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7303 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7304 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7305 unsigned Scale = 512 / VT.getSizeInBits();
7306 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7307 }
7308 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7309 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7310 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7311 return DAG.getBitcast(VT, Bcst);
7312 }
7313 }
7314 }
7315
7316 unsigned NumUndefElts = UndefElements.count();
7317 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7318 APInt SplatValue, Undef;
7319 unsigned SplatBitSize;
7320 bool HasUndef;
7321 // Check if this is a repeated constant pattern suitable for broadcasting.
7322 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7323 SplatBitSize > VT.getScalarSizeInBits() &&
7324 SplatBitSize < VT.getSizeInBits()) {
7325 // Avoid replacing with broadcast when it's a use of a shuffle
7326 // instruction to preserve the present custom lowering of shuffles.
7327 if (isFoldableUseOfShuffle(BVOp))
7328 return SDValue();
7329 // replace BUILD_VECTOR with broadcast of the repeated constants.
7330 LLVMContext *Ctx = DAG.getContext();
7331 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7332 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7333 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7334 // Load the constant scalar/subvector and broadcast it.
7335 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7336 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7337 SDValue CP = DAG.getConstantPool(C, PVT);
7338 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7339
7340 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7341 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7342 SDValue Ops[] = {DAG.getEntryNode(), CP};
7343 MachinePointerInfo MPI =
7344 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7345 SDValue Brdcst =
7346 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7347 MPI, Alignment, MachineMemOperand::MOLoad);
7348 return DAG.getBitcast(VT, Brdcst);
7349 }
7350 if (SplatBitSize > 64) {
7351 // Load the vector of constants and broadcast it.
7352 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7353 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7354 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7355 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7356 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7357 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7358 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7359 MachinePointerInfo MPI =
7360 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7361 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7362 Ops, VVT, MPI, Alignment,
7363 MachineMemOperand::MOLoad);
7364 }
7365 }
7366
7367 // If we are moving a scalar into a vector (Ld must be set and all elements
7368 // but 1 are undef) and that operation is not obviously supported by
7369 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7370 // That's better than general shuffling and may eliminate a load to GPR and
7371 // move from scalar to vector register.
7372 if (!Ld || NumElts - NumUndefElts != 1)
7373 return SDValue();
7374 unsigned ScalarSize = Ld.getValueSizeInBits();
7375 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7376 return SDValue();
7377 }
7378
7379 bool ConstSplatVal =
7380 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7381 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7382
7383 // TODO: Handle broadcasts of non-constant sequences.
7384
7385 // Make sure that all of the users of a non-constant load are from the
7386 // BUILD_VECTOR node.
7387 // FIXME: Is the use count needed for non-constant, non-load case?
7388 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7389 return SDValue();
7390
7391 unsigned ScalarSize = Ld.getValueSizeInBits();
7392 bool IsGE256 = (VT.getSizeInBits() >= 256);
7393
7394 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7395 // instruction to save 8 or more bytes of constant pool data.
7396 // TODO: If multiple splats are generated to load the same constant,
7397 // it may be detrimental to overall size. There needs to be a way to detect
7398 // that condition to know if this is truly a size win.
7399 bool OptForSize = DAG.shouldOptForSize();
7400
7401 // Handle broadcasting a single constant scalar from the constant pool
7402 // into a vector.
7403 // On Sandybridge (no AVX2), it is still better to load a constant vector
7404 // from the constant pool and not to broadcast it from a scalar.
7405 // But override that restriction when optimizing for size.
7406 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7407 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7408 EVT CVT = Ld.getValueType();
7409 assert(!CVT.isVector() && "Must not broadcast a vector type");
7410
7411 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7412 // For size optimization, also splat v2f64 and v2i64, and for size opt
7413 // with AVX2, also splat i8 and i16.
7414 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7415 if (ScalarSize == 32 ||
7416 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7417 CVT == MVT::f16 ||
7418 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7419 const Constant *C = nullptr;
7420 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7421 C = CI->getConstantIntValue();
7422 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7423 C = CF->getConstantFPValue();
7424
7425 assert(C && "Invalid constant type");
7426
7427 SDValue CP =
7428 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7429 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7430
7431 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7432 SDValue Ops[] = {DAG.getEntryNode(), CP};
7433 MachinePointerInfo MPI =
7434 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7435 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7436 MPI, Alignment, MachineMemOperand::MOLoad);
7437 }
7438 }
7439
7440 // Handle AVX2 in-register broadcasts.
7441 if (!IsLoad && Subtarget.hasInt256() &&
7442 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7443 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7444
7445 // The scalar source must be a normal load.
7446 if (!IsLoad)
7447 return SDValue();
7448
7449 // Make sure the non-chain result is only used by this build vector.
7450 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7451 return SDValue();
7452
7453 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7454 (Subtarget.hasVLX() && ScalarSize == 64)) {
7455 auto *LN = cast<LoadSDNode>(Ld);
7456 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7457 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7458 SDValue BCast =
7459 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7460 LN->getMemoryVT(), LN->getMemOperand());
7461 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7462 return BCast;
7463 }
7464
7465 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
7466 // match double, since there is no vbroadcastsd xmm instruction.
7467 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7468 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7469 auto *LN = cast<LoadSDNode>(Ld);
7470 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7471 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7472 SDValue BCast =
7473 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7474 LN->getMemoryVT(), LN->getMemOperand());
7475 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7476 return BCast;
7477 }
7478
7479 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7480 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7481
7482 // Unsupported broadcast.
7483 return SDValue();
7484}
7485
7486/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7487/// underlying vector and index.
7488///
7489/// Modifies \p ExtractedFromVec to the real vector and returns the real
7490/// index.
7491static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7492 SDValue ExtIdx) {
7493 int Idx = ExtIdx->getAsZExtVal();
7494 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7495 return Idx;
7496
7497 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7498 // lowered this:
7499 // (extract_vector_elt (v8f32 %1), Constant<6>)
7500 // to:
7501 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7502 // (extract_subvector (v8f32 %0), Constant<4>),
7503 // undef)
7504 // Constant<0>)
7505 // In this case the vector is the extract_subvector expression and the index
7506 // is 2, as specified by the shuffle.
7507 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7508 SDValue ShuffleVec = SVOp->getOperand(0);
7509 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7510 assert(ShuffleVecVT.getVectorElementType() ==
7511 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7512
7513 int ShuffleIdx = SVOp->getMaskElt(Idx);
7514 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7515 ExtractedFromVec = ShuffleVec;
7516 return ShuffleIdx;
7517 }
7518 return Idx;
7519}
7520
7521static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7522 SelectionDAG &DAG) {
7523 MVT VT = Op.getSimpleValueType();
7524
7525 // Skip if insert_vec_elt is not supported.
7526 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7527 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7528 return SDValue();
7529
7530 unsigned NumElems = Op.getNumOperands();
7531 SDValue VecIn1;
7532 SDValue VecIn2;
7533 SmallVector<unsigned, 4> InsertIndices;
7534 SmallVector<int, 8> Mask(NumElems, -1);
7535
7536 for (unsigned i = 0; i != NumElems; ++i) {
7537 unsigned Opc = Op.getOperand(i).getOpcode();
7538
7539 if (Opc == ISD::UNDEF)
7540 continue;
7541
7542 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7543 // Quit if more than 1 element needs inserting.
7544 if (InsertIndices.size() > 1)
7545 return SDValue();
7546
7547 InsertIndices.push_back(i);
7548 continue;
7549 }
7550
7551 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7552 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7553
7554 // Quit if non-constant index.
7555 if (!isa<ConstantSDNode>(ExtIdx))
7556 return SDValue();
7557 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7558
7559 // Quit if extracted from vector of different type.
7560 if (ExtractedFromVec.getValueType() != VT)
7561 return SDValue();
7562
7563 if (!VecIn1.getNode())
7564 VecIn1 = ExtractedFromVec;
7565 else if (VecIn1 != ExtractedFromVec) {
7566 if (!VecIn2.getNode())
7567 VecIn2 = ExtractedFromVec;
7568 else if (VecIn2 != ExtractedFromVec)
7569 // Quit if more than 2 vectors to shuffle
7570 return SDValue();
7571 }
7572
7573 if (ExtractedFromVec == VecIn1)
7574 Mask[i] = Idx;
7575 else if (ExtractedFromVec == VecIn2)
7576 Mask[i] = Idx + NumElems;
7577 }
7578
7579 if (!VecIn1.getNode())
7580 return SDValue();
7581
7582 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7583 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7584
7585 for (unsigned Idx : InsertIndices)
7586 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7587 DAG.getIntPtrConstant(Idx, DL));
7588
7589 return NV;
7590}
7591
7592// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7593static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7594 const X86Subtarget &Subtarget) {
7595 MVT VT = Op.getSimpleValueType();
7596 MVT IVT =
7597 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7598 SmallVector<SDValue, 16> NewOps;
7599 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7600 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7601 Op.getOperand(I)));
7602 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7603 return DAG.getBitcast(VT, Res);
7604}
7605
7606// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
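// For example (illustrative sketch), the all-constant v8i1 build_vector
// <1,0,1,1,0,0,0,0> is materialized as the i8 immediate 0b00001101 and then
// bitcast to v8i1.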
7607static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
7608 SelectionDAG &DAG,
7609 const X86Subtarget &Subtarget) {
7610
7611 MVT VT = Op.getSimpleValueType();
7612 assert((VT.getVectorElementType() == MVT::i1) &&
7613 "Unexpected type in LowerBUILD_VECTORvXi1!");
7614 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
7615 ISD::isBuildVectorAllOnes(Op.getNode()))
7616 return Op;
7617
7618 uint64_t Immediate = 0;
7619 SmallVector<unsigned, 16> NonConstIdx;
7620 bool IsSplat = true;
7621 bool HasConstElts = false;
7622 int SplatIdx = -1;
7623 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7624 SDValue In = Op.getOperand(idx);
7625 if (In.isUndef())
7626 continue;
7627 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
7628 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7629 HasConstElts = true;
7630 } else {
7631 NonConstIdx.push_back(idx);
7632 }
7633 if (SplatIdx < 0)
7634 SplatIdx = idx;
7635 else if (In != Op.getOperand(SplatIdx))
7636 IsSplat = false;
7637 }
7638
7639 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
7640 if (IsSplat) {
7641 // The build_vector allows the scalar element to be larger than the vector
7642 // element type. We need to mask it to use as a condition unless we know
7643 // the upper bits are zero.
7644 // FIXME: Use computeKnownBits instead of checking specific opcode?
7645 SDValue Cond = Op.getOperand(SplatIdx);
7646 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
7647 if (Cond.getOpcode() != ISD::SETCC)
7648 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
7649 DAG.getConstant(1, dl, MVT::i8));
7650
7651 // Perform the select in the scalar domain so we can use cmov.
7652 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7653 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
7654 DAG.getAllOnesConstant(dl, MVT::i32),
7655 DAG.getConstant(0, dl, MVT::i32));
7656 Select = DAG.getBitcast(MVT::v32i1, Select);
7657 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
7658 } else {
7659 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7660 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
7661 DAG.getAllOnesConstant(dl, ImmVT),
7662 DAG.getConstant(0, dl, ImmVT));
7663 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7664 Select = DAG.getBitcast(VecVT, Select);
7665 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
7666 DAG.getIntPtrConstant(0, dl));
7667 }
7668 }
7669
7670 // insert elements one by one
7671 SDValue DstVec;
7672 if (HasConstElts) {
7673 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7674 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
7675 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
7676 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
7677 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
7678 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
7679 } else {
7680 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7681 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
7682 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7683 DstVec = DAG.getBitcast(VecVT, Imm);
7684 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
7685 DAG.getIntPtrConstant(0, dl));
7686 }
7687 } else
7688 DstVec = DAG.getUNDEF(VT);
7689
7690 for (unsigned InsertIdx : NonConstIdx) {
7691 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7692 Op.getOperand(InsertIdx),
7693 DAG.getIntPtrConstant(InsertIdx, dl));
7694 }
7695 return DstVec;
7696}
7697
7698LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
7699 switch (Opcode) {
7700 case X86ISD::PACKSS:
7701 case X86ISD::PACKUS:
7702 case X86ISD::FHADD:
7703 case X86ISD::FHSUB:
7704 case X86ISD::HADD:
7705 case X86ISD::HSUB:
7706 return true;
7707 }
7708 return false;
7709}
7710
7711/// This is a helper function of LowerToHorizontalOp().
7712/// This function checks whether the input build_vector \p N implements a
7713/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
7714/// may not match the layout of an x86 256-bit horizontal instruction.
7715/// In other words, if this returns true, then some extraction/insertion will
7716/// be required to produce a valid horizontal instruction.
7717///
7718/// Parameter \p Opcode defines the kind of horizontal operation to match.
7719/// For example, if \p Opcode is equal to ISD::ADD, then this function
7720/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7721/// is equal to ISD::SUB, then this function checks if this is a horizontal
7722/// arithmetic sub.
7723///
7724/// This function only analyzes elements of \p N whose indices are
7725/// in range [BaseIdx, LastIdx).
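///
/// For example (illustrative sketch), with \p Opcode == ISD::ADD and the
/// range [0, 4), the analyzed elements must look like:
///   N[0] = add (extract_vector_elt A, 0), (extract_vector_elt A, 1)
///   N[1] = add (extract_vector_elt A, 2), (extract_vector_elt A, 3)
///   N[2] = add (extract_vector_elt B, 0), (extract_vector_elt B, 1)
///   N[3] = add (extract_vector_elt B, 2), (extract_vector_elt B, 3)
/// with A returned in \p V0 and B in \p V1.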
7726///
7727/// TODO: This function was originally used to match both real and fake partial
7728/// horizontal operations, but the index-matching logic is incorrect for that.
7729/// See the corrected implementation in isHopBuildVector(). Can we reduce this
7730/// code because it is only used for partial h-op matching now?
7731static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
7732 const SDLoc &DL, SelectionDAG &DAG,
7733 unsigned BaseIdx, unsigned LastIdx,
7734 SDValue &V0, SDValue &V1) {
7735 EVT VT = N->getValueType(0);
7736 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7737 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7738 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7739 "Invalid Vector in input!");
7740
7741 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7742 bool CanFold = true;
7743 unsigned ExpectedVExtractIdx = BaseIdx;
7744 unsigned NumElts = LastIdx - BaseIdx;
7745 V0 = DAG.getUNDEF(VT);
7746 V1 = DAG.getUNDEF(VT);
7747
7748 // Check if N implements a horizontal binop.
7749 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7750 SDValue Op = N->getOperand(i + BaseIdx);
7751
7752 // Skip UNDEFs.
7753 if (Op->isUndef()) {
7754 // Update the expected vector extract index.
7755 if (i * 2 == NumElts)
7756 ExpectedVExtractIdx = BaseIdx;
7757 ExpectedVExtractIdx += 2;
7758 continue;
7759 }
7760
7761 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7762
7763 if (!CanFold)
7764 break;
7765
7766 SDValue Op0 = Op.getOperand(0);
7767 SDValue Op1 = Op.getOperand(1);
7768
7769 // Try to match the following pattern:
7770 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7771 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7772 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7773 Op0.getOperand(0) == Op1.getOperand(0) &&
7774 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7775 isa<ConstantSDNode>(Op1.getOperand(1)));
7776 if (!CanFold)
7777 break;
7778
7779 unsigned I0 = Op0.getConstantOperandVal(1);
7780 unsigned I1 = Op1.getConstantOperandVal(1);
7781
7782 if (i * 2 < NumElts) {
7783 if (V0.isUndef()) {
7784 V0 = Op0.getOperand(0);
7785 if (V0.getValueType() != VT)
7786 return false;
7787 }
7788 } else {
7789 if (V1.isUndef()) {
7790 V1 = Op0.getOperand(0);
7791 if (V1.getValueType() != VT)
7792 return false;
7793 }
7794 if (i * 2 == NumElts)
7795 ExpectedVExtractIdx = BaseIdx;
7796 }
7797
7798 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7799 if (I0 == ExpectedVExtractIdx)
7800 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7801 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7802 // Try to match the following dag sequence:
7803 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7804 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7805 } else
7806 CanFold = false;
7807
7808 ExpectedVExtractIdx += 2;
7809 }
7810
7811 return CanFold;
7812}
7813
7814/// Emit a sequence of two 128-bit horizontal add/sub followed by
7815/// a concat_vector.
7816///
7817/// This is a helper function of LowerToHorizontalOp().
7818/// This function expects two 256-bit vectors called V0 and V1.
7819/// At first, each vector is split into two separate 128-bit vectors.
7820/// Then, the resulting 128-bit vectors are used to implement two
7821/// horizontal binary operations.
7822///
7823/// The kind of horizontal binary operation is defined by \p X86Opcode.
7824///
7825/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
7826/// the two new horizontal binops.
7827/// When Mode is set, the first horizontal binop dag node would take as input
7828/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7829/// horizontal binop dag node would take as input the lower 128-bit of V1
7830/// and the upper 128-bit of V1.
7831/// Example:
7832/// HADD V0_LO, V0_HI
7833/// HADD V1_LO, V1_HI
7834///
7835/// Otherwise, the first horizontal binop dag node takes as input the lower
7836/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7837/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7838/// Example:
7839/// HADD V0_LO, V1_LO
7840/// HADD V0_HI, V1_HI
7841///
7842/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7843/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7844/// the upper 128-bits of the result.
7845static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7846 const SDLoc &DL, SelectionDAG &DAG,
7847 unsigned X86Opcode, bool Mode,
7848 bool isUndefLO, bool isUndefHI) {
7849 MVT VT = V0.getSimpleValueType();
7850 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7851 "Invalid nodes in input!");
7852
7853 unsigned NumElts = VT.getVectorNumElements();
7854 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7855 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7856 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7857 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7858 MVT NewVT = V0_LO.getSimpleValueType();
7859
7860 SDValue LO = DAG.getUNDEF(NewVT);
7861 SDValue HI = DAG.getUNDEF(NewVT);
7862
7863 if (Mode) {
7864 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7865 if (!isUndefLO && !V0->isUndef())
7866 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7867 if (!isUndefHI && !V1->isUndef())
7868 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7869 } else {
7870 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7871 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7872 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7873
7874 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7875 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7876 }
7877
7878 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7879}
7880
7881/// Returns true iff \p BV builds a vector with the result equivalent to
7882/// the result of ADDSUB/SUBADD operation.
7883/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
7884/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
7885/// \p Opnd0 and \p Opnd1.
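///
/// For example (illustrative sketch), a v2f64 build_vector of the form:
///   BV[0] = fsub (extract_vector_elt %A, 0), (extract_vector_elt %B, 0)
///   BV[1] = fadd (extract_vector_elt %A, 1), (extract_vector_elt %B, 1)
/// matches with \p Opnd0 = %A, \p Opnd1 = %B and \p IsSubAdd = false (ADDSUB).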
7886static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
7887 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7888 SDValue &Opnd0, SDValue &Opnd1,
7889 unsigned &NumExtracts,
7890 bool &IsSubAdd) {
7891
7892 MVT VT = BV->getSimpleValueType(0);
7893 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
7894 return false;
7895
7896 unsigned NumElts = VT.getVectorNumElements();
7897 SDValue InVec0 = DAG.getUNDEF(VT);
7898 SDValue InVec1 = DAG.getUNDEF(VT);
7899
7900 NumExtracts = 0;
7901
7902 // Odd-numbered elements in the input build vector are obtained from
7903 // adding/subtracting two integer/float elements.
7904 // Even-numbered elements in the input build vector are obtained from
7905 // subtracting/adding two integer/float elements.
7906 unsigned Opc[2] = {0, 0};
7907 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7908 SDValue Op = BV->getOperand(i);
7909
7910 // Skip 'undef' values.
7911 unsigned Opcode = Op.getOpcode();
7912 if (Opcode == ISD::UNDEF)
7913 continue;
7914
7915 // Early exit if we found an unexpected opcode.
7916 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
7917 return false;
7918
7919 SDValue Op0 = Op.getOperand(0);
7920 SDValue Op1 = Op.getOperand(1);
7921
7922 // Try to match the following pattern:
7923 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7924 // Early exit if we cannot match that sequence.
7925 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7926 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7927 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7928 Op0.getOperand(1) != Op1.getOperand(1))
7929 return false;
7930
7931 unsigned I0 = Op0.getConstantOperandVal(1);
7932 if (I0 != i)
7933 return false;
7934
7935 // We found a valid add/sub node, make sure it's the same opcode as previous
7936 // elements for this parity.
7937 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
7938 return false;
7939 Opc[i % 2] = Opcode;
7940
7941 // Update InVec0 and InVec1.
7942 if (InVec0.isUndef()) {
7943 InVec0 = Op0.getOperand(0);
7944 if (InVec0.getSimpleValueType() != VT)
7945 return false;
7946 }
7947 if (InVec1.isUndef()) {
7948 InVec1 = Op1.getOperand(0);
7949 if (InVec1.getSimpleValueType() != VT)
7950 return false;
7951 }
7952
7953 // Make sure that the operands of each add/sub node always
7954 // come from the same pair of vectors.
7955 if (InVec0 != Op0.getOperand(0)) {
7956 if (Opcode == ISD::FSUB)
7957 return false;
7958
7959 // FADD is commutable. Try to commute the operands
7960 // and then test again.
7961 std::swap(Op0, Op1);
7962 if (InVec0 != Op0.getOperand(0))
7963 return false;
7964 }
7965
7966 if (InVec1 != Op1.getOperand(0))
7967 return false;
7968
7969 // Increment the number of extractions done.
7970 ++NumExtracts;
7971 }
7972
7973 // Ensure we have found an opcode for both parities and that they are
7974 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
7975 // inputs are undef.
7976 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
7977 InVec0.isUndef() || InVec1.isUndef())
7978 return false;
7979
7980 IsSubAdd = Opc[0] == ISD::FADD;
7981
7982 Opnd0 = InVec0;
7983 Opnd1 = InVec1;
7984 return true;
7985}
7986
7987/// Returns true if it is possible to fold MUL and an idiom that has already been
7988/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
7989/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
7990/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
7991///
7992/// Prior to calling this function it should be known that there is some
7993/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7994/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7995/// before replacement of such SDNode with ADDSUB operation. Thus the number
7996/// of \p Opnd0 uses is expected to be equal to 2.
7997/// For example, this function may be called for the following IR:
7998/// %AB = fmul fast <2 x double> %A, %B
7999/// %Sub = fsub fast <2 x double> %AB, %C
8000/// %Add = fadd fast <2 x double> %AB, %C
8001/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8002/// <2 x i32> <i32 0, i32 3>
8003/// There is a def for %Addsub here, which potentially can be replaced by
8004/// X86ISD::ADDSUB operation:
8005/// %Addsub = X86ISD::ADDSUB %AB, %C
8006/// and such ADDSUB can further be replaced with FMADDSUB:
8007/// %Addsub = FMADDSUB %A, %B, %C.
8008///
8009/// The main reason why this method is called before the replacement of the
8010/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8011/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8012/// FMADDSUB is.
8013static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8014 SelectionDAG &DAG,
8015 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
8016 unsigned ExpectedUses) {
8017 if (Opnd0.getOpcode() != ISD::FMUL ||
8018 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8019 return false;
8020
8021 // FIXME: These checks must match the similar ones in
8022 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8023 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8024 // or MUL + ADDSUB to FMADDSUB.
8025 const TargetOptions &Options = DAG.getTarget().Options;
8026 bool AllowFusion =
8027 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
8028 if (!AllowFusion)
8029 return false;
8030
8031 Opnd2 = Opnd1;
8032 Opnd1 = Opnd0.getOperand(1);
8033 Opnd0 = Opnd0.getOperand(0);
8034
8035 return true;
8036}
8037
8038/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8039/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8040/// X86ISD::FMSUBADD node.
8041static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8042 const SDLoc &DL,
8043 const X86Subtarget &Subtarget,
8044 SelectionDAG &DAG) {
8045 SDValue Opnd0, Opnd1;
8046 unsigned NumExtracts;
8047 bool IsSubAdd;
8048 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
8049 IsSubAdd))
8050 return SDValue();
8051
8052 MVT VT = BV->getSimpleValueType(0);
8053
8054 // Try to generate X86ISD::FMADDSUB node here.
8055 SDValue Opnd2;
8056 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
8057 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8058 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8059 }
8060
8061 // We only support ADDSUB.
8062 if (IsSubAdd)
8063 return SDValue();
8064
8065 // There are no known X86 targets with 512-bit ADDSUB instructions!
8066 // Convert to blend(fsub,fadd).
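  // For example (illustrative sketch), for v8f64 the shuffle mask built below
  // is <0,9,2,11,4,13,6,15>: even result lanes come from the FSUB node and odd
  // result lanes from the FADD node.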
8067 if (VT.is512BitVector()) {
8068 SmallVector<int> Mask;
8069 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8070 Mask.push_back(I);
8071 Mask.push_back(I + E + 1);
8072 }
8073 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8074 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8075 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8076 }
8077
8078 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8079}
8080
8081static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8082 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8083 // Initialize outputs to known values.
8084 MVT VT = BV->getSimpleValueType(0);
8085 HOpcode = ISD::DELETED_NODE;
8086 V0 = DAG.getUNDEF(VT);
8087 V1 = DAG.getUNDEF(VT);
8088
8089 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8090 // half of the result is calculated independently from the 128-bit halves of
8091 // the inputs, so that makes the index-checking logic below more complicated.
8092 unsigned NumElts = VT.getVectorNumElements();
8093 unsigned GenericOpcode = ISD::DELETED_NODE;
8094 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8095 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8096 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8097 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8098 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8099 // Ignore undef elements.
8100 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8101 if (Op.isUndef())
8102 continue;
8103
8104 // If there's an opcode mismatch, we're done.
8105 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8106 return false;
8107
8108 // Initialize horizontal opcode.
8109 if (HOpcode == ISD::DELETED_NODE) {
8110 GenericOpcode = Op.getOpcode();
8111 switch (GenericOpcode) {
8112 // clang-format off
8113 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8114 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8115 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8116 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8117 default: return false;
8118 // clang-format on
8119 }
8120 }
8121
8122 SDValue Op0 = Op.getOperand(0);
8123 SDValue Op1 = Op.getOperand(1);
8124 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8125 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8126 Op0.getOperand(0) != Op1.getOperand(0) ||
8127 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8128 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8129 return false;
8130
8131 // The source vector is chosen based on which 64-bit half of the
8132 // destination vector is being calculated.
8133 if (j < NumEltsIn64Bits) {
8134 if (V0.isUndef())
8135 V0 = Op0.getOperand(0);
8136 } else {
8137 if (V1.isUndef())
8138 V1 = Op0.getOperand(0);
8139 }
8140
8141 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8142 if (SourceVec != Op0.getOperand(0))
8143 return false;
8144
8145 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8146 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8147 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8148 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8149 (j % NumEltsIn64Bits) * 2;
8150 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8151 continue;
8152
8153 // If this is not a commutative op, this does not match.
8154 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8155 return false;
8156
8157 // Addition is commutative, so try swapping the extract indexes.
8158 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8159 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8160 continue;
8161
8162 // Extract indexes do not match horizontal requirement.
8163 return false;
8164 }
8165 }
8166 // We matched. Opcode and operands are returned by reference as arguments.
8167 return true;
8168}
8169
8170static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8171 const SDLoc &DL, SelectionDAG &DAG,
8172 unsigned HOpcode, SDValue V0, SDValue V1) {
8173 // If either input vector is not the same size as the build vector,
8174 // extract/insert the low bits to the correct size.
8175 // This is free (examples: zmm --> xmm, xmm --> ymm).
8176 MVT VT = BV->getSimpleValueType(0);
8177 unsigned Width = VT.getSizeInBits();
8178 if (V0.getValueSizeInBits() > Width)
8179 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8180 else if (V0.getValueSizeInBits() < Width)
8181 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8182
8183 if (V1.getValueSizeInBits() > Width)
8184 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8185 else if (V1.getValueSizeInBits() < Width)
8186 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8187
8188 unsigned NumElts = VT.getVectorNumElements();
8189 APInt DemandedElts = APInt::getAllOnes(NumElts);
8190 for (unsigned i = 0; i != NumElts; ++i)
8191 if (BV->getOperand(i).isUndef())
8192 DemandedElts.clearBit(i);
8193
8194 // If we don't need the upper xmm, then perform as a xmm hop.
8195 unsigned HalfNumElts = NumElts / 2;
8196 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8197 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8198 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8199 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8200 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8201 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8202 }
8203
8204 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8205}
8206
8207/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
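/// For example (illustrative sketch), for v4f32 with SSE3 a build_vector:
///   (fadd (extract X,0), (extract X,1)), (fadd (extract X,2), (extract X,3)),
///   (fadd (extract Y,0), (extract Y,1)), (fadd (extract Y,2), (extract Y,3))
/// lowers to a single X86ISD::FHADD X, Y (haddps).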
8208static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8209 const X86Subtarget &Subtarget,
8210 SelectionDAG &DAG) {
8211 // We need at least 2 non-undef elements to make this worthwhile by default.
8212 unsigned NumNonUndefs =
8213 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8214 if (NumNonUndefs < 2)
8215 return SDValue();
8216
8217 // There are 4 sets of horizontal math operations distinguished by type:
8218 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8219 // subtarget feature. Try to match those "native" patterns first.
8220 MVT VT = BV->getSimpleValueType(0);
8221 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8222 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8223 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8224 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8225 unsigned HOpcode;
8226 SDValue V0, V1;
8227 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8228 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8229 }
8230
8231 // Try harder to match 256-bit ops by using extract/concat.
8232 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8233 return SDValue();
8234
8235 // Count the number of UNDEF operands in the input build_vector.
8236 unsigned NumElts = VT.getVectorNumElements();
8237 unsigned Half = NumElts / 2;
8238 unsigned NumUndefsLO = 0;
8239 unsigned NumUndefsHI = 0;
8240 for (unsigned i = 0, e = Half; i != e; ++i)
8241 if (BV->getOperand(i)->isUndef())
8242 NumUndefsLO++;
8243
8244 for (unsigned i = Half, e = NumElts; i != e; ++i)
8245 if (BV->getOperand(i)->isUndef())
8246 NumUndefsHI++;
8247
8248 SDValue InVec0, InVec1;
8249 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8250 SDValue InVec2, InVec3;
8251 unsigned X86Opcode;
8252 bool CanFold = true;
8253
8254 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8255 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8256 InVec3) &&
8257 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8258 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8259 X86Opcode = X86ISD::HADD;
8260 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8261 InVec1) &&
8262 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8263 InVec3) &&
8264 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8265 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8266 X86Opcode = X86ISD::HSUB;
8267 else
8268 CanFold = false;
8269
8270 if (CanFold) {
8271 // Do not try to expand this build_vector into a pair of horizontal
8272 // add/sub if we can emit a pair of scalar add/sub.
8273 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8274 return SDValue();
8275
8276 // Convert this build_vector into a pair of horizontal binops followed by
8277 // a concat vector. We must adjust the outputs from the partial horizontal
8278 // matching calls above to account for undefined vector halves.
8279 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8280 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8281 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8282 bool isUndefLO = NumUndefsLO == Half;
8283 bool isUndefHI = NumUndefsHI == Half;
8284 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8285 isUndefHI);
8286 }
8287 }
8288
8289 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8290 VT == MVT::v16i16) {
8291 unsigned X86Opcode;
8292 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8293 InVec1))
8294 X86Opcode = X86ISD::HADD;
8295 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8296 InVec1))
8297 X86Opcode = X86ISD::HSUB;
8298 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8299 InVec1))
8300 X86Opcode = X86ISD::FHADD;
8301 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8302 InVec1))
8303 X86Opcode = X86ISD::FHSUB;
8304 else
8305 return SDValue();
8306
8307 // Don't try to expand this build_vector into a pair of horizontal add/sub
8308 // if we can simply emit a pair of scalar add/sub.
8309 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8310 return SDValue();
8311
8312 // Convert this build_vector into two horizontal add/sub followed by
8313 // a concat vector.
8314 bool isUndefLO = NumUndefsLO == Half;
8315 bool isUndefHI = NumUndefsHI == Half;
8316 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8317 isUndefLO, isUndefHI);
8318 }
8319
8320 return SDValue();
8321}
8322
8323static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8324 SelectionDAG &DAG);
8325
8326/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8327/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8328/// just apply the bit to the vectors.
8329/// NOTE: It's not in our interest to start making a general-purpose vectorizer
8330/// from this, but enough scalar bit operations are created from the later
8331/// legalization + scalarization stages to need basic support.
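/// For example (illustrative sketch):
///   build_vector (and x0, 1), (and x1, 2), (and x2, 4), (and x3, 8)
/// becomes:
///   and (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8)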
8332static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8333 const X86Subtarget &Subtarget,
8334 SelectionDAG &DAG) {
8335 MVT VT = Op->getSimpleValueType(0);
8336 unsigned NumElems = VT.getVectorNumElements();
8337 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8338
8339 // Check that all elements have the same opcode.
8340 // TODO: Should we allow UNDEFS and if so how many?
8341 unsigned Opcode = Op->getOperand(0).getOpcode();
8342 for (unsigned i = 1; i < NumElems; ++i)
8343 if (Opcode != Op->getOperand(i).getOpcode())
8344 return SDValue();
8345
8346 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8347 bool IsShift = false;
8348 switch (Opcode) {
8349 default:
8350 return SDValue();
8351 case ISD::SHL:
8352 case ISD::SRL:
8353 case ISD::SRA:
8354 IsShift = true;
8355 break;
8356 case ISD::AND:
8357 case ISD::XOR:
8358 case ISD::OR:
8359 // Don't do this if the buildvector is a splat - we'd replace one
8360 // constant with an entire vector.
8361 if (Op->getSplatValue())
8362 return SDValue();
8363 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8364 return SDValue();
8365 break;
8366 }
8367
8368 SmallVector<SDValue, 4> LHSElts, RHSElts;
8369 for (SDValue Elt : Op->ops()) {
8370 SDValue LHS = Elt.getOperand(0);
8371 SDValue RHS = Elt.getOperand(1);
8372
8373 // We expect the canonicalized RHS operand to be the constant.
8374 if (!isa<ConstantSDNode>(RHS))
8375 return SDValue();
8376
8377 // Extend shift amounts.
8378 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8379 if (!IsShift)
8380 return SDValue();
8381 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8382 }
8383
8384 LHSElts.push_back(LHS);
8385 RHSElts.push_back(RHS);
8386 }
8387
8388 // Limit to shifts by uniform immediates.
8389 // TODO: Only accept vXi8/vXi64 special cases?
8390 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8391 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8392 return SDValue();
8393
8394 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8395 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8396 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8397
8398 if (!IsShift)
8399 return Res;
8400
8401 // Immediately lower the shift to ensure the constant build vector doesn't
8402 // get converted to a constant pool before the shift is lowered.
8403 return LowerShift(Res, Subtarget, DAG);
8404}
8405
8406/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8407/// functionality to do this, so it's all zeros, all ones, or some derivation
8408/// that is cheap to calculate.
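/// For example (illustrative sketch), with SSE2 an all-ones v4i32
/// build_vector is returned unchanged here and later matched to
/// 'pcmpeqd xmm, xmm' instead of being loaded from the constant pool.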
8409static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8410 SelectionDAG &DAG,
8411 const X86Subtarget &Subtarget) {
8412 MVT VT = Op.getSimpleValueType();
8413
8414 // Vectors containing all zeros can be matched by pxor and xorps.
8415 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8416 return Op;
8417
8418 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8419 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8420 // vpcmpeqd on 256-bit vectors.
8421 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8422 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8423 return Op;
8424
8425 return getOnesVector(VT, DAG, DL);
8426 }
8427
8428 return SDValue();
8429}
8430
8431/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8432/// from a vector of source values and a vector of extraction indices.
8433/// The vectors might be manipulated to match the type of the permute op.
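/// For example (illustrative sketch), for v4i32 with AVX the permute is
/// performed as:
///   bitcast v4i32 (X86ISD::VPERMILPV (bitcast v4f32 SrcVec), IndicesVec)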
8434static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8435 const SDLoc &DL, SelectionDAG &DAG,
8436 const X86Subtarget &Subtarget) {
8437 MVT ShuffleVT = VT;
8438 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8439 unsigned NumElts = VT.getVectorNumElements();
8440 unsigned SizeInBits = VT.getSizeInBits();
8441
8442 // Adjust IndicesVec to match VT size.
8443 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8444 "Illegal variable permute mask size");
8445 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8446 // Narrow/widen the indices vector to the correct size.
8447 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8448 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8449 NumElts * VT.getScalarSizeInBits());
8450 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8451 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8452 SDLoc(IndicesVec), SizeInBits);
8453 // Zero-extend the index elements within the vector.
8454 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8455 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8456 IndicesVT, IndicesVec);
8457 }
8458 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8459
8460 // Handle SrcVec that don't match VT type.
8461 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8462 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8463 // Handle larger SrcVec by treating it as a larger permute.
8464 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8465 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8466 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8467 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8468 Subtarget, DAG, SDLoc(IndicesVec));
8469 SDValue NewSrcVec =
8470 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8471 if (NewSrcVec)
8472 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8473 return SDValue();
8474 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8475 // Widen smaller SrcVec to match VT.
8476 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8477 } else
8478 return SDValue();
8479 }
8480
8481 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8482 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8483 EVT SrcVT = Idx.getValueType();
8484 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8485 uint64_t IndexScale = 0;
8486 uint64_t IndexOffset = 0;
8487
8488 // If we're scaling a smaller permute op, then we need to repeat the
8489 // indices, scaling and offsetting them as well.
8490 // e.g. v4i32 -> v16i8 (Scale = 4)
8491 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8492 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8493 for (uint64_t i = 0; i != Scale; ++i) {
8494 IndexScale |= Scale << (i * NumDstBits);
8495 IndexOffset |= i << (i * NumDstBits);
8496 }
8497
8498 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8499 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8500 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8501 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8502 return Idx;
8503 };
8504
8505 unsigned Opcode = 0;
8506 switch (VT.SimpleTy) {
8507 default:
8508 break;
8509 case MVT::v16i8:
8510 if (Subtarget.hasSSSE3())
8511 Opcode = X86ISD::PSHUFB;
8512 break;
8513 case MVT::v8i16:
8514 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8515 Opcode = X86ISD::VPERMV;
8516 else if (Subtarget.hasSSSE3()) {
8517 Opcode = X86ISD::PSHUFB;
8518 ShuffleVT = MVT::v16i8;
8519 }
8520 break;
8521 case MVT::v4f32:
8522 case MVT::v4i32:
8523 if (Subtarget.hasAVX()) {
8524 Opcode = X86ISD::VPERMILPV;
8525 ShuffleVT = MVT::v4f32;
8526 } else if (Subtarget.hasSSSE3()) {
8527 Opcode = X86ISD::PSHUFB;
8528 ShuffleVT = MVT::v16i8;
8529 }
8530 break;
8531 case MVT::v2f64:
8532 case MVT::v2i64:
8533 if (Subtarget.hasAVX()) {
8534 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8535 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8536 Opcode = X86ISD::VPERMILPV;
8537 ShuffleVT = MVT::v2f64;
8538 } else if (Subtarget.hasSSE41()) {
8539 // SSE41 can compare v2i64 - select between indices 0 and 1.
8540 return DAG.getSelectCC(
8541 DL, IndicesVec,
8542 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8543 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8544 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8545 ISD::SETEQ);
8546 }
8547 break;
8548 case MVT::v32i8:
8549 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8550 Opcode = X86ISD::VPERMV;
8551 else if (Subtarget.hasXOP()) {
8552 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8553 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8554 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8555 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8556 return DAG.getNode(
8557 ISD::CONCAT_VECTORS, DL, MVT::v32i8,
8558 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8559 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8560 } else if (Subtarget.hasAVX()) {
8561 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8562 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8563 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8564 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8565 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8566 ArrayRef<SDValue> Ops) {
8567 // Permute Lo and Hi and then select based on index range.
8568 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
8569 // care about bit[7] as it's just an index vector.
8570 SDValue Idx = Ops[2];
8571 EVT VT = Idx.getValueType();
8572 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8573 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8574 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8575 ISD::SETGT);
8576 };
8577 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8578 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8579 PSHUFBBuilder);
8580 }
8581 break;
8582 case MVT::v16i16:
8583 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8584 Opcode = X86ISD::VPERMV;
8585 else if (Subtarget.hasAVX()) {
8586 // Scale to v32i8 and perform as v32i8.
8587 IndicesVec = ScaleIndices(IndicesVec, 2);
8588 return DAG.getBitcast(
8589 VT, createVariablePermute(
8590 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8591 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8592 }
8593 break;
8594 case MVT::v8f32:
8595 case MVT::v8i32:
8596 if (Subtarget.hasAVX2())
8597 Opcode = X86ISD::VPERMV;
8598 else if (Subtarget.hasAVX()) {
8599 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8600 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8601 {0, 1, 2, 3, 0, 1, 2, 3});
8602 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8603 {4, 5, 6, 7, 4, 5, 6, 7});
8604 if (Subtarget.hasXOP())
8605 return DAG.getBitcast(
8606 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
8607 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8608 // Permute Lo and Hi and then select based on index range.
8609 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8610 SDValue Res = DAG.getSelectCC(
8611 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8612 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8613 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8614 ISD::SETGT);
8615 return DAG.getBitcast(VT, Res);
8616 }
8617 break;
8618 case MVT::v4i64:
8619 case MVT::v4f64:
8620 if (Subtarget.hasAVX512()) {
8621 if (!Subtarget.hasVLX()) {
8622 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8623 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8624 SDLoc(SrcVec));
8625 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8626 DAG, SDLoc(IndicesVec));
8627 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8628 DAG, Subtarget);
8629 return extract256BitVector(Res, 0, DAG, DL);
8630 }
8631 Opcode = X86ISD::VPERMV;
8632 } else if (Subtarget.hasAVX()) {
8633 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8634 SDValue LoLo =
8635 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8636 SDValue HiHi =
8637 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8638 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8639 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8640 if (Subtarget.hasXOP())
8641 return DAG.getBitcast(
8642 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
8643 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8644 // Permute Lo and Hi and then select based on index range.
8645 // This works as VPERMILPD only uses index bit[1] to permute elements.
8646 SDValue Res = DAG.getSelectCC(
8647 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8648 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8649 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8650 ISD::SETGT);
8651 return DAG.getBitcast(VT, Res);
8652 }
8653 break;
8654 case MVT::v64i8:
8655 if (Subtarget.hasVBMI())
8656 Opcode = X86ISD::VPERMV;
8657 break;
8658 case MVT::v32i16:
8659 if (Subtarget.hasBWI())
8660 Opcode = X86ISD::VPERMV;
8661 break;
8662 case MVT::v16f32:
8663 case MVT::v16i32:
8664 case MVT::v8f64:
8665 case MVT::v8i64:
8666 if (Subtarget.hasAVX512())
8667 Opcode = X86ISD::VPERMV;
8668 break;
8669 }
8670 if (!Opcode)
8671 return SDValue();
8672
8673 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8674 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8675 "Illegal variable permute shuffle type");
8676
8677 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8678 if (Scale > 1)
8679 IndicesVec = ScaleIndices(IndicesVec, Scale);
8680
8681 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8682 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8683
8684 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8685 SDValue Res = Opcode == X86ISD::VPERMV
8686 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8687 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8688 return DAG.getBitcast(VT, Res);
8689}
8690
8691// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8692// reasoned to be a permutation of a vector by indices in a non-constant vector.
8693// (build_vector (extract_elt V, (extract_elt I, 0)),
8694// (extract_elt V, (extract_elt I, 1)),
8695// ...
8696// ->
8697// (vpermv I, V)
8698//
8699// TODO: Handle undefs
8700// TODO: Utilize pshufb and zero mask blending to support more efficient
8701// construction of vectors with constant-0 elements.
8702static SDValue
8703LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
8704 SelectionDAG &DAG,
8705 const X86Subtarget &Subtarget) {
8706 SDValue SrcVec, IndicesVec;
8707 // Check for a match of the permute source vector and permute index elements.
8708 // This is done by checking that the i-th build_vector operand is of the form:
8709 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8710 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8711 SDValue Op = V.getOperand(Idx);
8712 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8713 return SDValue();
8714
8715 // If this is the first extract encountered in V, set the source vector,
8716 // otherwise verify the extract is from the previously defined source
8717 // vector.
8718 if (!SrcVec)
8719 SrcVec = Op.getOperand(0);
8720 else if (SrcVec != Op.getOperand(0))
8721 return SDValue();
8722 SDValue ExtractedIndex = Op->getOperand(1);
8723 // Peek through extends.
8724 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8725 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8726 ExtractedIndex = ExtractedIndex.getOperand(0);
8727 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8728 return SDValue();
8729
8730 // If this is the first extract from the index vector candidate, set the
8731 // indices vector, otherwise verify the extract is from the previously
8732 // defined indices vector.
8733 if (!IndicesVec)
8734 IndicesVec = ExtractedIndex.getOperand(0);
8735 else if (IndicesVec != ExtractedIndex.getOperand(0))
8736 return SDValue();
8737
8738 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8739 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
8740 return SDValue();
8741 }
8742
8743 MVT VT = V.getSimpleValueType();
8744 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8745}
8746
8747SDValue
8748X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8749 SDLoc dl(Op);
8750
8751 MVT VT = Op.getSimpleValueType();
8752 MVT EltVT = VT.getVectorElementType();
8753 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
8754 unsigned NumElems = Op.getNumOperands();
8755
8756 // Generate vectors for predicate vectors.
8757 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
8758 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
8759
8760 if (VT.getVectorElementType() == MVT::bf16 &&
8761 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
8762 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
8763
8764 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
8765 return VectorCst;
8766
8767 unsigned EVTBits = EltVT.getSizeInBits();
8768 APInt UndefMask = APInt::getZero(NumElems);
8769 APInt FrozenUndefMask = APInt::getZero(NumElems);
8770 APInt ZeroMask = APInt::getZero(NumElems);
8771 APInt NonZeroMask = APInt::getZero(NumElems);
8772 bool IsAllConstants = true;
8773 bool OneUseFrozenUndefs = true;
8774 SmallSet<SDValue, 8> Values;
8775 unsigned NumConstants = NumElems;
8776 for (unsigned i = 0; i < NumElems; ++i) {
8777 SDValue Elt = Op.getOperand(i);
8778 if (Elt.isUndef()) {
8779 UndefMask.setBit(i);
8780 continue;
8781 }
8782 if (ISD::isFreezeUndef(Elt.getNode())) {
8783 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
8784 FrozenUndefMask.setBit(i);
8785 continue;
8786 }
8787 Values.insert(Elt);
8788 if (!isIntOrFPConstant(Elt)) {
8789 IsAllConstants = false;
8790 NumConstants--;
8791 }
8792 if (X86::isZeroNode(Elt)) {
8793 ZeroMask.setBit(i);
8794 } else {
8795 NonZeroMask.setBit(i);
8796 }
8797 }
8798
8799 // All undef vector. Return an UNDEF.
8800 if (UndefMask.isAllOnes())
8801 return DAG.getUNDEF(VT);
8802
8803 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
8804 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
8805 return DAG.getFreeze(DAG.getUNDEF(VT));
8806
8807 // All undef/freeze(undef)/zero vector. Return a zero vector.
8808 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
8809 return getZeroVector(VT, Subtarget, DAG, dl);
8810
8811 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
8812 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
8813 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
8814 // and blend the FREEZE-UNDEF operands back in.
8815 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
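// Editorial illustration (not upstream): (build_vector a, freeze(undef), b,
// freeze(undef)) becomes a blend of (build_vector a, undef, b, undef) with a
// freeze(undef) splat build_vector, using the shuffle mask <0, 5, 2, 7>.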
8816 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
8817 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
8818 SmallVector<int, 16> BlendMask(NumElems, -1);
8819 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
8820 for (unsigned i = 0; i < NumElems; ++i) {
8821 if (UndefMask[i]) {
8822 BlendMask[i] = -1;
8823 continue;
8824 }
8825 BlendMask[i] = i;
8826 if (!FrozenUndefMask[i])
8827 Elts[i] = Op.getOperand(i);
8828 else
8829 BlendMask[i] += NumElems;
8830 }
8831 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
8832 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
8833 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
8834 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
8835 }
8836
8837 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
8838
8839 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
8840 // be better off lowering to a smaller build vector and padding with
8841 // undef/zero.
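// Editorial illustration (not upstream): a v8i32 build_vector whose upper
// four operands are all undef/zero is rebuilt as a v4i32 build_vector of the
// low operands and widened back to v8i32 with widenSubVector, padding with
// zeros unless every upper element is plain undef.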
8842 if ((VT.is256BitVector() || VT.is512BitVector()) &&
8843 !isFoldableUseOfShuffle(BV)) {
8844 unsigned UpperElems = NumElems / 2;
8845 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
8846 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
8847 if (NumUpperUndefsOrZeros >= UpperElems) {
8848 if (VT.is512BitVector() &&
8849 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
8850 UpperElems = NumElems - (NumElems / 4);
8851 // If freeze(undef) is in any upper elements, force to zero.
8852 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
8853 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
8854 SDValue NewBV =
8855 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
8856 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
8857 }
8858 }
8859
8860 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
8861 return AddSub;
8862 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
8863 return HorizontalOp;
8864 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
8865 return Broadcast;
8866 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
8867 return BitOp;
8868
8869 unsigned NumZero = ZeroMask.popcount();
8870 unsigned NumNonZero = NonZeroMask.popcount();
8871
8872 // If we are inserting one variable into a vector of non-zero constants, try
8873 // to avoid loading each constant element as a scalar. Load the constants as a
8874 // vector and then insert the variable scalar element. If insertion is not
8875 // supported, fall back to a shuffle to get the scalar blended with the
8876 // constants. Insertion into a zero vector is handled as a special-case
8877 // somewhere below here.
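// Editorial illustration (not upstream): for (build_vector 1.0, 2.0, X, 4.0)
// with a single non-constant X, the code below loads <1.0, 2.0, undef, 4.0>
// from the constant pool and performs one INSERT_VECTOR_ELT of X at index 2
// (or a shuffle-based blend if the index lies above the low 128 bits).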
8878 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
8879 FrozenUndefMask.isZero() &&
8880 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
8881 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
8882 // Create an all-constant vector. The variable element in the old
8883 // build vector is replaced by undef in the constant vector. Save the
8884 // variable scalar element and its index for use in the insertelement.
8885 LLVMContext &Context = *DAG.getContext();
8886 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
8887 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
8888 SDValue VarElt;
8889 SDValue InsIndex;
8890 for (unsigned i = 0; i != NumElems; ++i) {
8891 SDValue Elt = Op.getOperand(i);
8892 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
8893 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
8894 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
8895 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
8896 else if (!Elt.isUndef()) {
8897 assert(!VarElt.getNode() && !InsIndex.getNode() &&
8898 "Expected one variable element in this vector");
8899 VarElt = Elt;
8900 InsIndex = DAG.getVectorIdxConstant(i, dl);
8901 }
8902 }
8903 Constant *CV = ConstantVector::get(ConstVecOps);
8904 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
8905
8906 // The constants we just created may not be legal (eg, floating point). We
8907 // must lower the vector right here because we can not guarantee that we'll
8908 // legalize it before loading it. This is also why we could not just create
8909 // a new build vector here. If the build vector contains illegal constants,
8910 // it could get split back up into a series of insert elements.
8911 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8912 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8913 MachinePointerInfo MPI =
8914 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8915 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8916 unsigned InsertC = InsIndex->getAsZExtVal();
8917 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
8918 if (InsertC < NumEltsInLow128Bits)
8919 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
8920
8921 // There's no good way to insert into the high elements of a >128-bit
8922 // vector, so use shuffles to avoid an extract/insert sequence.
8923 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
8924 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
8925 SmallVector<int, 8> ShuffleMask;
8926 unsigned NumElts = VT.getVectorNumElements();
8927 for (unsigned i = 0; i != NumElts; ++i)
8928 ShuffleMask.push_back(i == InsertC ? NumElts : i);
8929 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
8930 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
8931 }
8932
8933 // Special case for a single non-zero, non-undef element.
8934 if (NumNonZero == 1) {
8935 unsigned Idx = NonZeroMask.countr_zero();
8936 SDValue Item = Op.getOperand(Idx);
8937
8938 // If we have a constant or non-constant insertion into the low element of
8939 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8940 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8941 // depending on what the source datatype is.
8942 if (Idx == 0) {
8943 if (NumZero == 0)
8944 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8945
8946 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
8947 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
8948 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
8949 assert((VT.is128BitVector() || VT.is256BitVector() ||
8950 VT.is512BitVector()) &&
8951 "Expected an SSE value type!");
8952 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8953 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
8954 // zero vector.
8955 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8956 }
8957
8958 // We can't directly insert an i8 or i16 into a vector, so zero extend
8959 // it to i32 first.
8960 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
8961 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8962 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
8963 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8964 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8965 return DAG.getBitcast(VT, Item);
8966 }
8967 }
8968
8969 // Is it a vector logical left shift?
8970 if (NumElems == 2 && Idx == 1 &&
8971 X86::isZeroNode(Op.getOperand(0)) &&
8972 !X86::isZeroNode(Op.getOperand(1))) {
8973 unsigned NumBits = VT.getSizeInBits();
8974 return getVShift(true, VT,
8975 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8976 VT, Op.getOperand(1)),
8977 NumBits/2, DAG, *this, dl);
8978 }
8979
8980 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
8981 return SDValue();
8982
8983 // Otherwise, if this is a vector with i32 or f32 elements, and the element
8984 // is a non-constant being inserted into an element other than the low one,
8985 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
8986 // movd/movss) to move this into the low element, then shuffle it into
8987 // place.
8988 if (EVTBits == 32) {
8989 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8990 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
8991 }
8992 }
8993
8994 // Splat is obviously ok. Let legalizer expand it to a shuffle.
8995 if (Values.size() == 1) {
8996 if (EVTBits == 32) {
8997 // Instead of a shuffle like this:
8998 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
8999 // Check if it's possible to issue this instead.
9000 // shuffle (vload ptr), undef, <1, 1, 1, 1>
9001 unsigned Idx = NonZeroMask.countr_zero();
9002 SDValue Item = Op.getOperand(Idx);
9003 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9004 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9005 }
9006 return SDValue();
9007 }
9008
9009 // A vector full of immediates; various special cases are already
9010 // handled, so this is best done with a single constant-pool load.
9011 if (IsAllConstants)
9012 return SDValue();
9013
9014 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9015 return V;
9016
9017 // See if we can use a vector load to get all of the elements.
9018 {
9019 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9020 if (SDValue LD =
9021 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9022 return LD;
9023 }
9024
9025 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9026 // build_vector and broadcast it.
9027 // TODO: We could probably generalize this more.
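// Editorial illustration (not upstream): (build_vector a, b, a, b, a, b, a, b)
// with 32-bit elements becomes a v4i32/v4f32 build_vector <a, b, undef, undef>,
// bitcast to v2i64/v2f64, broadcast with X86ISD::VBROADCAST (vpbroadcastq),
// and bitcast back to the original type.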
9028 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9029 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9030 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9031 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9032 // Make sure all the even/odd operands match.
9033 for (unsigned i = 2; i != NumElems; ++i)
9034 if (Ops[i % 2] != Op.getOperand(i))
9035 return false;
9036 return true;
9037 };
9038 if (CanSplat(Op, NumElems, Ops)) {
9039 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9040 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9041 // Create a new build vector and cast to v2i64/v2f64.
9042 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9043 DAG.getBuildVector(NarrowVT, dl, Ops));
9044 // Broadcast from v2i64/v2f64 and cast to final VT.
9045 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9046 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9047 NewBV));
9048 }
9049 }
9050
9051 // For AVX-length vectors, build the individual 128-bit pieces and use
9052 // shuffles to put them in place.
9053 if (VT.getSizeInBits() > 128) {
9054 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9055
9056 // Build both the lower and upper subvector.
9057 SDValue Lower =
9058 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9059 SDValue Upper = DAG.getBuildVector(
9060 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9061
9062 // Recreate the wider vector with the lower and upper part.
9063 return concatSubVectors(Lower, Upper, DAG, dl);
9064 }
9065
9066 // Let legalizer expand 2-wide build_vectors.
9067 if (EVTBits == 64) {
9068 if (NumNonZero == 1) {
9069 // One half is zero or undef.
9070 unsigned Idx = NonZeroMask.countr_zero();
9071 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9072 Op.getOperand(Idx));
9073 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9074 }
9075 return SDValue();
9076 }
9077
9078 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9079 if (EVTBits == 8 && NumElems == 16)
9080 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9081 NumZero, DAG, Subtarget))
9082 return V;
9083
9084 if (EltVT == MVT::i16 && NumElems == 8)
9085 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9086 NumZero, DAG, Subtarget))
9087 return V;
9088
9089 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9090 if (EVTBits == 32 && NumElems == 4)
9091 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9092 return V;
9093
9094 // If element VT is == 32 bits, turn it into a number of shuffles.
9095 if (NumElems == 4 && NumZero > 0) {
9096 SmallVector<SDValue, 8> Ops(NumElems);
9097 for (unsigned i = 0; i < 4; ++i) {
9098 bool isZero = !NonZeroMask[i];
9099 if (isZero)
9100 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9101 else
9102 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9103 }
9104
9105 for (unsigned i = 0; i < 2; ++i) {
9106 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9107 default: llvm_unreachable("Unexpected NonZero count");
9108 case 0:
9109 Ops[i] = Ops[i*2]; // Must be a zero vector.
9110 break;
9111 case 1:
9112 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9113 break;
9114 case 2:
9115 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9116 break;
9117 case 3:
9118 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9119 break;
9120 }
9121 }
9122
9123 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9124 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9125 int MaskVec[] = {
9126 Reverse1 ? 1 : 0,
9127 Reverse1 ? 0 : 1,
9128 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9129 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9130 };
9131 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9132 }
9133
9134 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9135
9136 // Check for a build vector from mostly shuffle plus few inserting.
9137 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9138 return Sh;
9139
9140 // For SSE 4.1, use insertps to put the high elements into the low element.
9141 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9142 SDValue Result;
9143 if (!Op.getOperand(0).isUndef())
9144 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9145 else
9146 Result = DAG.getUNDEF(VT);
9147
9148 for (unsigned i = 1; i < NumElems; ++i) {
9149 if (Op.getOperand(i).isUndef()) continue;
9150 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9151 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
9152 }
9153 return Result;
9154 }
9155
9156 // Otherwise, expand into a number of unpckl*, start by extending each of
9157 // our (non-undef) elements to the full vector width with the element in the
9158 // bottom slot of the vector (which generates no code for SSE).
9159 SmallVector<SDValue, 8> Ops(NumElems);
9160 for (unsigned i = 0; i < NumElems; ++i) {
9161 if (!Op.getOperand(i).isUndef())
9162 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9163 else
9164 Ops[i] = DAG.getUNDEF(VT);
9165 }
9166
9167 // Next, we iteratively mix elements, e.g. for v4f32:
9168 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9169 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9170 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9171 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9172 // Generate scaled UNPCKL shuffle mask.
9173 SmallVector<int, 16> Mask;
9174 for(unsigned i = 0; i != Scale; ++i)
9175 Mask.push_back(i);
9176 for (unsigned i = 0; i != Scale; ++i)
9177 Mask.push_back(NumElems+i);
9178 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9179
9180 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9181 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9182 }
9183 return Ops[0];
9184}
9185
9186// 256-bit AVX can use the vinsertf128 instruction
9187// to create 256-bit vectors from two other 128-bit ones.
9188// TODO: Detect subvector broadcast here instead of DAG combine?
9189static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9190 const X86Subtarget &Subtarget) {
9191 SDLoc dl(Op);
9192 MVT ResVT = Op.getSimpleValueType();
9193
9194 assert((ResVT.is256BitVector() ||
9195 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
9196
9197 unsigned NumOperands = Op.getNumOperands();
9198 unsigned NumFreezeUndef = 0;
9199 unsigned NumZero = 0;
9200 unsigned NumNonZero = 0;
9201 unsigned NonZeros = 0;
9202 for (unsigned i = 0; i != NumOperands; ++i) {
9203 SDValue SubVec = Op.getOperand(i);
9204 if (SubVec.isUndef())
9205 continue;
9206 if (ISD::isFreezeUndef(SubVec.getNode())) {
9207 // If the freeze(undef) has multiple uses then we must fold to zero.
9208 if (SubVec.hasOneUse())
9209 ++NumFreezeUndef;
9210 else
9211 ++NumZero;
9212 }
9213 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9214 ++NumZero;
9215 else {
9216 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9217 NonZeros |= 1 << i;
9218 ++NumNonZero;
9219 }
9220 }
9221
9222 // If we have more than 2 non-zeros, build each half separately.
9223 if (NumNonZero > 2) {
9224 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9225 ArrayRef<SDUse> Ops = Op->ops();
9226 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9227 Ops.slice(0, NumOperands/2));
9228 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9229 Ops.slice(NumOperands/2));
9230 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9231 }
9232
9233 // Otherwise, build it up through insert_subvectors.
9234 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9235 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9236 : DAG.getUNDEF(ResVT));
9237
9238 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9239 unsigned NumSubElems = SubVT.getVectorNumElements();
9240 for (unsigned i = 0; i != NumOperands; ++i) {
9241 if ((NonZeros & (1 << i)) == 0)
9242 continue;
9243
9244 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
9245 Op.getOperand(i),
9246 DAG.getIntPtrConstant(i * NumSubElems, dl));
9247 }
9248
9249 return Vec;
9250}
9251
9252// Returns true if the given node is a type promotion (by concatenating i1
9253// zeros) of the result of a node that already zeros all upper bits of
9254// k-register.
9255// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9256static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
9257 const X86Subtarget &Subtarget,
9258 SelectionDAG & DAG) {
9259 SDLoc dl(Op);
9260 MVT ResVT = Op.getSimpleValueType();
9261 unsigned NumOperands = Op.getNumOperands();
9262
9263 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9264 "Unexpected number of operands in CONCAT_VECTORS");
9265
9266 uint64_t Zeros = 0;
9267 uint64_t NonZeros = 0;
9268 for (unsigned i = 0; i != NumOperands; ++i) {
9269 SDValue SubVec = Op.getOperand(i);
9270 if (SubVec.isUndef())
9271 continue;
9272 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9273 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9274 Zeros |= (uint64_t)1 << i;
9275 else
9276 NonZeros |= (uint64_t)1 << i;
9277 }
9278
9279 unsigned NumElems = ResVT.getVectorNumElements();
9280
9281 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9282 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9283 // insert_subvector will give us two kshifts.
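// Editorial illustration (not upstream): for (v16i1 concat_vectors zero, X,
// undef, undef) with v4i1 operands, NonZeros = 0b0010 and Zeros = 0b0001, so
// X is widened and shifted left by 4 with a single KSHIFTL instead of the
// two kshifts that generic insert_subvector lowering would produce.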
9284 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9285 Log2_64(NonZeros) != NumOperands - 1) {
9286 unsigned Idx = Log2_64(NonZeros);
9287 SDValue SubVec = Op.getOperand(Idx);
9288 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9289 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9290 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9291 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9292 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9293 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9294 DAG.getIntPtrConstant(0, dl));
9295 }
9296
9297 // If there are zero or one non-zeros we can handle this very simply.
9298 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9299 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9300 if (!NonZeros)
9301 return Vec;
9302 unsigned Idx = Log2_64(NonZeros);
9303 SDValue SubVec = Op.getOperand(Idx);
9304 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9305 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9306 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
9307 }
9308
9309 if (NumOperands > 2) {
9310 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9311 ArrayRef<SDUse> Ops = Op->ops();
9312 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9313 Ops.slice(0, NumOperands/2));
9314 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9315 Ops.slice(NumOperands/2));
9316 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9317 }
9318
9319 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9320
9321 if (ResVT.getVectorNumElements() >= 16)
9322 return Op; // The operation is legal with KUNPCK
9323
9324 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
9325 DAG.getUNDEF(ResVT), Op.getOperand(0),
9326 DAG.getIntPtrConstant(0, dl));
9327 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9328 DAG.getIntPtrConstant(NumElems/2, dl));
9329}
9330
9331static SDValue LowerCONCAT_VECTORS(SDValue Op,
9332 const X86Subtarget &Subtarget,
9333 SelectionDAG &DAG) {
9334 MVT VT = Op.getSimpleValueType();
9335 if (VT.getVectorElementType() == MVT::i1)
9336 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
9337
9338 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9339 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
9340 Op.getNumOperands() == 4)));
9341
9342 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9343 // from two other 128-bit ones.
9344
9345 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9346 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
9347}
9348
9349//===----------------------------------------------------------------------===//
9350// Vector shuffle lowering
9351//
9352// This is an experimental code path for lowering vector shuffles on x86. It is
9353// designed to handle arbitrary vector shuffles and blends, gracefully
9354// degrading performance as necessary. It works hard to recognize idiomatic
9355// shuffles and lower them to optimal instruction patterns without leaving
9356// a framework that allows reasonably efficient handling of all vector shuffle
9357// patterns.
9358//===----------------------------------------------------------------------===//
9359
9360/// Tiny helper function to identify a no-op mask.
9361///
9362/// This is a somewhat boring predicate function. It checks whether the mask
9363/// array input, which is assumed to be a single-input shuffle mask of the kind
9364/// used by the X86 shuffle instructions (not a fully general
9365/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9366/// in-place shuffle are 'no-op's.
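/// Editorial example (not upstream): <0, -1, 2, 3> is a no-op mask (every
/// element is undef or stays in place), while <0, 0, 2, 3> is not, because
/// element 1 would be sourced from element 0.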
9367static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9368 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9369 assert(Mask[i] >= -1 && "Out of bound mask element!");
9370 if (Mask[i] >= 0 && Mask[i] != i)
9371 return false;
9372 }
9373 return true;
9374}
9375
9376/// Test whether there are elements crossing LaneSizeInBits lanes in this
9377/// shuffle mask.
9378///
9379/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9380/// and we routinely test for these.
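/// Editorial example (not upstream): for v8f32 (two 128-bit lanes of four
/// elements), <0,1,2,3,4,5,6,7> stays within lanes, whereas
/// <4,5,6,7,0,1,2,3> crosses them because element 0 is sourced from lane 1.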
9381static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9382 unsigned ScalarSizeInBits,
9383 ArrayRef<int> Mask) {
9384 assert(LaneSizeInBits && ScalarSizeInBits &&
9385 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9386 "Illegal shuffle lane size");
9387 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9388 int Size = Mask.size();
9389 for (int i = 0; i < Size; ++i)
9390 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9391 return true;
9392 return false;
9393}
9394
9395/// Test whether there are elements crossing 128-bit lanes in this
9396/// shuffle mask.
9397static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9398 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9399}
9400
9401/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9402/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9403/// better support 'repeated mask + lane permute' style shuffles.
9404static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9405 unsigned ScalarSizeInBits,
9406 ArrayRef<int> Mask) {
9407 assert(LaneSizeInBits && ScalarSizeInBits &&
9408 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9409 "Illegal shuffle lane size");
9410 int NumElts = Mask.size();
9411 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9412 int NumLanes = NumElts / NumEltsPerLane;
9413 if (NumLanes > 1) {
9414 for (int i = 0; i != NumLanes; ++i) {
9415 int SrcLane = -1;
9416 for (int j = 0; j != NumEltsPerLane; ++j) {
9417 int M = Mask[(i * NumEltsPerLane) + j];
9418 if (M < 0)
9419 continue;
9420 int Lane = (M % NumElts) / NumEltsPerLane;
9421 if (SrcLane >= 0 && SrcLane != Lane)
9422 return true;
9423 SrcLane = Lane;
9424 }
9425 }
9426 }
9427 return false;
9428}
9429
9430/// Test whether a shuffle mask is equivalent within each sub-lane.
9431///
9432/// This checks a shuffle mask to see if it is performing the same
9433/// lane-relative shuffle in each sub-lane. This trivially implies
9434/// that it is also not lane-crossing. It may however involve a blend from the
9435/// same lane of a second vector.
9436///
9437/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9438/// non-trivial to compute in the face of undef lanes. The representation is
9439/// suitable for use with existing 128-bit shuffles as entries from the second
9440/// vector have been remapped to [LaneSize, 2*LaneSize).
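/// Editorial example (not upstream): for v8i32 the unpcklps-style mask
/// <0,8,1,9,4,12,5,13> repeats <0,4,1,5> in both 128-bit lanes (second-input
/// indices remapped into [4,8)), while <0,8,1,9,4,12,5,14> does not repeat
/// and is rejected.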
9441static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9442 ArrayRef<int> Mask,
9443 SmallVectorImpl<int> &RepeatedMask) {
9444 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9445 RepeatedMask.assign(LaneSize, -1);
9446 int Size = Mask.size();
9447 for (int i = 0; i < Size; ++i) {
9448 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9449 if (Mask[i] < 0)
9450 continue;
9451 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9452 // This entry crosses lanes, so there is no way to model this shuffle.
9453 return false;
9454
9455 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9456 // Adjust second vector indices to start at LaneSize instead of Size.
9457 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9458 : Mask[i] % LaneSize + LaneSize;
9459 if (RepeatedMask[i % LaneSize] < 0)
9460 // This is the first non-undef entry in this slot of a 128-bit lane.
9461 RepeatedMask[i % LaneSize] = LocalM;
9462 else if (RepeatedMask[i % LaneSize] != LocalM)
9463 // Found a mismatch with the repeated mask.
9464 return false;
9465 }
9466 return true;
9467}
9468
9469/// Test whether a shuffle mask is equivalent within each 128-bit lane.
9470static bool
9471is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9472 SmallVectorImpl<int> &RepeatedMask) {
9473 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9474}
9475
9476static bool
9477is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
9478 SmallVector<int, 32> RepeatedMask;
9479 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9480}
9481
9482/// Test whether a shuffle mask is equivalent within each 256-bit lane.
9483static bool
9484is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9485 SmallVectorImpl<int> &RepeatedMask) {
9486 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
9487}
9488
9489/// Test whether a target shuffle mask is equivalent within each sub-lane.
9490/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9491static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
9492 unsigned EltSizeInBits,
9493 ArrayRef<int> Mask,
9494 SmallVectorImpl<int> &RepeatedMask) {
9495 int LaneSize = LaneSizeInBits / EltSizeInBits;
9496 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
9497 int Size = Mask.size();
9498 for (int i = 0; i < Size; ++i) {
9499 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
9500 if (Mask[i] == SM_SentinelUndef)
9501 continue;
9502 if (Mask[i] == SM_SentinelZero) {
9503 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
9504 return false;
9505 RepeatedMask[i % LaneSize] = SM_SentinelZero;
9506 continue;
9507 }
9508 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9509 // This entry crosses lanes, so there is no way to model this shuffle.
9510 return false;
9511
9512 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9513 // later vector indices to start at multiples of LaneSize instead of Size.
9514 int LaneM = Mask[i] / Size;
9515 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
9516 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
9517 // This is the first non-undef entry in this slot of a 128-bit lane.
9518 RepeatedMask[i % LaneSize] = LocalM;
9519 else if (RepeatedMask[i % LaneSize] != LocalM)
9520 // Found a mismatch with the repeated mask.
9521 return false;
9522 }
9523 return true;
9524}
9525
9526/// Test whether a target shuffle mask is equivalent within each sub-lane.
9527/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9528static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
9529 ArrayRef<int> Mask,
9530 SmallVectorImpl<int> &RepeatedMask) {
9531 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
9532 Mask, RepeatedMask);
9533}
9534
9535/// Checks whether the vector elements referenced by two shuffle masks are
9536/// equivalent.
9537static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9538 int Idx, int ExpectedIdx) {
9539 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9540 ExpectedIdx < MaskSize && "Out of range element index");
9541 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9542 return false;
9543
9544 switch (Op.getOpcode()) {
9545 case ISD::BUILD_VECTOR:
9546 // If the values are build vectors, we can look through them to find
9547 // equivalent inputs that make the shuffles equivalent.
9548 // TODO: Handle MaskSize != Op.getNumOperands()?
9549 if (MaskSize == (int)Op.getNumOperands() &&
9550 MaskSize == (int)ExpectedOp.getNumOperands())
9551 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9552 break;
9553 case X86ISD::VBROADCAST:
9554 case X86ISD::VBROADCAST_LOAD:
9555 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
9556 return (Op == ExpectedOp &&
9557 (int)Op.getValueType().getVectorNumElements() == MaskSize);
9558 case X86ISD::HADD:
9559 case X86ISD::HSUB:
9560 case X86ISD::FHADD:
9561 case X86ISD::FHSUB:
9562 case X86ISD::PACKSS:
9563 case X86ISD::PACKUS:
9564 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9565 // TODO: Handle MaskSize != NumElts?
9566 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9567 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9568 MVT VT = Op.getSimpleValueType();
9569 int NumElts = VT.getVectorNumElements();
9570 if (MaskSize == NumElts) {
9571 int NumLanes = VT.getSizeInBits() / 128;
9572 int NumEltsPerLane = NumElts / NumLanes;
9573 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9574 bool SameLane =
9575 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9576 bool SameElt =
9577 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9578 return SameLane && SameElt;
9579 }
9580 }
9581 break;
9582 }
9583
9584 return false;
9585}
9586
9587/// Checks whether a shuffle mask is equivalent to an explicit list of
9588/// arguments.
9589///
9590/// This is a fast way to test a shuffle mask against a fixed pattern:
9591///
9592/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
9593///
9594/// It returns true if the mask is exactly as wide as the argument list, and
9595/// each element of the mask is either -1 (signifying undef) or the value given
9596/// in the argument.
9597static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
9598 SDValue V1 = SDValue(),
9599 SDValue V2 = SDValue()) {
9600 int Size = Mask.size();
9601 if (Size != (int)ExpectedMask.size())
9602 return false;
9603
9604 for (int i = 0; i < Size; ++i) {
9605 assert(Mask[i] >= -1 && "Out of bound mask element!");
9606 int MaskIdx = Mask[i];
9607 int ExpectedIdx = ExpectedMask[i];
9608 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
9609 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9610 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9611 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9612 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9613 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9614 return false;
9615 }
9616 }
9617 return true;
9618}
9619
9620/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9621///
9622/// The masks must be exactly the same width.
9623///
9624/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9625/// value in ExpectedMask is always accepted. Otherwise the indices must match.
9626///
9627/// SM_SentinelZero is accepted as a valid negative index but must match in
9628/// both, or via a known bits test.
9630 ArrayRef<int> ExpectedMask,
9631 const SelectionDAG &DAG,
9632 SDValue V1 = SDValue(),
9633 SDValue V2 = SDValue()) {
9634 int Size = Mask.size();
9635 if (Size != (int)ExpectedMask.size())
9636 return false;
9637 assert(llvm::all_of(ExpectedMask,
9638 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
9639 "Illegal target shuffle mask");
9640
9641 // Check for out-of-range target shuffle mask indices.
9642 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
9643 return false;
9644
9645 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
9646 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
9647 !V1.getValueType().isVector()))
9648 V1 = SDValue();
9649 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
9650 !V2.getValueType().isVector()))
9651 V2 = SDValue();
9652
9653 APInt ZeroV1 = APInt::getZero(Size);
9654 APInt ZeroV2 = APInt::getZero(Size);
9655
9656 for (int i = 0; i < Size; ++i) {
9657 int MaskIdx = Mask[i];
9658 int ExpectedIdx = ExpectedMask[i];
9659 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
9660 continue;
9661 if (MaskIdx == SM_SentinelZero) {
9662 // If we need this expected index to be a zero element, then update the
9663 // relevant zero mask and perform the known bits at the end to minimize
9664 // repeated computes.
9665 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9666 if (ExpectedV &&
9667 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
9668 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9669 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
9670 ZeroMask.setBit(BitIdx);
9671 continue;
9672 }
9673 }
9674 if (MaskIdx >= 0) {
9675 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9676 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9677 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9678 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9679 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9680 continue;
9681 }
9682 return false;
9683 }
9684 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
9685 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
9686}
9687
9688// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
9689// instructions.
9690static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
9691 const SelectionDAG &DAG) {
9692 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9693 return false;
9694
9695 SmallVector<int, 8> Unpcklwd;
9696 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9697 /* Unary = */ false);
9698 SmallVector<int, 8> Unpckhwd;
9699 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9700 /* Unary = */ false);
9701 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
9702 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
9703 return IsUnpackwdMask;
9704}
9705
9706static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
9707 const SelectionDAG &DAG) {
9708 // Create 128-bit vector type based on mask size.
9709 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
9710 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
9711
9712 // We can't assume a canonical shuffle mask, so try the commuted version too.
9713 SmallVector<int, 4> CommutedMask(Mask);
9714 ShuffleVectorSDNode::commuteMask(CommutedMask);
9715
9716 // Match any of unary/binary or low/high.
9717 for (unsigned i = 0; i != 4; ++i) {
9718 SmallVector<int, 16> UnpackMask;
9719 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
9720 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
9721 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
9722 return true;
9723 }
9724 return false;
9725}
9726
9727/// Return true if a shuffle mask chooses elements identically in its top and
9728/// bottom halves. For example, any splat mask has the same top and bottom
9729/// halves. If an element is undefined in only one half of the mask, the halves
9730/// are not considered identical.
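/// Editorial example (not upstream): <0, 3, 0, 3> and the splat <1, 1, 1, 1>
/// have identical halves; <0, -1, 0, 3> does not, because element 1 is undef
/// in only one half.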
9731static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
9732 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
9733 unsigned HalfSize = Mask.size() / 2;
9734 for (unsigned i = 0; i != HalfSize; ++i) {
9735 if (Mask[i] != Mask[i + HalfSize])
9736 return false;
9737 }
9738 return true;
9739}
9740
9741/// Get a 4-lane 8-bit shuffle immediate for a mask.
9742///
9743/// This helper function produces an 8-bit shuffle immediate corresponding to
9744/// the ubiquitous shuffle encoding scheme used in x86 instructions for
9745/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
9746/// example.
9747///
9748/// NB: We rely heavily on "undef" masks preserving the input lane.
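/// Editorial worked example (not upstream): mask <3, 1, 2, 0> encodes as
/// imm = 3 | (1 << 2) | (2 << 4) | (0 << 6) = 0b00100111 = 0x27, the same
/// immediate a pshufd/shufps would use for that 4-lane permutation.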
9749static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9750 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9751 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
9752 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
9753 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
9754 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
9755
9756 // If the mask only uses one non-undef element, then fully 'splat' it to
9757 // improve later broadcast matching.
9758 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
9759 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
9760
9761 int FirstElt = Mask[FirstIndex];
9762 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
9763 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
9764
9765 unsigned Imm = 0;
9766 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
9767 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
9768 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
9769 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
9770 return Imm;
9771}
9772
9773static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
9774 SelectionDAG &DAG) {
9775 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
9776}
9777
9778// The shuffle result has the form:
9779// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements are in ascending order.
9780// Each Zeroable element corresponds to a particular Mask element,
9781// as described in the computeZeroableShuffleElements function.
9782//
9783// The function looks for a sub-mask whose non-zero elements are in
9784// increasing order. If such a sub-mask exists, the function returns true.
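// Editorial example (not upstream): for v4i32 with Mask = <4, 1, 5, 3> and
// Zeroable = {0,1,0,1} (elements 1 and 3 known zero), the non-zero sources
// 4 and 5 are consecutive V2 elements in increasing order, so the shuffle
// can later be lowered to X86ISD::EXPAND of V2 with k-mask 0b0101, giving
// <V2[0], 0, V2[1], 0>.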
9785static bool isNonZeroElementsInOrder(const APInt &Zeroable,
9786 ArrayRef<int> Mask, const EVT &VectorType,
9787 bool &IsZeroSideLeft) {
9788 int NextElement = -1;
9789 // Check if the Mask's nonzero elements are in increasing order.
9790 for (int i = 0, e = Mask.size(); i < e; i++) {
9791 // Checks that the mask's zero elements are built from only zeros.
9792 assert(Mask[i] >= -1 && "Out of bound mask element!");
9793 if (Mask[i] < 0)
9794 return false;
9795 if (Zeroable[i])
9796 continue;
9797 // Find the lowest non-zero element.
9798 if (NextElement < 0) {
9799 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
9800 IsZeroSideLeft = NextElement != 0;
9801 }
9802 // Exit if the mask's non-zero elements are not in increasing order.
9803 if (NextElement != Mask[i])
9804 return false;
9805 NextElement++;
9806 }
9807 return true;
9808}
9809
9810/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
9811static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
9812 ArrayRef<int> Mask, SDValue V1,
9813 SDValue V2, const APInt &Zeroable,
9814 const X86Subtarget &Subtarget,
9815 SelectionDAG &DAG) {
9816 int Size = Mask.size();
9817 int LaneSize = 128 / VT.getScalarSizeInBits();
9818 const int NumBytes = VT.getSizeInBits() / 8;
9819 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
9820
9821 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
9822 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
9823 (Subtarget.hasBWI() && VT.is512BitVector()));
9824
9825 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
9826 // Sign bit set in i8 mask means zero element.
9827 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
9828
9829 SDValue V;
9830 for (int i = 0; i < NumBytes; ++i) {
9831 int M = Mask[i / NumEltBytes];
9832 if (M < 0) {
9833 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
9834 continue;
9835 }
9836 if (Zeroable[i / NumEltBytes]) {
9837 PSHUFBMask[i] = ZeroMask;
9838 continue;
9839 }
9840
9841 // We can only use a single input of V1 or V2.
9842 SDValue SrcV = (M >= Size ? V2 : V1);
9843 if (V && V != SrcV)
9844 return SDValue();
9845 V = SrcV;
9846 M %= Size;
9847
9848 // PSHUFB can't cross lanes, ensure this doesn't happen.
9849 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
9850 return SDValue();
9851
9852 M = M % LaneSize;
9853 M = M * NumEltBytes + (i % NumEltBytes);
9854 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
9855 }
9856 assert(V && "Failed to find a source input");
9857
9858 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
9859 return DAG.getBitcast(
9860 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
9861 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
9862}
9863
9864static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
9865 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9866 const SDLoc &dl);
9867
9868// X86 has dedicated shuffle that can be lowered to VEXPAND
9869static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
9870 const APInt &Zeroable,
9871 ArrayRef<int> Mask, SDValue &V1,
9872 SDValue &V2, SelectionDAG &DAG,
9873 const X86Subtarget &Subtarget) {
9874 bool IsLeftZeroSide = true;
9875 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
9876 IsLeftZeroSide))
9877 return SDValue();
9878 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
9879 MVT IntegerType =
9880 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9881 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
9882 unsigned NumElts = VT.getVectorNumElements();
9883 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
9884 "Unexpected number of vector elements");
9885 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
9886 Subtarget, DAG, DL);
9887 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
9888 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
9889 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
9890}
9891
9892static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
9893 unsigned &UnpackOpcode, bool IsUnary,
9894 ArrayRef<int> TargetMask, const SDLoc &DL,
9895 SelectionDAG &DAG,
9896 const X86Subtarget &Subtarget) {
9897 int NumElts = VT.getVectorNumElements();
9898
9899 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
9900 for (int i = 0; i != NumElts; i += 2) {
9901 int M1 = TargetMask[i + 0];
9902 int M2 = TargetMask[i + 1];
9903 Undef1 &= (SM_SentinelUndef == M1);
9904 Undef2 &= (SM_SentinelUndef == M2);
9905 Zero1 &= isUndefOrZero(M1);
9906 Zero2 &= isUndefOrZero(M2);
9907 }
9908 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
9909 "Zeroable shuffle detected");
9910
9911 // Attempt to match the target mask against the unpack lo/hi mask patterns.
9912 SmallVector<int, 64> Unpckl, Unpckh;
9913 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
9914 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
9915 (IsUnary ? V1 : V2))) {
9916 UnpackOpcode = X86ISD::UNPCKL;
9917 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9918 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9919 return true;
9920 }
9921
9922 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
9923 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
9924 (IsUnary ? V1 : V2))) {
9925 UnpackOpcode = X86ISD::UNPCKH;
9926 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9927 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9928 return true;
9929 }
9930
9931 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
9932 if (IsUnary && (Zero1 || Zero2)) {
9933 // Don't bother if we can blend instead.
9934 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
9935 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
9936 return false;
9937
9938 bool MatchLo = true, MatchHi = true;
9939 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
9940 int M = TargetMask[i];
9941
9942 // Ignore if the input is known to be zero or the index is undef.
9943 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
9944 (M == SM_SentinelUndef))
9945 continue;
9946
9947 MatchLo &= (M == Unpckl[i]);
9948 MatchHi &= (M == Unpckh[i]);
9949 }
9950
9951 if (MatchLo || MatchHi) {
9952 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
9953 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9954 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9955 return true;
9956 }
9957 }
9958
9959 // If a binary shuffle, commute and try again.
9960 if (!IsUnary) {
9961 ShuffleVectorSDNode::commuteMask(Unpckl);
9962 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
9963 UnpackOpcode = X86ISD::UNPCKL;
9964 std::swap(V1, V2);
9965 return true;
9966 }
9967
9968 ShuffleVectorSDNode::commuteMask(Unpckh);
9969 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
9970 UnpackOpcode = X86ISD::UNPCKH;
9971 std::swap(V1, V2);
9972 return true;
9973 }
9974 }
9975
9976 return false;
9977}
9978
9979// X86 has dedicated unpack instructions that can handle specific blend
9980// operations: UNPCKH and UNPCKL.
9981static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
9982 ArrayRef<int> Mask, SDValue V1, SDValue V2,
9983 SelectionDAG &DAG) {
9984 SmallVector<int, 8> Unpckl;
9985 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
9986 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
9987 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
9988
9989 SmallVector<int, 8> Unpckh;
9990 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
9991 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
9992 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
9993
9994 // Commute and try again.
9995 ShuffleVectorSDNode::commuteMask(Unpckl);
9996 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
9997 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
9998
9999 ShuffleVectorSDNode::commuteMask(Unpckh);
10000 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10001 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10002
10003 return SDValue();
10004}
10005
10006/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10007/// followed by unpack 256-bit.
10008static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
10009 ArrayRef<int> Mask, SDValue V1,
10010 SDValue V2, SelectionDAG &DAG) {
10011 SmallVector<int, 32> Unpckl, Unpckh;
10012 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10013 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10014
10015 unsigned UnpackOpcode;
10016 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10017 UnpackOpcode = X86ISD::UNPCKL;
10018 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10019 UnpackOpcode = X86ISD::UNPCKH;
10020 else
10021 return SDValue();
10022
10023 // This is a "natural" unpack operation (rather than the 128-bit sectored
10024 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10025 // input in order to use the x86 instruction.
10026 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10027 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10028 V1 = DAG.getBitcast(VT, V1);
10029 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10030}
10031
10032// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10033// source into the lower elements and zeroing the upper elements.
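// Editorial example (not upstream, assumes an AVX-512 VL+BW target): for
// VT = v16i8 with the low half of the mask being <0,2,4,...,14> and the
// upper eight elements zeroable, Scale = 2 matches, yielding SrcVT = v8i16
// and DstVT = v16i8, i.e. the X86ISD::VTRUNC form that truncates into the
// low 64 bits and zeroes the rest.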
10034static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10035 ArrayRef<int> Mask, const APInt &Zeroable,
10036 const X86Subtarget &Subtarget) {
10037 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10038 return false;
10039
10040 unsigned NumElts = Mask.size();
10041 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10042 unsigned MaxScale = 64 / EltSizeInBits;
10043
10044 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10045 unsigned SrcEltBits = EltSizeInBits * Scale;
10046 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10047 continue;
10048 unsigned NumSrcElts = NumElts / Scale;
10049 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10050 continue;
10051 unsigned UpperElts = NumElts - NumSrcElts;
10052 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10053 continue;
10054 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10055 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10056 DstVT = MVT::getIntegerVT(EltSizeInBits);
10057 if ((NumSrcElts * EltSizeInBits) >= 128) {
10058 // ISD::TRUNCATE
10059 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10060 } else {
10061 // X86ISD::VTRUNC
10062 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10063 }
10064 return true;
10065 }
10066
10067 return false;
10068}
10069
10070// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10071// element padding to the final DstVT.
10072static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10073 const X86Subtarget &Subtarget,
10074 SelectionDAG &DAG, bool ZeroUppers) {
10075 MVT SrcVT = Src.getSimpleValueType();
10076 MVT DstSVT = DstVT.getScalarType();
10077 unsigned NumDstElts = DstVT.getVectorNumElements();
10078 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10079 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10080
10081 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10082 return SDValue();
10083
10084 // Perform a direct ISD::TRUNCATE if possible.
10085 if (NumSrcElts == NumDstElts)
10086 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10087
10088 if (NumSrcElts > NumDstElts) {
10089 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10090 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10091 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10092 }
10093
10094 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10095 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10096 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10097 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10098 DstVT.getSizeInBits());
10099 }
10100
10101 // Non-VLX targets must truncate from a 512-bit type, so we need to
10102 // widen, truncate and then possibly extract the original subvector.
10103 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10104 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10105 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10106 }
10107
10108 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10109 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10110 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10111 if (DstVT != TruncVT)
10112 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10113 DstVT.getSizeInBits());
10114 return Trunc;
10115}
10116
10117// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10118//
10119// An example is the following:
10120//
10121// t0: ch = EntryToken
10122// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10123// t25: v4i32 = truncate t2
10124// t41: v8i16 = bitcast t25
10125// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10126// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10127// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10128// t18: v2i64 = bitcast t51
10129//
10130// One can just use a single vpmovdw instruction. Without avx512vl we need to
10131// use the zmm variant and extract the lower subvector, padding with zeroes.
10132// TODO: Merge with lowerShuffleAsVTRUNC.
10133static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10134 SDValue V2, ArrayRef<int> Mask,
10135 const APInt &Zeroable,
10136 const X86Subtarget &Subtarget,
10137 SelectionDAG &DAG) {
10138 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10139 if (!Subtarget.hasAVX512())
10140 return SDValue();
10141
10142 unsigned NumElts = VT.getVectorNumElements();
10143 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10144 unsigned MaxScale = 64 / EltSizeInBits;
10145 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10146 unsigned SrcEltBits = EltSizeInBits * Scale;
10147 unsigned NumSrcElts = NumElts / Scale;
10148 unsigned UpperElts = NumElts - NumSrcElts;
10149 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10150 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10151 continue;
10152
10153 // Attempt to find a matching source truncation, but as a fall back VLX
10154 // cases can use the VPMOV directly.
10155 SDValue Src = peekThroughBitcasts(V1);
10156 if (Src.getOpcode() == ISD::TRUNCATE &&
10157 Src.getScalarValueSizeInBits() == SrcEltBits) {
10158 Src = Src.getOperand(0);
10159 } else if (Subtarget.hasVLX()) {
10160 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10161 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10162 Src = DAG.getBitcast(SrcVT, Src);
10163 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10164 if (Scale == 2 &&
10165 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10166 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10167 return SDValue();
10168 } else
10169 return SDValue();
10170
10171 // VPMOVWB is only available with avx512bw.
10172 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10173 return SDValue();
10174
10175 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10176 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10177 }
10178
10179 return SDValue();
10180}
10181
10182// Attempt to match binary shuffle patterns as a truncate.
10183static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10184 SDValue V2, ArrayRef<int> Mask,
10185 const APInt &Zeroable,
10186 const X86Subtarget &Subtarget,
10187 SelectionDAG &DAG) {
10188 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10189 "Unexpected VTRUNC type");
10190 if (!Subtarget.hasAVX512())
10191 return SDValue();
10192
10193 unsigned NumElts = VT.getVectorNumElements();
10194 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10195 unsigned MaxScale = 64 / EltSizeInBits;
10196 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10197 // TODO: Support non-BWI VPMOVWB truncations?
10198 unsigned SrcEltBits = EltSizeInBits * Scale;
10199 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10200 continue;
10201
10202 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10203 // Bail if the V2 elements are undef.
10204 unsigned NumHalfSrcElts = NumElts / Scale;
10205 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10206 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10207 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10208 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10209 continue;
10210
10211 // The elements beyond the truncation must be undef/zero.
10212 unsigned UpperElts = NumElts - NumSrcElts;
10213 if (UpperElts > 0 &&
10214 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10215 continue;
10216 bool UndefUppers =
10217 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10218
10219 // For offset truncations, ensure that the concat is cheap.
10220 if (Offset) {
10221 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
10222 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
10223 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
10224 return Lo.getOperand(0) == Hi.getOperand(0);
10225 if (ISD::isNormalLoad(Lo.getNode()) &&
10226 ISD::isNormalLoad(Hi.getNode())) {
10227 auto *LDLo = cast<LoadSDNode>(Lo);
10228 auto *LDHi = cast<LoadSDNode>(Hi);
10229 return DAG.areNonVolatileConsecutiveLoads(
10230 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
10231 }
10232 return false;
10233 };
10234 if (!IsCheapConcat(V1, V2))
10235 continue;
10236 }
10237
10238 // As we're using both sources, we need to concat them together
10239 // and truncate from the double-sized src.
10240 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
10241 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10242
10243 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10244 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10245 Src = DAG.getBitcast(SrcVT, Src);
10246
10247 // Shift the offset'd elements into place for the truncation.
10248 // TODO: Use getTargetVShiftByConstNode.
10249 if (Offset)
10250 Src = DAG.getNode(
10251 X86ISD::VSRLI, DL, SrcVT, Src,
10252 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10253
10254 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10255 }
10256 }
10257
10258 return SDValue();
10259}
10260
10261/// Check whether a compaction lowering can be done by dropping even/odd
10262/// elements and compute how many times even/odd elements must be dropped.
10263///
10264/// This handles shuffles which take every Nth element where N is a power of
10265/// two. Example shuffle masks:
10266///
10267/// (even)
10268/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10269/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10270/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10271/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10272/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10273/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10274///
10275/// (odd)
10276/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10277/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10278///
10279/// Any of these lanes can of course be undef.
10280///
10281/// This routine only supports N <= 3.
10282/// FIXME: Evaluate whether either AVX or AVX-512 has any opportunities here
10283/// for larger N.
10284///
10285/// \returns N above, or the number of times even/odd elements must be dropped
10286/// if there is such a number. Otherwise returns zero.
10287static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10288 bool IsSingleInput) {
10289 // The modulus for the shuffle vector entries is based on whether this is
10290 // a single input or not.
10291 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10292 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10293 "We should only be called with masks with a power-of-2 size!");
10294
10295 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10296 int Offset = MatchEven ? 0 : 1;
10297
10298 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10299 // and 2^3 simultaneously. This is because we may have ambiguity with
10300 // partially undef inputs.
10301 bool ViableForN[3] = {true, true, true};
10302
10303 for (int i = 0, e = Mask.size(); i < e; ++i) {
10304 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10305 // want.
10306 if (Mask[i] < 0)
10307 continue;
10308
10309 bool IsAnyViable = false;
10310 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10311 if (ViableForN[j]) {
10312 uint64_t N = j + 1;
10313
10314 // The shuffle mask must be equal to (i * 2^N) % M.
10315 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10316 IsAnyViable = true;
10317 else
10318 ViableForN[j] = false;
10319 }
10320 // Early exit if we exhaust the possible powers of two.
10321 if (!IsAnyViable)
10322 break;
10323 }
10324
10325 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10326 if (ViableForN[j])
10327 return j + 1;
10328
10329 // Return 0 as there is no viable power of two.
10330 return 0;
10331}
10332
10333// X86 has dedicated pack instructions that can handle specific truncation
10334// operations: PACKSS and PACKUS.
10335// Checks for compaction shuffle masks if MaxStages > 1.
10336// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10337static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10338 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10339 const SelectionDAG &DAG,
10340 const X86Subtarget &Subtarget,
10341 unsigned MaxStages = 1) {
10342 unsigned NumElts = VT.getVectorNumElements();
10343 unsigned BitSize = VT.getScalarSizeInBits();
10344 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10345 "Illegal maximum compaction");
10346
10347 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10348 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10349 unsigned NumPackedBits = NumSrcBits - BitSize;
10350 N1 = peekThroughBitcasts(N1);
10351 N2 = peekThroughBitcasts(N2);
10352 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10353 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10354 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10355 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10356 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10357 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10358 return false;
10359 if (Subtarget.hasSSE41() || BitSize == 8) {
10360 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10361 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10362 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10363 V1 = N1;
10364 V2 = N2;
10365 SrcVT = PackVT;
10366 PackOpcode = X86ISD::PACKUS;
10367 return true;
10368 }
10369 }
10370 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10371 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10372 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10373 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10374 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10375 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10376 V1 = N1;
10377 V2 = N2;
10378 SrcVT = PackVT;
10379 PackOpcode = X86ISD::PACKSS;
10380 return true;
10381 }
10382 return false;
10383 };
10384
10385 // Attempt to match against wider and wider compaction patterns.
10386 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10387 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10388 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10389
10390 // Try binary shuffle.
10391 SmallVector<int, 32> BinaryMask;
10392 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10393 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10394 if (MatchPACK(V1, V2, PackVT))
10395 return true;
10396
10397 // Try unary shuffle.
10398 SmallVector<int, 32> UnaryMask;
10399 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10400 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10401 if (MatchPACK(V1, V1, PackVT))
10402 return true;
10403 }
10404
10405 return false;
10406}
10407
10408static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
10409 SDValue V1, SDValue V2, SelectionDAG &DAG,
10410 const X86Subtarget &Subtarget) {
10411 MVT PackVT;
10412 unsigned PackOpcode;
10413 unsigned SizeBits = VT.getSizeInBits();
10414 unsigned EltBits = VT.getScalarSizeInBits();
10415 unsigned MaxStages = Log2_32(64 / EltBits);
10416 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10417 Subtarget, MaxStages))
10418 return SDValue();
10419
10420 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10421 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10422
10423 // Don't lower multi-stage packs on AVX512, truncation is better.
10424 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10425 return SDValue();
10426
10427 // Pack to the largest type possible:
10428 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10429 unsigned MaxPackBits = 16;
10430 if (CurrentEltBits > 16 &&
10431 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10432 MaxPackBits = 32;
10433
10434 // Repeatedly pack down to the target size.
10435 SDValue Res;
10436 for (unsigned i = 0; i != NumStages; ++i) {
10437 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10438 unsigned NumSrcElts = SizeBits / SrcEltBits;
10439 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10440 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10441 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10442 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10443 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10444 DAG.getBitcast(SrcVT, V2));
10445 V1 = V2 = Res;
10446 CurrentEltBits /= 2;
10447 }
10448 assert(Res && Res.getValueType() == VT &&
10449 "Failed to lower compaction shuffle");
10450 return Res;
10451}
10452
10453/// Try to emit a bitmask instruction for a shuffle.
10454///
10455/// This handles cases where we can model a blend exactly as a bitmask due to
10456/// one of the inputs being zeroable.
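/// For example, a v4i32 shuffle <0, zz, 2, zz> of a single input can be
/// lowered to AND(V1, <-1, 0, -1, 0>), passing lanes 0 and 2 through and
/// zeroing the rest.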
10457static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10458 SDValue V2, ArrayRef<int> Mask,
10459 const APInt &Zeroable,
10460 const X86Subtarget &Subtarget,
10461 SelectionDAG &DAG) {
10462 MVT MaskVT = VT;
10463 MVT EltVT = VT.getVectorElementType();
10464 SDValue Zero, AllOnes;
10465 // Use f64 if i64 isn't legal.
10466 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10467 EltVT = MVT::f64;
10468 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10469 }
10470
10471 MVT LogicVT = VT;
10472 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10473 Zero = DAG.getConstantFP(0.0, DL, EltVT);
10474 APFloat AllOnesValue =
10475 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
10476 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
10477 LogicVT =
10478 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
10479 } else {
10480 Zero = DAG.getConstant(0, DL, EltVT);
10481 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10482 }
10483
10484 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
10485 SDValue V;
10486 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10487 if (Zeroable[i])
10488 continue;
10489 if (Mask[i] % Size != i)
10490 return SDValue(); // Not a blend.
10491 if (!V)
10492 V = Mask[i] < Size ? V1 : V2;
10493 else if (V != (Mask[i] < Size ? V1 : V2))
10494 return SDValue(); // Can only let one input through the mask.
10495
10496 VMaskOps[i] = AllOnes;
10497 }
10498 if (!V)
10499 return SDValue(); // No non-zeroable elements!
10500
10501 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
10502 VMask = DAG.getBitcast(LogicVT, VMask);
10503 V = DAG.getBitcast(LogicVT, V);
10504 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
10505 return DAG.getBitcast(VT, And);
10506}
10507
10508/// Try to emit a blend instruction for a shuffle using bit math.
10509///
10510/// This is used as a fallback approach when first class blend instructions are
10511/// unavailable. Currently it is only suitable for integer vectors, but could
10512/// be generalized for floating point vectors if desirable.
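/// For example, the v4i32 shuffle <0, 5, 2, 7> becomes a bit-select with the
/// mask <-1, 0, -1, 0>, taking lanes 0 and 2 from V1 and lanes 1 and 3 from V2.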
10513static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
10514 SDValue V2, ArrayRef<int> Mask,
10515 SelectionDAG &DAG) {
10516 assert(VT.isInteger() && "Only supports integer vector types!");
10517 MVT EltVT = VT.getVectorElementType();
10518 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10519 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10520 SmallVector<SDValue, 16> MaskOps;
10521 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10522 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
10523 return SDValue(); // Shuffled input!
10524 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
10525 }
10526
10527 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
10528 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
10529}
10530
10531static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
10532 SDValue PreservedSrc,
10533 const X86Subtarget &Subtarget,
10534 SelectionDAG &DAG);
10535
10536static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
10537 MutableArrayRef<int> Mask,
10538 const APInt &Zeroable, bool &ForceV1Zero,
10539 bool &ForceV2Zero, uint64_t &BlendMask) {
10540 bool V1IsZeroOrUndef =
10542 bool V2IsZeroOrUndef =
10543 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
10544
10545 BlendMask = 0;
10546 ForceV1Zero = false, ForceV2Zero = false;
10547 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
10548
10549 int NumElts = Mask.size();
10550 int NumLanes = VT.getSizeInBits() / 128;
10551 int NumEltsPerLane = NumElts / NumLanes;
10552 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
10553
10554 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10555 // then ensure the blend mask part for that lane just references that input.
10556 bool ForceWholeLaneMasks =
10557 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
10558
10559 // Attempt to generate the binary blend mask. If an input is zero then
10560 // we can use any lane.
10561 for (int Lane = 0; Lane != NumLanes; ++Lane) {
10562 // Keep track of the inputs used per lane.
10563 bool LaneV1InUse = false;
10564 bool LaneV2InUse = false;
10565 uint64_t LaneBlendMask = 0;
10566 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
10567 int Elt = (Lane * NumEltsPerLane) + LaneElt;
10568 int M = Mask[Elt];
10569 if (M == SM_SentinelUndef)
10570 continue;
10571 if (M == Elt || (0 <= M && M < NumElts &&
10572 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
10573 Mask[Elt] = Elt;
10574 LaneV1InUse = true;
10575 continue;
10576 }
10577 if (M == (Elt + NumElts) ||
10578 (NumElts <= M &&
10579 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10580 LaneBlendMask |= 1ull << LaneElt;
10581 Mask[Elt] = Elt + NumElts;
10582 LaneV2InUse = true;
10583 continue;
10584 }
10585 if (Zeroable[Elt]) {
10586 if (V1IsZeroOrUndef) {
10587 ForceV1Zero = true;
10588 Mask[Elt] = Elt;
10589 LaneV1InUse = true;
10590 continue;
10591 }
10592 if (V2IsZeroOrUndef) {
10593 ForceV2Zero = true;
10594 LaneBlendMask |= 1ull << LaneElt;
10595 Mask[Elt] = Elt + NumElts;
10596 LaneV2InUse = true;
10597 continue;
10598 }
10599 }
10600 return false;
10601 }
10602
10603 // If we only used V2 then splat the lane blend mask to avoid any demanded
10604 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
10605 // blend mask bit).
10606 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
10607 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10608
10609 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
10610 }
10611 return true;
10612}
10613
10614/// Try to emit a blend instruction for a shuffle.
10615///
10616/// This doesn't do any checks for the availability of instructions for blending
10617/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
10618/// be matched in the backend with the type given. What it does check for is
10619/// that the shuffle mask is a blend, or convertible into a blend with zero.
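/// For example, the v4i32 shuffle <0, 5, 2, 7> takes lanes 0 and 2 from V1 and
/// lanes 1 and 3 from V2, giving an immediate blend mask of 0b1010.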
10620static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
10621 SDValue V2, ArrayRef<int> Original,
10622 const APInt &Zeroable,
10623 const X86Subtarget &Subtarget,
10624 SelectionDAG &DAG) {
10625 uint64_t BlendMask = 0;
10626 bool ForceV1Zero = false, ForceV2Zero = false;
10627 SmallVector<int, 64> Mask(Original);
10628 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
10629 BlendMask))
10630 return SDValue();
10631
10632 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10633 if (ForceV1Zero)
10634 V1 = getZeroVector(VT, Subtarget, DAG, DL);
10635 if (ForceV2Zero)
10636 V2 = getZeroVector(VT, Subtarget, DAG, DL);
10637
10638 unsigned NumElts = VT.getVectorNumElements();
10639
10640 switch (VT.SimpleTy) {
10641 case MVT::v4i64:
10642 case MVT::v8i32:
10643 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10644 [[fallthrough]];
10645 case MVT::v4f64:
10646 case MVT::v8f32:
10647 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10648 [[fallthrough]];
10649 case MVT::v2f64:
10650 case MVT::v2i64:
10651 case MVT::v4f32:
10652 case MVT::v4i32:
10653 case MVT::v8i16:
10654 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10655 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
10656 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10657 case MVT::v16i16: {
10658 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
10659 SmallVector<int, 8> RepeatedMask;
10660 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
10661 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10662 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
10663 BlendMask = 0;
10664 for (int i = 0; i < 8; ++i)
10665 if (RepeatedMask[i] >= 8)
10666 BlendMask |= 1ull << i;
10667 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10668 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10669 }
10670 // Use PBLENDW for lower/upper lanes and then blend lanes.
10671 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10672 // merge to VSELECT where useful.
10673 uint64_t LoMask = BlendMask & 0xFF;
10674 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
10675 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
10676 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10677 DAG.getTargetConstant(LoMask, DL, MVT::i8));
10678 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10679 DAG.getTargetConstant(HiMask, DL, MVT::i8));
10680 return DAG.getVectorShuffle(
10681 MVT::v16i16, DL, Lo, Hi,
10682 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
10683 }
10684 [[fallthrough]];
10685 }
10686 case MVT::v32i8:
10687 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10688 [[fallthrough]];
10689 case MVT::v16i8: {
10690 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
10691
10692 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
10693 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10694 Subtarget, DAG))
10695 return Masked;
10696
10697 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
10698 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10699 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10700 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10701 }
10702
10703 // If we have VPTERNLOG, we can use that as a bit blend.
10704 if (Subtarget.hasVLX())
10705 if (SDValue BitBlend =
10706 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
10707 return BitBlend;
10708
10709 // Scale the blend by the number of bytes per element.
10710 int Scale = VT.getScalarSizeInBits() / 8;
10711
10712 // This form of blend is always done on bytes. Compute the byte vector
10713 // type.
10714 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10715
10716 // x86 allows load folding with blendvb from the 2nd source operand. But
10717 // we are still using LLVM select here (see comment below), so that's V1.
10718 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
10719 // allow that load-folding possibility.
10720 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
10721 ShuffleVectorSDNode::commuteMask(Mask);
10722 std::swap(V1, V2);
10723 }
10724
10725 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
10726 // mix of LLVM's code generator and the x86 backend. We tell the code
10727 // generator that boolean values in the elements of an x86 vector register
10728 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
10729 // mapping a select to operand #1, and 'false' mapping to operand #2. The
10730 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
10731 // of the element (the remaining are ignored) and 0 in that high bit would
10732 // mean operand #1 while 1 in the high bit would mean operand #2. So while
10733 // the LLVM model for boolean values in vector elements gets the relevant
10734 // bit set, it is set backwards and over constrained relative to x86's
10735 // actual model.
10736 SmallVector<SDValue, 32> VSELECTMask;
10737 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10738 for (int j = 0; j < Scale; ++j)
10739 VSELECTMask.push_back(
10740 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
10741 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
10742 MVT::i8));
10743
10744 V1 = DAG.getBitcast(BlendVT, V1);
10745 V2 = DAG.getBitcast(BlendVT, V2);
10746 return DAG.getBitcast(
10747 VT,
10748 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
10749 V1, V2));
10750 }
10751 case MVT::v16f32:
10752 case MVT::v8f64:
10753 case MVT::v8i64:
10754 case MVT::v16i32:
10755 case MVT::v32i16:
10756 case MVT::v64i8: {
10757 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
10758 bool OptForSize = DAG.shouldOptForSize();
10759 if (!OptForSize) {
10760 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10761 Subtarget, DAG))
10762 return Masked;
10763 }
10764
10765 // Otherwise load an immediate into a GPR, cast to k-register, and use a
10766 // masked move.
10767 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10768 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10769 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10770 }
10771 default:
10772 llvm_unreachable("Not a supported integer vector type!");
10773 }
10774}
10775
10776/// Try to lower as a blend of elements from two inputs followed by
10777/// a single-input permutation.
10778///
10779/// This matches the pattern where we can blend elements from two inputs and
10780/// then reduce the shuffle to a single-input permutation.
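/// For example, the v4i32 shuffle <1, 4, 3, 6> can be rewritten as the blend
/// <4, 1, 6, 3> followed by the single-input permute <1, 0, 3, 2>.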
10781static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
10782 SDValue V1, SDValue V2,
10783 ArrayRef<int> Mask,
10784 SelectionDAG &DAG,
10785 bool ImmBlends = false) {
10786 // We build up the blend mask while checking whether a blend is a viable way
10787 // to reduce the shuffle.
10788 SmallVector<int, 32> BlendMask(Mask.size(), -1);
10789 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
10790
10791 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10792 if (Mask[i] < 0)
10793 continue;
10794
10795 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
10796
10797 if (BlendMask[Mask[i] % Size] < 0)
10798 BlendMask[Mask[i] % Size] = Mask[i];
10799 else if (BlendMask[Mask[i] % Size] != Mask[i])
10800 return SDValue(); // Can't blend in the needed input!
10801
10802 PermuteMask[i] = Mask[i] % Size;
10803 }
10804
10805 // If only immediate blends, then bail if the blend mask can't be widened to
10806 // i16.
10807 unsigned EltSize = VT.getScalarSizeInBits();
10808 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
10809 return SDValue();
10810
10811 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
10812 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
10813}
10814
10815/// Try to lower as an unpack of elements from two inputs followed by
10816/// a single-input permutation.
10817///
10818/// This matches the pattern where we can unpack elements from two inputs and
10819/// then reduce the shuffle to a single-input (wider) permutation.
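/// For example, the v4i32 shuffle <1, 5, 0, 4> is UNPCKL(V1, V2), which
/// produces <0, 4, 1, 5>, followed by the single-input permute <2, 3, 0, 1>.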
10820static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
10821 SDValue V1, SDValue V2,
10822 ArrayRef<int> Mask,
10823 SelectionDAG &DAG) {
10824 int NumElts = Mask.size();
10825 int NumLanes = VT.getSizeInBits() / 128;
10826 int NumLaneElts = NumElts / NumLanes;
10827 int NumHalfLaneElts = NumLaneElts / 2;
10828
10829 bool MatchLo = true, MatchHi = true;
10830 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
10831
10832 // Determine UNPCKL/UNPCKH type and operand order.
10833 for (int Elt = 0; Elt != NumElts; ++Elt) {
10834 int M = Mask[Elt];
10835 if (M < 0)
10836 continue;
10837
10838 // Normalize the mask value depending on whether it's V1 or V2.
10839 int NormM = M;
10840 SDValue &Op = Ops[Elt & 1];
10841 if (M < NumElts && (Op.isUndef() || Op == V1))
10842 Op = V1;
10843 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
10844 Op = V2;
10845 NormM -= NumElts;
10846 } else
10847 return SDValue();
10848
10849 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
10850 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
10851 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
10852 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
10853 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
10854 if (MatchLoAnyLane || MatchHiAnyLane) {
10855 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
10856 "Failed to match UNPCKLO/UNPCKHI");
10857 break;
10858 }
10859 }
10860 MatchLo &= MatchLoAnyLane;
10861 MatchHi &= MatchHiAnyLane;
10862 if (!MatchLo && !MatchHi)
10863 return SDValue();
10864 }
10865 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
10866
10867 // Element indices have changed after unpacking. Calculate permute mask
10868 // so that they will be put back to the position as dictated by the
10869 // original shuffle mask indices.
10870 SmallVector<int, 32> PermuteMask(NumElts, -1);
10871 for (int Elt = 0; Elt != NumElts; ++Elt) {
10872 int M = Mask[Elt];
10873 if (M < 0)
10874 continue;
10875 int NormM = M;
10876 if (NumElts <= M)
10877 NormM -= NumElts;
10878 bool IsFirstOp = M < NumElts;
10879 int BaseMaskElt =
10880 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
10881 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
10882 PermuteMask[Elt] = BaseMaskElt;
10883 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
10884 PermuteMask[Elt] = BaseMaskElt + 1;
10885 assert(PermuteMask[Elt] != -1 &&
10886 "Input mask element is defined but failed to assign permute mask");
10887 }
10888
10889 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10890 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
10891 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
10892}
10893
10894/// Try to lower a shuffle as a permute of the inputs followed by an
10895/// UNPCK instruction.
10896///
10897/// This specifically targets cases where we end up with alternating between
10898/// the two inputs, and so can permute them into something that feeds a single
10899/// UNPCK instruction. Note that this routine only targets integer vectors
10900/// because for floating point vectors we have a generalized SHUFPS lowering
10901/// strategy that handles everything that doesn't *exactly* match an unpack,
10902/// making this clever lowering unnecessary.
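/// For example, the v4i32 shuffle <3, 7, 1, 5> can be lowered by permuting
/// each input to <3, 1, -1, -1> and then taking UNPCKL of the two results.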
10903static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10904 SDValue V1, SDValue V2,
10905 ArrayRef<int> Mask,
10906 const X86Subtarget &Subtarget,
10907 SelectionDAG &DAG) {
10908 int Size = Mask.size();
10909 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10910
10911 // This routine only supports 128-bit integer dual input vectors.
10912 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
10913 return SDValue();
10914
10915 int NumLoInputs =
10916 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10917 int NumHiInputs =
10918 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10919
10920 bool UnpackLo = NumLoInputs >= NumHiInputs;
10921
10922 auto TryUnpack = [&](int ScalarSize, int Scale) {
10923 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10924 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10925
10926 for (int i = 0; i < Size; ++i) {
10927 if (Mask[i] < 0)
10928 continue;
10929
10930 // Each element of the unpack contains Scale elements from this mask.
10931 int UnpackIdx = i / Scale;
10932
10933 // We only handle the case where V1 feeds the first slots of the unpack.
10934 // We rely on canonicalization to ensure this is the case.
10935 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10936 return SDValue();
10937
10938 // Setup the mask for this input. The indexing is tricky as we have to
10939 // handle the unpack stride.
10940 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10941 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10942 Mask[i] % Size;
10943 }
10944
10945 // If we will have to shuffle both inputs to use the unpack, check whether
10946 // we can just unpack first and shuffle the result. If so, skip this unpack.
10947 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10948 !isNoopShuffleMask(V2Mask))
10949 return SDValue();
10950
10951 // Shuffle the inputs into place.
10952 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10953 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10954
10955 // Cast the inputs to the type we will use to unpack them.
10956 MVT UnpackVT =
10957 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10958 V1 = DAG.getBitcast(UnpackVT, V1);
10959 V2 = DAG.getBitcast(UnpackVT, V2);
10960
10961 // Unpack the inputs and cast the result back to the desired type.
10962 return DAG.getBitcast(
10963 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10964 UnpackVT, V1, V2));
10965 };
10966
10967 // We try each unpack from the largest to the smallest to try and find one
10968 // that fits this mask.
10969 int OrigScalarSize = VT.getScalarSizeInBits();
10970 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10971 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10972 return Unpack;
10973
10974 // If we're shuffling with a zero vector then we're better off not doing
10975 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
10976 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
10977 ISD::isBuildVectorAllZeros(V2.getNode()))
10978 return SDValue();
10979
10980 // If none of the unpack-rooted lowerings worked (or were profitable) try an
10981 // initial unpack.
10982 if (NumLoInputs == 0 || NumHiInputs == 0) {
10983 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10984 "We have to have *some* inputs!");
10985 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10986
10987 // FIXME: We could consider the total complexity of the permute of each
10988 // possible unpacking. Or at the least we should consider how many
10989 // half-crossings are created.
10990 // FIXME: We could consider commuting the unpacks.
10991
10992 SmallVector<int, 32> PermMask((unsigned)Size, -1);
10993 for (int i = 0; i < Size; ++i) {
10994 if (Mask[i] < 0)
10995 continue;
10996
10997 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10998
10999 PermMask[i] =
11000 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11001 }
11002 return DAG.getVectorShuffle(
11003 VT, DL,
11004 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11005 V1, V2),
11006 DAG.getUNDEF(VT), PermMask);
11007 }
11008
11009 return SDValue();
11010}
11011
11012/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11013/// permuting the elements of the result in place.
11014static SDValue lowerShuffleAsByteRotateAndPermute(
11015 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11016 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11017 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11018 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11019 (VT.is512BitVector() && !Subtarget.hasBWI()))
11020 return SDValue();
11021
11022 // We don't currently support lane crossing permutes.
11023 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11024 return SDValue();
11025
11026 int Scale = VT.getScalarSizeInBits() / 8;
11027 int NumLanes = VT.getSizeInBits() / 128;
11028 int NumElts = VT.getVectorNumElements();
11029 int NumEltsPerLane = NumElts / NumLanes;
11030
11031 // Determine range of mask elts.
11032 bool Blend1 = true;
11033 bool Blend2 = true;
11034 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11035 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11036 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11037 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11038 int M = Mask[Lane + Elt];
11039 if (M < 0)
11040 continue;
11041 if (M < NumElts) {
11042 Blend1 &= (M == (Lane + Elt));
11043 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11044 M = M % NumEltsPerLane;
11045 Range1.first = std::min(Range1.first, M);
11046 Range1.second = std::max(Range1.second, M);
11047 } else {
11048 M -= NumElts;
11049 Blend2 &= (M == (Lane + Elt));
11050 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11051 M = M % NumEltsPerLane;
11052 Range2.first = std::min(Range2.first, M);
11053 Range2.second = std::max(Range2.second, M);
11054 }
11055 }
11056 }
11057
11058 // Bail if we don't need both elements.
11059 // TODO - it might be worth doing this for unary shuffles if the permute
11060 // can be widened.
11061 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11062 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11063 return SDValue();
11064
11065 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11066 return SDValue();
11067
11068 // Rotate the 2 ops so we can access both ranges, then permute the result.
11069 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11070 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11071 SDValue Rotate = DAG.getBitcast(
11072 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11073 DAG.getBitcast(ByteVT, Lo),
11074 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11075 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11076 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11077 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11078 int M = Mask[Lane + Elt];
11079 if (M < 0)
11080 continue;
11081 if (M < NumElts)
11082 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11083 else
11084 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11085 }
11086 }
11087 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11088 };
11089
11090 // Check if the ranges are small enough to rotate from either direction.
11091 if (Range2.second < Range1.first)
11092 return RotateAndPermute(V1, V2, Range1.first, 0);
11093 if (Range1.second < Range2.first)
11094 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11095 return SDValue();
11096}
11097
11098static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11099 return isUndefOrEqual(Mask, 0);
11100}
11101
11102static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11103 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11104}
11105
11106/// Check if the Mask consists of the same element repeated multiple times.
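/// For example, <3, -1, 3, 3> qualifies, while <3, -1, -1, -1> does not since
/// more than half of its elements are undef.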
11107static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11108 size_t NumUndefs = 0;
11109 std::optional<int> UniqueElt;
11110 for (int Elt : Mask) {
11111 if (Elt == SM_SentinelUndef) {
11112 NumUndefs++;
11113 continue;
11114 }
11115 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11116 return false;
11117 UniqueElt = Elt;
11118 }
11119 // Make sure the element is repeated enough times by checking the number of
11120 // undefs is small.
11121 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11122}
11123
11124/// Generic routine to decompose a shuffle and blend into independent
11125/// blends and permutes.
11126///
11127/// This matches the extremely common pattern for handling combined
11128/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11129/// operations. It will try to pick the best arrangement of shuffles and
11130/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
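/// For example, the v4i32 shuffle <1, 7, 2, 4> decomposes into the per-input
/// permutes <1, -1, 2, -1> and <-1, 3, -1, 0> followed by the blend
/// <0, 5, 2, 7>.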
11131static SDValue lowerShuffleAsDecomposedShuffleMerge(
11132 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11133 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11134 int NumElts = Mask.size();
11135 int NumLanes = VT.getSizeInBits() / 128;
11136 int NumEltsPerLane = NumElts / NumLanes;
11137
11138 // Shuffle the input elements into the desired positions in V1 and V2 and
11139 // unpack/blend them together.
11140 bool IsAlternating = true;
11141 SmallVector<int, 32> V1Mask(NumElts, -1);
11142 SmallVector<int, 32> V2Mask(NumElts, -1);
11143 SmallVector<int, 32> FinalMask(NumElts, -1);
11144 for (int i = 0; i < NumElts; ++i) {
11145 int M = Mask[i];
11146 if (M >= 0 && M < NumElts) {
11147 V1Mask[i] = M;
11148 FinalMask[i] = i;
11149 IsAlternating &= (i & 1) == 0;
11150 } else if (M >= NumElts) {
11151 V2Mask[i] = M - NumElts;
11152 FinalMask[i] = i + NumElts;
11153 IsAlternating &= (i & 1) == 1;
11154 }
11155 }
11156
11157 // If we effectively only demand the 0'th element of \p Input (though not
11158 // necessarily at the 0'th position), then broadcast said input,
11159 // and change \p InputMask to be a no-op (identity) mask.
11160 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11161 &DAG](SDValue &Input,
11162 MutableArrayRef<int> InputMask) {
11163 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11164 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11165 !X86::mayFoldLoad(Input, Subtarget)))
11166 return;
11167 if (isNoopShuffleMask(InputMask))
11168 return;
11169 assert(isBroadcastShuffleMask(InputMask) &&
11170 "Expected to demand only the 0'th element.");
11171 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11172 for (auto I : enumerate(InputMask)) {
11173 int &InputMaskElt = I.value();
11174 if (InputMaskElt >= 0)
11175 InputMaskElt = I.index();
11176 }
11177 };
11178
11179 // Currently, we may need to produce one shuffle per input, and blend results.
11180 // It is possible that the shuffle for one of the inputs is already a no-op.
11181 // See if we can simplify non-no-op shuffles into broadcasts,
11182 // which we consider to be strictly better than an arbitrary shuffle.
11183 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11184 isNoopOrBroadcastShuffleMask(V2Mask)) {
11185 canonicalizeBroadcastableInput(V1, V1Mask);
11186 canonicalizeBroadcastableInput(V2, V2Mask);
11187 }
11188
11189 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11190 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11191 // the shuffle may be able to fold with a load or other benefit. However, when
11192 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11193 // pre-shuffle first is a better strategy.
11194 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11195 // Only prefer immediate blends to unpack/rotate.
11196 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11197 DAG, true))
11198 return BlendPerm;
11199 // If either input vector provides only a single element which is repeated
11200 // multiple times, unpacking from both input vectors would generate worse
11201 // code. e.g. for
11202 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11203 // it is better to process t4 first to create a vector of t4[0], then unpack
11204 // that vector with t2.
11205 if (!isSingleElementRepeatedMask(V1Mask) &&
11206 !isSingleElementRepeatedMask(V2Mask))
11207 if (SDValue UnpackPerm =
11208 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11209 return UnpackPerm;
11210 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11211 DL, VT, V1, V2, Mask, Subtarget, DAG))
11212 return RotatePerm;
11213 // Unpack/rotate failed - try again with variable blends.
11214 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11215 DAG))
11216 return BlendPerm;
11217 if (VT.getScalarSizeInBits() >= 32)
11218 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11219 DL, VT, V1, V2, Mask, Subtarget, DAG))
11220 return PermUnpack;
11221 }
11222
11223 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11224 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11225 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11226 // than half the elements coming from each source.
11227 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11228 V1Mask.assign(NumElts, -1);
11229 V2Mask.assign(NumElts, -1);
11230 FinalMask.assign(NumElts, -1);
11231 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11232 for (int j = 0; j != NumEltsPerLane; ++j) {
11233 int M = Mask[i + j];
11234 if (M >= 0 && M < NumElts) {
11235 V1Mask[i + (j / 2)] = M;
11236 FinalMask[i + j] = i + (j / 2);
11237 } else if (M >= NumElts) {
11238 V2Mask[i + (j / 2)] = M - NumElts;
11239 FinalMask[i + j] = i + (j / 2) + NumElts;
11240 }
11241 }
11242 }
11243
11244 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11245 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11246 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11247}
11248
11249static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11250 const X86Subtarget &Subtarget,
11251 ArrayRef<int> Mask) {
11252 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11253 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11254
11255 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11256 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11257 int MaxSubElts = 64 / EltSizeInBits;
11258 unsigned RotateAmt, NumSubElts;
11259 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11260 MaxSubElts, NumSubElts, RotateAmt))
11261 return -1;
11262 unsigned NumElts = Mask.size();
11263 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11264 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11265 return RotateAmt;
11266}
11267
11268/// Lower shuffle using X86ISD::VROTLI rotations.
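/// For example, the v16i8 mask <1, 0, 3, 2, 5, 4, ...> swaps adjacent bytes
/// and can be handled as a v8i16 bit-rotation by 8.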
11269static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11270 ArrayRef<int> Mask,
11271 const X86Subtarget &Subtarget,
11272 SelectionDAG &DAG) {
11273 // Only XOP + AVX512 targets have bit rotation instructions.
11274 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11275 bool IsLegal =
11276 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11277 if (!IsLegal && Subtarget.hasSSE3())
11278 return SDValue();
11279
11280 MVT RotateVT;
11281 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11282 Subtarget, Mask);
11283 if (RotateAmt < 0)
11284 return SDValue();
11285
11286 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11287 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11288 // widen to vXi16 or more then the existing lowering will be better.
11289 if (!IsLegal) {
11290 if ((RotateAmt % 16) == 0)
11291 return SDValue();
11292 // TODO: Use getTargetVShiftByConstNode.
11293 unsigned ShlAmt = RotateAmt;
11294 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11295 V1 = DAG.getBitcast(RotateVT, V1);
11296 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11297 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11298 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11299 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11300 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11301 return DAG.getBitcast(VT, Rot);
11302 }
11303
11304 SDValue Rot =
11305 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11306 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11307 return DAG.getBitcast(VT, Rot);
11308}
11309
11310/// Try to match a vector shuffle as an element rotation.
11311///
11312/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11313static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11314 ArrayRef<int> Mask) {
11315 int NumElts = Mask.size();
11316
11317 // We need to detect various ways of spelling a rotation:
11318 // [11, 12, 13, 14, 15, 0, 1, 2]
11319 // [-1, 12, 13, 14, -1, -1, 1, -1]
11320 // [-1, -1, -1, -1, -1, -1, 1, 2]
11321 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11322 // [-1, 4, 5, 6, -1, -1, 9, -1]
11323 // [-1, 4, 5, 6, -1, -1, -1, -1]
11324 int Rotation = 0;
11325 SDValue Lo, Hi;
11326 for (int i = 0; i < NumElts; ++i) {
11327 int M = Mask[i];
11328 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11329 "Unexpected mask index.");
11330 if (M < 0)
11331 continue;
11332
11333 // Determine where a rotated vector would have started.
11334 int StartIdx = i - (M % NumElts);
11335 if (StartIdx == 0)
11336 // The identity rotation isn't interesting, stop.
11337 return -1;
11338
11339 // If we found the tail of a vector the rotation must be the missing
11340 // front. If we found the head of a vector, it must be how much of the
11341 // head.
11342 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11343
11344 if (Rotation == 0)
11345 Rotation = CandidateRotation;
11346 else if (Rotation != CandidateRotation)
11347 // The rotations don't match, so we can't match this mask.
11348 return -1;
11349
11350 // Compute which value this mask is pointing at.
11351 SDValue MaskV = M < NumElts ? V1 : V2;
11352
11353 // Compute which of the two target values this index should be assigned
11354 // to. This reflects whether the high elements are remaining or the low
11355 // elements are remaining.
11356 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11357
11358 // Either set up this value if we've not encountered it before, or check
11359 // that it remains consistent.
11360 if (!TargetV)
11361 TargetV = MaskV;
11362 else if (TargetV != MaskV)
11363 // This may be a rotation, but it pulls from the inputs in some
11364 // unsupported interleaving.
11365 return -1;
11366 }
11367
11368 // Check that we successfully analyzed the mask, and normalize the results.
11369 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11370 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11371 if (!Lo)
11372 Lo = Hi;
11373 else if (!Hi)
11374 Hi = Lo;
11375
11376 V1 = Lo;
11377 V2 = Hi;
11378
11379 return Rotation;
11380}
11381
11382/// Try to lower a vector shuffle as a byte rotation.
11383///
11384/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11385/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11386/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11387/// try to generically lower a vector shuffle through such a pattern. It
11388/// does not check for the profitability of lowering either as PALIGNR or
11389/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11390/// This matches shuffle vectors that look like:
11391///
11392/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11393///
11394/// Essentially it concatenates V1 and V2, shifts right by some number of
11395/// elements, and takes the low elements as the result. Note that while this is
11396/// specified as a *right shift* because x86 is little-endian, it is a *left
11397/// rotate* of the vector lanes.
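/// For the v8i16 example above the matched element rotation is 3, which scales
/// to a PALIGNR byte immediate of 6.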
11398static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11399 ArrayRef<int> Mask) {
11400 // Don't accept any shuffles with zero elements.
11401 if (isAnyZero(Mask))
11402 return -1;
11403
11404 // PALIGNR works on 128-bit lanes.
11405 SmallVector<int, 16> RepeatedMask;
11406 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11407 return -1;
11408
11409 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11410 if (Rotation <= 0)
11411 return -1;
11412
11413 // PALIGNR rotates bytes, so we need to scale the
11414 // rotation based on how many bytes are in the vector lane.
11415 int NumElts = RepeatedMask.size();
11416 int Scale = 16 / NumElts;
11417 return Rotation * Scale;
11418}
11419
11420static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11421 SDValue V2, ArrayRef<int> Mask,
11422 const X86Subtarget &Subtarget,
11423 SelectionDAG &DAG) {
11424 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11425
11426 SDValue Lo = V1, Hi = V2;
11427 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11428 if (ByteRotation <= 0)
11429 return SDValue();
11430
11431 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11432 // PSLLDQ/PSRLDQ.
11433 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11434 Lo = DAG.getBitcast(ByteVT, Lo);
11435 Hi = DAG.getBitcast(ByteVT, Hi);
11436
11437 // SSSE3 targets can use the palignr instruction.
11438 if (Subtarget.hasSSSE3()) {
11439 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11440 "512-bit PALIGNR requires BWI instructions");
11441 return DAG.getBitcast(
11442 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11443 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11444 }
11445
11446 assert(VT.is128BitVector() &&
11447 "Rotate-based lowering only supports 128-bit lowering!");
11448 assert(Mask.size() <= 16 &&
11449 "Can shuffle at most 16 bytes in a 128-bit vector!");
11450 assert(ByteVT == MVT::v16i8 &&
11451 "SSE2 rotate lowering only needed for v16i8!");
11452
11453 // Default SSE2 implementation
11454 int LoByteShift = 16 - ByteRotation;
11455 int HiByteShift = ByteRotation;
11456
11457 SDValue LoShift =
11458 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11459 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11460 SDValue HiShift =
11461 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11462 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11463 return DAG.getBitcast(VT,
11464 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11465}
11466
11467/// Try to lower a vector shuffle as a dword/qword rotation.
11468///
11469/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11470/// rotation of the concatenation of two vectors; this routine will
11471/// try to generically lower a vector shuffle through such a pattern.
11472///
11473/// Essentially it concatenates V1 and V2, shifts right by some number of
11474/// elements, and takes the low elements as the result. Note that while this is
11475/// specified as a *right shift* because x86 is little-endian, it is a *left
11476/// rotate* of the vector lanes.
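/// For example, the v8i32 shuffle <3, 4, 5, 6, 7, 8, 9, 10> takes elements
/// 3..10 of the concatenation and lowers to a VALIGND with a rotation of 3.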
11477static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
11478 SDValue V2, ArrayRef<int> Mask,
11479 const APInt &Zeroable,
11480 const X86Subtarget &Subtarget,
11481 SelectionDAG &DAG) {
11482 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
11483 "Only 32-bit and 64-bit elements are supported!");
11484
11485 // 128/256-bit vectors are only supported with VLX.
11486 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
11487 && "VLX required for 128/256-bit vectors");
11488
11489 SDValue Lo = V1, Hi = V2;
11490 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
11491 if (0 < Rotation)
11492 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
11493 DAG.getTargetConstant(Rotation, DL, MVT::i8));
11494
11495 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
11496 // TODO: Pull this out as a matchShuffleAsElementShift helper?
11497 // TODO: We can probably make this more aggressive and use shift-pairs like
11498 // lowerShuffleAsByteShiftMask.
11499 unsigned NumElts = Mask.size();
11500 unsigned ZeroLo = Zeroable.countr_one();
11501 unsigned ZeroHi = Zeroable.countl_one();
11502 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
11503 if (!ZeroLo && !ZeroHi)
11504 return SDValue();
11505
11506 if (ZeroLo) {
11507 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11508 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
11509 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
11510 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
11511 getZeroVector(VT, Subtarget, DAG, DL),
11512 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
11513 }
11514
11515 if (ZeroHi) {
11516 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
11517 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
11518 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
11519 return DAG.getNode(X86ISD::VALIGN, DL, VT,
11520 getZeroVector(VT, Subtarget, DAG, DL), Src,
11521 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
11522 }
11523
11524 return SDValue();
11525}
11526
11527/// Try to lower a vector shuffle as a byte shift sequence.
11528static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
11529 SDValue V2, ArrayRef<int> Mask,
11530 const APInt &Zeroable,
11531 const X86Subtarget &Subtarget,
11532 SelectionDAG &DAG) {
11533 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11534 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11535
11536 // We need a shuffle that has zeros at one/both ends and a sequential
11537 // shuffle from one source within.
11538 unsigned ZeroLo = Zeroable.countr_one();
11539 unsigned ZeroHi = Zeroable.countl_one();
11540 if (!ZeroLo && !ZeroHi)
11541 return SDValue();
11542
11543 unsigned NumElts = Mask.size();
11544 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11545 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
11546 return SDValue();
11547
11548 unsigned Scale = VT.getScalarSizeInBits() / 8;
11549 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
11550 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
11551 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
11552 return SDValue();
11553
11554 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11555 Res = DAG.getBitcast(MVT::v16i8, Res);
11556
11557 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
11558 // inner sequential set of elements, possibly offset:
11559 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11560 // 01234567 --> 4567zzzz --> zzzzz456
11561 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11562 if (ZeroLo == 0) {
11563 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11564 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11565 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11566 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11567 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
11568 } else if (ZeroHi == 0) {
11569 unsigned Shift = Mask[ZeroLo] % NumElts;
11570 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11571 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11572 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11573 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11574 } else if (!Subtarget.hasSSSE3()) {
11575 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
11576 // by performing 3 byte shifts. Shuffle combining can kick in above that.
11577 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
11578 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11579 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11580 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11581 Shift += Mask[ZeroLo] % NumElts;
11582 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11583 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11584 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11585 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11586 } else
11587 return SDValue();
11588
11589 return DAG.getBitcast(VT, Res);
11590}
11591
11592/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
11593///
11594/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
11595/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11596/// matches elements from one of the input vectors shuffled to the left or
11597/// right with zeroable elements 'shifted in'. It handles both the strictly
11598/// bit-wise element shifts and the byte shift across an entire 128-bit double
11599/// quad word lane.
11600///
11601/// PSLL : (little-endian) left bit shift.
11602/// [ zz, 0, zz, 2 ]
11603/// [ -1, 4, zz, -1 ]
11604/// PSRL : (little-endian) right bit shift.
11605/// [ 1, zz, 3, zz]
11606/// [ -1, -1, 7, zz]
11607/// PSLLDQ : (little-endian) left byte shift
11608/// [ zz, 0, 1, 2, 3, 4, 5, 6]
11609/// [ zz, zz, -1, -1, 2, 3, 4, -1]
11610/// [ zz, zz, zz, zz, zz, zz, -1, 1]
11611/// PSRLDQ : (little-endian) right byte shift
11612/// [ 5, 6, 7, zz, zz, zz, zz, zz]
11613/// [ -1, 5, 6, 7, zz, zz, zz, zz]
11614/// [ 1, 2, -1, -1, -1, -1, zz, zz]
11615static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
11616 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
11617 int MaskOffset, const APInt &Zeroable,
11618 const X86Subtarget &Subtarget) {
11619 int Size = Mask.size();
11620 unsigned SizeInBits = Size * ScalarSizeInBits;
11621
11622 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
11623 for (int i = 0; i < Size; i += Scale)
11624 for (int j = 0; j < Shift; ++j)
11625 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11626 return false;
11627
11628 return true;
11629 };
11630
11631 auto MatchShift = [&](int Shift, int Scale, bool Left) {
11632 for (int i = 0; i != Size; i += Scale) {
11633 unsigned Pos = Left ? i + Shift : i;
11634 unsigned Low = Left ? i : i + Shift;
11635 unsigned Len = Scale - Shift;
11636 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
11637 return -1;
11638 }
11639
11640 int ShiftEltBits = ScalarSizeInBits * Scale;
11641 bool ByteShift = ShiftEltBits > 64;
11642 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
11643 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
11644 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
11645
11646 // Normalize the scale for byte shifts to still produce an i64 element
11647 // type.
11648 Scale = ByteShift ? Scale / 2 : Scale;
11649
11650 // We need to round trip through the appropriate type for the shift.
11651 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
11652 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
11653 : MVT::getVectorVT(ShiftSVT, Size / Scale);
11654 return (int)ShiftAmt;
11655 };
11656
11657 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11658 // keep doubling the size of the integer elements up to that. We can
11659 // then shift the elements of the integer vector by whole multiples of
11660 // their width within the elements of the larger integer vector. Test each
11661 // multiple to see if we can find a match with the moved element indices
11662 // and that the shifted in elements are all zeroable.
11663 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
11664 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
11665 for (int Shift = 1; Shift != Scale; ++Shift)
11666 for (bool Left : {true, false})
11667 if (CheckZeros(Shift, Scale, Left)) {
11668 int ShiftAmt = MatchShift(Shift, Scale, Left);
11669 if (0 < ShiftAmt)
11670 return ShiftAmt;
11671 }
11672
11673 // no match
11674 return -1;
11675}
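// Illustrative example: the v4i32 mask <zz,0,zz,2> (the PSLL case documented
// above) matches with Scale = 2, Shift = 1, Left = true: within each 64-bit
// chunk the source element moves up one 32-bit slot with a zero below it, so
// the helper returns ShiftAmt = 32 with Opcode = X86ISD::VSHLI and
// ShiftVT = v2i64, i.e. a PSLLQ $32 after a bitcast.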
11676
11677static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
11678 SDValue V2, ArrayRef<int> Mask,
11679 const APInt &Zeroable,
11680 const X86Subtarget &Subtarget,
11681 SelectionDAG &DAG, bool BitwiseOnly) {
11682 int Size = Mask.size();
11683 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11684
11685 MVT ShiftVT;
11686 SDValue V = V1;
11687 unsigned Opcode;
11688
11689 // Try to match shuffle against V1 shift.
11690 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11691 Mask, 0, Zeroable, Subtarget);
11692
11693 // If V1 failed, try to match shuffle against V2 shift.
11694 if (ShiftAmt < 0) {
11695 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11696 Mask, Size, Zeroable, Subtarget);
11697 V = V2;
11698 }
11699
11700 if (ShiftAmt < 0)
11701 return SDValue();
11702
11703 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
11704 return SDValue();
11705
11706 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
11707 "Illegal integer vector type");
11708 V = DAG.getBitcast(ShiftVT, V);
11709 V = DAG.getNode(Opcode, DL, ShiftVT, V,
11710 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
11711 return DAG.getBitcast(VT, V);
11712}
11713
11714// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
11715// Remainder of lower half result is zero and upper half is all undef.
11716static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
11717 ArrayRef<int> Mask, uint64_t &BitLen,
11718 uint64_t &BitIdx, const APInt &Zeroable) {
11719 int Size = Mask.size();
11720 int HalfSize = Size / 2;
11721 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11722 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
11723
11724 // Upper half must be undefined.
11725 if (!isUndefUpperHalf(Mask))
11726 return false;
11727
11728 // Determine the extraction length from the part of the
11729 // lower half that isn't zeroable.
11730 int Len = HalfSize;
11731 for (; Len > 0; --Len)
11732 if (!Zeroable[Len - 1])
11733 break;
11734 assert(Len > 0 && "Zeroable shuffle mask");
11735
11736 // Attempt to match first Len sequential elements from the lower half.
11737 SDValue Src;
11738 int Idx = -1;
11739 for (int i = 0; i != Len; ++i) {
11740 int M = Mask[i];
11741 if (M == SM_SentinelUndef)
11742 continue;
11743 SDValue &V = (M < Size ? V1 : V2);
11744 M = M % Size;
11745
11746 // The extracted elements must start at a valid index and all mask
11747 // elements must be in the lower half.
11748 if (i > M || M >= HalfSize)
11749 return false;
11750
11751 if (Idx < 0 || (Src == V && Idx == (M - i))) {
11752 Src = V;
11753 Idx = M - i;
11754 continue;
11755 }
11756 return false;
11757 }
11758
11759 if (!Src || Idx < 0)
11760 return false;
11761
11762 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
11763 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11764 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11765 V1 = Src;
11766 return true;
11767}
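// Illustrative example (v8i16): the mask <1,2,3,zz,u,u,u,u>, with element 3
// zeroable and the upper half undef, matches with Len = 3 and Idx = 1, i.e.
// BitLen = 48 and BitIdx = 16: EXTRQI copies bits [63:16] of the low quadword
// into bits [47:0] of the result and zeroes the rest of the low half.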
11768
11769// INSERTQ: Extract lowest Len elements from lower half of second source and
11770// insert over first source, starting at Idx.
11771// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
11772static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
11773 ArrayRef<int> Mask, uint64_t &BitLen,
11774 uint64_t &BitIdx) {
11775 int Size = Mask.size();
11776 int HalfSize = Size / 2;
11777 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11778
11779 // Upper half must be undefined.
11780 if (!isUndefUpperHalf(Mask))
11781 return false;
11782
11783 for (int Idx = 0; Idx != HalfSize; ++Idx) {
11784 SDValue Base;
11785
11786 // Attempt to match first source from mask before insertion point.
11787 if (isUndefInRange(Mask, 0, Idx)) {
11788 /* EMPTY */
11789 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
11790 Base = V1;
11791 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
11792 Base = V2;
11793 } else {
11794 continue;
11795 }
11796
11797 // Extend the extraction length looking to match both the insertion of
11798 // the second source and the remaining elements of the first.
11799 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
11800 SDValue Insert;
11801 int Len = Hi - Idx;
11802
11803 // Match insertion.
11804 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
11805 Insert = V1;
11806 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
11807 Insert = V2;
11808 } else {
11809 continue;
11810 }
11811
11812 // Match the remaining elements of the lower half.
11813 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
11814 /* EMPTY */
11815 } else if ((!Base || (Base == V1)) &&
11816 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
11817 Base = V1;
11818 } else if ((!Base || (Base == V2)) &&
11819 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
11820 Size + Hi)) {
11821 Base = V2;
11822 } else {
11823 continue;
11824 }
11825
11826 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11827 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11828 V1 = Base;
11829 V2 = Insert;
11830 return true;
11831 }
11832 }
11833
11834 return false;
11835}
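// Illustrative example (v8i16): the mask <0,8,9,3,u,u,u,u> matches with
// Idx = 1 and Len = 2 (Base = V1, Insert = V2), giving BitLen = 32 and
// BitIdx = 16: INSERTQI places the low 32 bits of V2 at bit offset 16 of
// V1's low quadword, producing { A0, B0, B1, A3, undef... }.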
11836
11837/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
11838static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
11839 SDValue V2, ArrayRef<int> Mask,
11840 const APInt &Zeroable, SelectionDAG &DAG) {
11841 uint64_t BitLen, BitIdx;
11842 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
11843 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
11844 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11845 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11846
11847 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
11848 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
11849 V2 ? V2 : DAG.getUNDEF(VT),
11850 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11851 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11852
11853 return SDValue();
11854}
11855
11856/// Lower a vector shuffle as a zero or any extension.
11857///
11858/// Given a specific number of elements, element bit width, and extension
11859/// stride, produce either a zero or any extension based on the available
11860/// features of the subtarget. The extended elements are consecutive and
11861/// can start from an offset element index in the input; to
11862/// avoid excess shuffling, the offset must either be in the bottom lane
11863/// or at the start of a higher lane. All extended elements must be from
11864/// the same lane.
11865static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
11866 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
11867 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11868 assert(Scale > 1 && "Need a scale to extend.");
11869 int EltBits = VT.getScalarSizeInBits();
11870 int NumElements = VT.getVectorNumElements();
11871 int NumEltsPerLane = 128 / EltBits;
11872 int OffsetLane = Offset / NumEltsPerLane;
11873 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
11874 "Only 8, 16, and 32 bit elements can be extended.");
11875 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
11876 assert(0 <= Offset && "Extension offset must be positive.");
11877 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
11878 "Extension offset must be in the first lane or start an upper lane.");
11879
11880 // Check that an index is in same lane as the base offset.
11881 auto SafeOffset = [&](int Idx) {
11882 return OffsetLane == (Idx / NumEltsPerLane);
11883 };
11884
11885 // Shift along an input so that the offset base moves to the first element.
11886 auto ShuffleOffset = [&](SDValue V) {
11887 if (!Offset)
11888 return V;
11889
11890 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11891 for (int i = 0; i * Scale < NumElements; ++i) {
11892 int SrcIdx = i + Offset;
11893 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
11894 }
11895 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
11896 };
11897
11898 // Found a valid a/zext mask! Try various lowering strategies based on the
11899 // input type and available ISA extensions.
11900 if (Subtarget.hasSSE41()) {
11901 // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
11902 // PUNPCK will catch this in a later shuffle match.
11903 if (Offset && Scale == 2 && VT.is128BitVector())
11904 return SDValue();
11905 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
11906 NumElements / Scale);
11907 InputV = DAG.getBitcast(VT, InputV);
11908 InputV = ShuffleOffset(InputV);
11909 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
11910 DL, ExtVT, InputV, DAG);
11911 return DAG.getBitcast(VT, InputV);
11912 }
11913
11914 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
11915 InputV = DAG.getBitcast(VT, InputV);
11916
11917 // For any extends we can cheat for larger element sizes and use shuffle
11918 // instructions that can fold with a load and/or copy.
11919 if (AnyExt && EltBits == 32) {
11920 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
11921 -1};
11922 return DAG.getBitcast(
11923 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11924 DAG.getBitcast(MVT::v4i32, InputV),
11925 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11926 }
11927 if (AnyExt && EltBits == 16 && Scale > 2) {
11928 int PSHUFDMask[4] = {Offset / 2, -1,
11929 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
11930 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11931 DAG.getBitcast(MVT::v4i32, InputV),
11932 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11933 int PSHUFWMask[4] = {1, -1, -1, -1};
11934 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
11935 return DAG.getBitcast(
11936 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
11937 DAG.getBitcast(MVT::v8i16, InputV),
11938 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
11939 }
11940
11941 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
11942 // to 64-bits.
11943 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
11944 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
11945 assert(VT.is128BitVector() && "Unexpected vector width!");
11946
11947 int LoIdx = Offset * EltBits;
11948 SDValue Lo = DAG.getBitcast(
11949 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11950 DAG.getTargetConstant(EltBits, DL, MVT::i8),
11951 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
11952
11953 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
11954 return DAG.getBitcast(VT, Lo);
11955
11956 int HiIdx = (Offset + 1) * EltBits;
11957 SDValue Hi = DAG.getBitcast(
11958 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11959 DAG.getTargetConstant(EltBits, DL, MVT::i8),
11960 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
11961 return DAG.getBitcast(VT,
11962 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
11963 }
11964
11965 // If this would require more than 2 unpack instructions to expand, use
11966 // pshufb when available. We can only use more than 2 unpack instructions
11967 // when zero extending i8 elements which also makes it easier to use pshufb.
11968 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
11969 assert(NumElements == 16 && "Unexpected byte vector width!");
11970 SDValue PSHUFBMask[16];
11971 for (int i = 0; i < 16; ++i) {
11972 int Idx = Offset + (i / Scale);
11973 if ((i % Scale == 0 && SafeOffset(Idx))) {
11974 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
11975 continue;
11976 }
11977 PSHUFBMask[i] =
11978 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
11979 }
11980 InputV = DAG.getBitcast(MVT::v16i8, InputV);
11981 return DAG.getBitcast(
11982 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
11983 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
11984 }
11985
11986 // If we are extending from an offset, ensure we start on a boundary that
11987 // we can unpack from.
11988 int AlignToUnpack = Offset % (NumElements / Scale);
11989 if (AlignToUnpack) {
11990 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11991 for (int i = AlignToUnpack; i < NumElements; ++i)
11992 ShMask[i - AlignToUnpack] = i;
11993 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
11994 Offset -= AlignToUnpack;
11995 }
11996
11997 // Otherwise emit a sequence of unpacks.
11998 do {
11999 unsigned UnpackLoHi = X86ISD::UNPCKL;
12000 if (Offset >= (NumElements / 2)) {
12001 UnpackLoHi = X86ISD::UNPCKH;
12002 Offset -= (NumElements / 2);
12003 }
12004
12005 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12006 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12007 : getZeroVector(InputVT, Subtarget, DAG, DL);
12008 InputV = DAG.getBitcast(InputVT, InputV);
12009 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12010 Scale /= 2;
12011 EltBits *= 2;
12012 NumElements /= 2;
12013 } while (Scale > 1);
12014 return DAG.getBitcast(VT, InputV);
12015}
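// Illustrative example of the unpack fallback: zero-extending v16i8 elements
// to v8i16 without SSE4.1 takes one pass through the loop above (a PUNPCKLBW
// against a zero vector, or PUNPCKHBW for an upper-half offset), while a
// v16i8 -> v4i32 extension needs two rounds of unpacking.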
12016
12017/// Try to lower a vector shuffle as a zero extension on any microarch.
12018///
12019/// This routine will try to do everything in its power to cleverly lower
12020/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12021/// check for the profitability of this lowering, it tries to aggressively
12022/// match this pattern. It will use all of the micro-architectural details it
12023/// can to emit an efficient lowering. It handles both blends with all-zero
12024/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12025/// masking out later).
12026///
12027/// The reason we have dedicated lowering for zext-style shuffles is that they
12028/// are both incredibly common and often quite performance sensitive.
12029static SDValue lowerShuffleAsZeroOrAnyExtend(
12030 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12031 const APInt &Zeroable, const X86Subtarget &Subtarget,
12032 SelectionDAG &DAG) {
12033 int Bits = VT.getSizeInBits();
12034 int NumLanes = Bits / 128;
12035 int NumElements = VT.getVectorNumElements();
12036 int NumEltsPerLane = NumElements / NumLanes;
12037 assert(VT.getScalarSizeInBits() <= 32 &&
12038 "Exceeds 32-bit integer zero extension limit");
12039 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12040
12041 // Define a helper function to check a particular ext-scale and lower to it if
12042 // valid.
12043 auto Lower = [&](int Scale) -> SDValue {
12044 SDValue InputV;
12045 bool AnyExt = true;
12046 int Offset = 0;
12047 int Matches = 0;
12048 for (int i = 0; i < NumElements; ++i) {
12049 int M = Mask[i];
12050 if (M < 0)
12051 continue; // Valid anywhere but doesn't tell us anything.
12052 if (i % Scale != 0) {
12053 // Each of the extended elements needs to be zeroable.
12054 if (!Zeroable[i])
12055 return SDValue();
12056
12057 // We no longer are in the anyext case.
12058 AnyExt = false;
12059 continue;
12060 }
12061
12062 // Each of the base elements needs to be consecutive indices into the
12063 // same input vector.
12064 SDValue V = M < NumElements ? V1 : V2;
12065 M = M % NumElements;
12066 if (!InputV) {
12067 InputV = V;
12068 Offset = M - (i / Scale);
12069 } else if (InputV != V)
12070 return SDValue(); // Flip-flopping inputs.
12071
12072 // Offset must start in the lowest 128-bit lane or at the start of an
12073 // upper lane.
12074 // FIXME: Is it ever worth allowing a negative base offset?
12075 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12076 (Offset % NumEltsPerLane) == 0))
12077 return SDValue();
12078
12079 // If we are offsetting, all referenced entries must come from the same
12080 // lane.
12081 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12082 return SDValue();
12083
12084 if ((M % NumElements) != (Offset + (i / Scale)))
12085 return SDValue(); // Non-consecutive strided elements.
12086 Matches++;
12087 }
12088
12089 // If we fail to find an input, we have a zero-shuffle which should always
12090 // have already been handled.
12091 // FIXME: Maybe handle this here in case during blending we end up with one?
12092 if (!InputV)
12093 return SDValue();
12094
12095 // If we are offsetting, don't extend if we only match a single input, we
12096 // can always do better by using a basic PSHUF or PUNPCK.
12097 if (Offset != 0 && Matches < 2)
12098 return SDValue();
12099
12100 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
12101 InputV, Mask, Subtarget, DAG);
12102 };
12103
12104 // The widest scale possible for extending is to a 64-bit integer.
12105 assert(Bits % 64 == 0 &&
12106 "The number of bits in a vector must be divisible by 64 on x86!");
12107 int NumExtElements = Bits / 64;
12108
12109 // Each iteration, try extending the elements half as much, but into twice as
12110 // many elements.
12111 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12112 assert(NumElements % NumExtElements == 0 &&
12113 "The input vector size must be divisible by the extended size.");
12114 if (SDValue V = Lower(NumElements / NumExtElements))
12115 return V;
12116 }
12117
12118 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12119 if (Bits != 128)
12120 return SDValue();
12121
12122 // Returns one of the source operands if the shuffle can be reduced to a
12123 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12124 auto CanZExtLowHalf = [&]() {
12125 for (int i = NumElements / 2; i != NumElements; ++i)
12126 if (!Zeroable[i])
12127 return SDValue();
12128 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12129 return V1;
12130 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12131 return V2;
12132 return SDValue();
12133 };
12134
12135 if (SDValue V = CanZExtLowHalf()) {
12136 V = DAG.getBitcast(MVT::v2i64, V);
12137 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12138 return DAG.getBitcast(VT, V);
12139 }
12140
12141 // No viable ext lowering found.
12142 return SDValue();
12143}
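// Illustrative example: a v4i32 shuffle with mask <0,zz,1,zz> (odd elements
// zeroable) matches with Scale = 2 and Offset = 0; with SSE4.1 this becomes a
// ZERO_EXTEND_VECTOR_INREG of V1 (PMOVZXDQ), and without SSE4.1 it falls back
// to unpacking V1 against a zero vector.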
12144
12145/// Try to get a scalar value for a specific element of a vector.
12146///
12147/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12148static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12149 SelectionDAG &DAG) {
12150 MVT VT = V.getSimpleValueType();
12151 MVT EltVT = VT.getVectorElementType();
12152 V = peekThroughBitcasts(V);
12153
12154 // If the bitcasts shift the element size, we can't extract an equivalent
12155 // element from it.
12156 MVT NewVT = V.getSimpleValueType();
12157 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12158 return SDValue();
12159
12160 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12161 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12162 // Ensure the scalar operand is the same size as the destination.
12163 // FIXME: Add support for scalar truncation where possible.
12164 SDValue S = V.getOperand(Idx);
12165 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12166 return DAG.getBitcast(EltVT, S);
12167 }
12168
12169 return SDValue();
12170}
12171
12172/// Helper to test for a load that can be folded with x86 shuffles.
12173///
12174/// This is particularly important because the set of instructions varies
12175/// significantly based on whether the operand is a load or not.
12176static bool isShuffleFoldableLoad(SDValue V) {
12177 return V->hasOneUse() &&
12178 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12179}
12180
12181template<typename T>
12182static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12183 T EltVT = VT.getScalarType();
12184 return EltVT == MVT::bf16 || (EltVT == MVT::f16 && !Subtarget.hasFP16());
12185}
12186
12187/// Try to lower insertion of a single element into a zero vector.
12188///
12189/// This is a common pattern for which we have especially efficient lowering
12190/// patterns across all subtarget feature sets.
12191static SDValue lowerShuffleAsElementInsertion(
12192 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12193 const APInt &Zeroable, const X86Subtarget &Subtarget,
12194 SelectionDAG &DAG) {
12195 MVT ExtVT = VT;
12196 MVT EltVT = VT.getVectorElementType();
12197 unsigned NumElts = VT.getVectorNumElements();
12198 unsigned EltBits = VT.getScalarSizeInBits();
12199
12200 if (isSoftF16(EltVT, Subtarget))
12201 return SDValue();
12202
12203 int V2Index =
12204 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12205 Mask.begin();
12206 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12207 bool IsV1Zeroable = true;
12208 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12209 if (i != V2Index && !Zeroable[i]) {
12210 IsV1Zeroable = false;
12211 break;
12212 }
12213
12214 // Bail if a non-zero V1 isn't used in place.
12215 if (!IsV1Zeroable) {
12216 SmallVector<int, 8> V1Mask(Mask);
12217 V1Mask[V2Index] = -1;
12218 if (!isNoopShuffleMask(V1Mask))
12219 return SDValue();
12220 }
12221
12222 // Check for a single input from a SCALAR_TO_VECTOR node.
12223 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12224 // all the smarts here sunk into that routine. However, the current
12225 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12226 // vector shuffle lowering is dead.
12227 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12228 DAG);
12229 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12230 // We need to zext the scalar if it is smaller than an i32.
12231 V2S = DAG.getBitcast(EltVT, V2S);
12232 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12233 // Using zext to expand a narrow element won't work for non-zero
12234 // insertions. But we can use a masked constant vector if we're
12235 // inserting V2 into the bottom of V1.
12236 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12237 return SDValue();
12238
12239 // Zero-extend directly to i32.
12240 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12241 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12242
12243 // If we're inserting into a constant, mask off the inserted index
12244 // and OR with the zero-extended scalar.
12245 if (!IsV1Zeroable) {
12246 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12247 Bits[V2Index] = APInt::getZero(EltBits);
12248 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12249 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12250 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12251 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12252 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12253 }
12254 }
12255 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12256 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12257 EltVT == MVT::i16) {
12258 // Either not inserting from the low element of the input or the input
12259 // element size is too small to use VZEXT_MOVL to clear the high bits.
12260 return SDValue();
12261 }
12262
12263 if (!IsV1Zeroable) {
12264 // If V1 can't be treated as a zero vector we have fewer options to lower
12265 // this. We can't support integer vectors or non-zero targets cheaply.
12266 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12267 if (!VT.isFloatingPoint() || V2Index != 0)
12268 return SDValue();
12269 if (!VT.is128BitVector())
12270 return SDValue();
12271
12272 // Otherwise, use MOVSD, MOVSS or MOVSH.
12273 unsigned MovOpc = 0;
12274 if (EltVT == MVT::f16)
12275 MovOpc = X86ISD::MOVSH;
12276 else if (EltVT == MVT::f32)
12277 MovOpc = X86ISD::MOVSS;
12278 else if (EltVT == MVT::f64)
12279 MovOpc = X86ISD::MOVSD;
12280 else
12281 llvm_unreachable("Unsupported floating point element type to handle!");
12282 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12283 }
12284
12285 // This lowering only works for the low element with floating point vectors.
12286 if (VT.isFloatingPoint() && V2Index != 0)
12287 return SDValue();
12288
12289 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12290 if (ExtVT != VT)
12291 V2 = DAG.getBitcast(VT, V2);
12292
12293 if (V2Index != 0) {
12294 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12295 // the desired position. Otherwise it is more efficient to do a vector
12296 // shift left. We know that we can do a vector shift left because all
12297 // the inputs are zero.
12298 if (VT.isFloatingPoint() || NumElts <= 4) {
12299 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12300 V2Shuffle[V2Index] = 0;
12301 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12302 } else {
12303 V2 = DAG.getBitcast(MVT::v16i8, V2);
12304 V2 = DAG.getNode(
12305 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12306 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12307 V2 = DAG.getBitcast(VT, V2);
12308 }
12309 }
12310 return V2;
12311}
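// Illustrative example: a v4i32 shuffle with mask <4,zz,zz,zz>, where V2 comes
// from a SCALAR_TO_VECTOR/BUILD_VECTOR node and V1 is fully zeroable, takes
// the path above with V2Index = 0: the scalar is wrapped in SCALAR_TO_VECTOR
// and cleared with X86ISD::VZEXT_MOVL, which typically selects to a MOVD that
// zeroes the upper lanes.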
12312
12313/// Try to lower broadcast of a single - truncated - integer element,
12314/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12315///
12316/// This assumes we have AVX2.
12317static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12318 int BroadcastIdx,
12319 const X86Subtarget &Subtarget,
12320 SelectionDAG &DAG) {
12321 assert(Subtarget.hasAVX2() &&
12322 "We can only lower integer broadcasts with AVX2!");
12323
12324 MVT EltVT = VT.getVectorElementType();
12325 MVT V0VT = V0.getSimpleValueType();
12326
12327 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12328 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12329
12330 MVT V0EltVT = V0VT.getVectorElementType();
12331 if (!V0EltVT.isInteger())
12332 return SDValue();
12333
12334 const unsigned EltSize = EltVT.getSizeInBits();
12335 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12336
12337 // This is only a truncation if the original element type is larger.
12338 if (V0EltSize <= EltSize)
12339 return SDValue();
12340
12341 assert(((V0EltSize % EltSize) == 0) &&
12342 "Scalar type sizes must all be powers of 2 on x86!");
12343
12344 const unsigned V0Opc = V0.getOpcode();
12345 const unsigned Scale = V0EltSize / EltSize;
12346 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12347
12348 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12349 V0Opc != ISD::BUILD_VECTOR)
12350 return SDValue();
12351
12352 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12353
12354 // If we're extracting non-least-significant bits, shift so we can truncate.
12355 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12356 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12357 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12358 if (const int OffsetIdx = BroadcastIdx % Scale)
12359 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12360 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12361
12362 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12363 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12364}
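// Illustrative example: broadcasting element 1 of a v4i32 shuffle whose source
// is a v2i64 BUILD_VECTOR gives Scale = 2, V0BroadcastIdx = 0 and
// OffsetIdx = 1, so the i64 scalar is shifted right by 32, truncated to i32
// and fed to the VBROADCAST (VPBROADCASTD) node.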
12365
12366/// Test whether this can be lowered with a single SHUFPS instruction.
12367///
12368/// This is used to disable more specialized lowerings when the shufps lowering
12369/// will happen to be efficient.
12370static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12371 // This routine only handles 128-bit shufps.
12372 assert(Mask.size() == 4 && "Unsupported mask size!");
12373 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12374 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12375 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12376 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12377
12378 // To lower with a single SHUFPS we need to have the low half and high half
12379 // each requiring a single input.
12380 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12381 return false;
12382 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12383 return false;
12384
12385 return true;
12386}
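// Illustrative examples: <0,1,6,7> qualifies (low half purely from V1, high
// half purely from V2, which is exactly what SHUFPS provides), while <0,5,2,7>
// does not because its low half would need one element from each input.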
12387
12388/// Test whether the specified input (0 or 1) is in-place blended by the
12389/// given mask.
12390///
12391/// This returns true if the elements from a particular input are already in the
12392/// slot required by the given mask and require no permutation.
12393static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12394 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12395 int Size = Mask.size();
12396 for (int i = 0; i < Size; ++i)
12397 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12398 return false;
12399
12400 return true;
12401}
12402
12403/// If we are extracting two 128-bit halves of a vector and shuffling the
12404/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12405/// multi-shuffle lowering.
12406static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12407 SDValue N1, ArrayRef<int> Mask,
12408 SelectionDAG &DAG) {
12409 MVT VT = N0.getSimpleValueType();
12410 assert((VT.is128BitVector() &&
12411 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12412 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12413
12414 // Check that both sources are extracts of the same source vector.
12415 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12416 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12417 N0.getOperand(0) != N1.getOperand(0) ||
12418 !N0.hasOneUse() || !N1.hasOneUse())
12419 return SDValue();
12420
12421 SDValue WideVec = N0.getOperand(0);
12422 MVT WideVT = WideVec.getSimpleValueType();
12423 if (!WideVT.is256BitVector())
12424 return SDValue();
12425
12426 // Match extracts of each half of the wide source vector. Commute the shuffle
12427 // if the extract of the low half is N1.
12428 unsigned NumElts = VT.getVectorNumElements();
12429 SmallVector<int, 4> NewMask(Mask);
12430 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12431 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12432 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12433 ShuffleVectorSDNode::commuteMask(NewMask);
12434 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12435 return SDValue();
12436
12437 // Final bailout: if the mask is simple, we are better off using an extract
12438 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12439 // because that avoids a constant load from memory.
12440 if (NumElts == 4 &&
12441 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12442 return SDValue();
12443
12444 // Extend the shuffle mask with undef elements.
12445 NewMask.append(NumElts, -1);
12446
12447 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12448 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12449 NewMask);
12450 // This is free: ymm -> xmm.
12451 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12452 DAG.getIntPtrConstant(0, DL));
12453}
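// Illustrative example: shuffle (extract_subvector X, 0),
// (extract_subvector X, 4), <1,4,3,6> on v4f32 halves of a v8f32 X is neither
// a single-SHUFPS nor an unpack mask, so it is rewritten as an 8-element
// shuffle of X (typically a VPERMPS) followed by a free extraction of the low
// 128 bits.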
12454
12455/// Try to lower broadcast of a single element.
12456///
12457/// For convenience, this code also bundles all of the subtarget feature set
12458/// filtering. While a little annoying to re-dispatch on type here, there isn't
12459/// a convenient way to factor it out.
12460static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
12461 SDValue V2, ArrayRef<int> Mask,
12462 const X86Subtarget &Subtarget,
12463 SelectionDAG &DAG) {
12464 MVT EltVT = VT.getVectorElementType();
12465 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12466 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12467 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
12468 return SDValue();
12469
12470 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12471 // we can only broadcast from a register with AVX2.
12472 unsigned NumEltBits = VT.getScalarSizeInBits();
12473 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12474 ? X86ISD::MOVDDUP
12475 : X86ISD::VBROADCAST;
12476 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12477
12478 // Check that the mask is a broadcast.
12479 int BroadcastIdx = getSplatIndex(Mask);
12480 if (BroadcastIdx < 0)
12481 return SDValue();
12482 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12483 "a sorted mask where the broadcast "
12484 "comes from V1.");
12485
12486 // Go up the chain of (vector) values to find a scalar load that we can
12487 // combine with the broadcast.
12488 // TODO: Combine this logic with findEltLoadSrc() used by
12489 // EltsFromConsecutiveLoads().
12490 int BitOffset = BroadcastIdx * NumEltBits;
12491 SDValue V = V1;
12492 for (;;) {
12493 switch (V.getOpcode()) {
12494 case ISD::BITCAST: {
12495 V = V.getOperand(0);
12496 continue;
12497 }
12498 case ISD::CONCAT_VECTORS: {
12499 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12500 int OpIdx = BitOffset / OpBitWidth;
12501 V = V.getOperand(OpIdx);
12502 BitOffset %= OpBitWidth;
12503 continue;
12504 }
12505 case ISD::EXTRACT_SUBVECTOR: {
12506 // The extraction index adds to the existing offset.
12507 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12508 unsigned Idx = V.getConstantOperandVal(1);
12509 unsigned BeginOffset = Idx * EltBitWidth;
12510 BitOffset += BeginOffset;
12511 V = V.getOperand(0);
12512 continue;
12513 }
12514 case ISD::INSERT_SUBVECTOR: {
12515 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12516 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12517 int Idx = (int)V.getConstantOperandVal(2);
12518 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12519 int BeginOffset = Idx * EltBitWidth;
12520 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12521 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12522 BitOffset -= BeginOffset;
12523 V = VInner;
12524 } else {
12525 V = VOuter;
12526 }
12527 continue;
12528 }
12529 }
12530 break;
12531 }
12532 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12533 BroadcastIdx = BitOffset / NumEltBits;
12534
12535 // Do we need to bitcast the source to retrieve the original broadcast index?
12536 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12537
12538 // Check if this is a broadcast of a scalar. We special case lowering
12539 // for scalars so that we can more effectively fold with loads.
12540 // If the original value has a larger element type than the shuffle, the
12541 // broadcast element is in essence truncated. Make that explicit to ease
12542 // folding.
12543 if (BitCastSrc && VT.isInteger())
12544 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12545 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12546 return TruncBroadcast;
12547
12548 // Also check the simpler case, where we can directly reuse the scalar.
12549 if (!BitCastSrc &&
12550 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12551 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12552 V = V.getOperand(BroadcastIdx);
12553
12554 // If we can't broadcast from a register, check that the input is a load.
12555 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
12556 return SDValue();
12557 } else if (ISD::isNormalLoad(V.getNode()) &&
12558 cast<LoadSDNode>(V)->isSimple()) {
12559 // We do not check for one-use of the vector load because a broadcast load
12560 // is expected to be a win for code size, register pressure, and possibly
12561 // uops even if the original vector load is not eliminated.
12562
12563 // Reduce the vector load and shuffle to a broadcasted scalar load.
12564 LoadSDNode *Ld = cast<LoadSDNode>(V);
12565 SDValue BaseAddr = Ld->getOperand(1);
12566 MVT SVT = VT.getScalarType();
12567 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
12568 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12569 SDValue NewAddr =
12570 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
12571
12572 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
12573 // than MOVDDUP.
12574 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12575 if (Opcode == X86ISD::VBROADCAST) {
12576 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
12577 SDValue Ops[] = {Ld->getChain(), NewAddr};
12578 V = DAG.getMemIntrinsicNode(
12579 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
12580 DAG.getMachineFunction().getMachineMemOperand(
12581 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12582 DAG.makeEquivalentMemoryOrdering(Ld, V);
12583 return DAG.getBitcast(VT, V);
12584 }
12585 assert(SVT == MVT::f64 && "Unexpected VT!");
12586 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12587 DAG.getMachineFunction().getMachineMemOperand(
12588 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12589 DAG.makeEquivalentMemoryOrdering(Ld, V);
12590 } else if (!BroadcastFromReg) {
12591 // We can't broadcast from a vector register.
12592 return SDValue();
12593 } else if (BitOffset != 0) {
12594 // We can only broadcast from the zero-element of a vector register,
12595 // but it can be advantageous to broadcast from the zero-element of a
12596 // subvector.
12597 if (!VT.is256BitVector() && !VT.is512BitVector())
12598 return SDValue();
12599
12600 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12601 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12602 return SDValue();
12603
12604 // Only broadcast the zero-element of a 128-bit subvector.
12605 if ((BitOffset % 128) != 0)
12606 return SDValue();
12607
12608 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12609 "Unexpected bit-offset");
12610 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12611 "Unexpected vector size");
12612 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12613 V = extract128BitVector(V, ExtractIdx, DAG, DL);
12614 }
12615
12616 // On AVX we can use VBROADCAST directly for scalar sources.
12617 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
12618 V = DAG.getBitcast(MVT::f64, V);
12619 if (Subtarget.hasAVX()) {
12620 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
12621 return DAG.getBitcast(VT, V);
12622 }
12623 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
12624 }
12625
12626 // If this is a scalar, do the broadcast on this type and bitcast.
12627 if (!V.getValueType().isVector()) {
12628 assert(V.getScalarValueSizeInBits() == NumEltBits &&
12629 "Unexpected scalar size");
12630 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
12631 VT.getVectorNumElements());
12632 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
12633 }
12634
12635 // We only support broadcasting from 128-bit vectors to minimize the
12636 // number of patterns we need to deal with in isel. So extract down to
12637 // 128-bits, removing as many bitcasts as possible.
12638 if (V.getValueSizeInBits() > 128)
12639 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
12640
12641 // Otherwise cast V to a vector with the same element type as VT, but
12642 // possibly narrower than VT. Then perform the broadcast.
12643 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
12644 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
12645 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
12646}
12647
12648// Check for whether we can use INSERTPS to perform the shuffle. We only use
12649// INSERTPS when the V1 elements are already in the correct locations
12650// because otherwise we can just always use two SHUFPS instructions which
12651// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
12652// perform INSERTPS if a single V1 element is out of place and all V2
12653// elements are zeroable.
12654static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
12655 unsigned &InsertPSMask,
12656 const APInt &Zeroable,
12657 ArrayRef<int> Mask, SelectionDAG &DAG) {
12658 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
12659 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
12660 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12661
12662 // Attempt to match INSERTPS with one element from VA or VB being
12663 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
12664 // are updated.
12665 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
12666 ArrayRef<int> CandidateMask) {
12667 unsigned ZMask = 0;
12668 int VADstIndex = -1;
12669 int VBDstIndex = -1;
12670 bool VAUsedInPlace = false;
12671
12672 for (int i = 0; i < 4; ++i) {
12673 // Synthesize a zero mask from the zeroable elements (includes undefs).
12674 if (Zeroable[i]) {
12675 ZMask |= 1 << i;
12676 continue;
12677 }
12678
12679 // Flag if we use any VA inputs in place.
12680 if (i == CandidateMask[i]) {
12681 VAUsedInPlace = true;
12682 continue;
12683 }
12684
12685 // We can only insert a single non-zeroable element.
12686 if (VADstIndex >= 0 || VBDstIndex >= 0)
12687 return false;
12688
12689 if (CandidateMask[i] < 4) {
12690 // VA input out of place for insertion.
12691 VADstIndex = i;
12692 } else {
12693 // VB input for insertion.
12694 VBDstIndex = i;
12695 }
12696 }
12697
12698 // Don't bother if we have no (non-zeroable) element for insertion.
12699 if (VADstIndex < 0 && VBDstIndex < 0)
12700 return false;
12701
12702 // Determine element insertion src/dst indices. The src index is from the
12703 // start of the inserted vector, not the start of the concatenated vector.
12704 unsigned VBSrcIndex = 0;
12705 if (VADstIndex >= 0) {
12706 // If we have a VA input out of place, we use VA as the V2 element
12707 // insertion and don't use the original V2 at all.
12708 VBSrcIndex = CandidateMask[VADstIndex];
12709 VBDstIndex = VADstIndex;
12710 VB = VA;
12711 } else {
12712 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
12713 }
12714
12715 // If no V1 inputs are used in place, then the result is created only from
12716 // the zero mask and the V2 insertion - so remove V1 dependency.
12717 if (!VAUsedInPlace)
12718 VA = DAG.getUNDEF(MVT::v4f32);
12719
12720 // Update V1, V2 and InsertPSMask accordingly.
12721 V1 = VA;
12722 V2 = VB;
12723
12724 // Insert the V2 element into the desired position.
12725 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
12726 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
12727 return true;
12728 };
12729
12730 if (matchAsInsertPS(V1, V2, Mask))
12731 return true;
12732
12733 // Commute and try again.
12734 SmallVector<int, 4> CommutedMask(Mask);
12735 ShuffleVectorSDNode::commuteMask(CommutedMask);
12736 if (matchAsInsertPS(V2, V1, CommutedMask))
12737 return true;
12738
12739 return false;
12740}
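// Illustrative example: the v4f32 mask <0,1,6,3> with nothing zeroable matches
// with V2's element 2 inserted into lane 2 of V1, so InsertPSMask becomes
// (2 << 6) | (2 << 4) = 0xA0, i.e. INSERTPS $0xA0.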
12741
12742static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
12743 ArrayRef<int> Mask, const APInt &Zeroable,
12744 SelectionDAG &DAG) {
12745 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12746 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12747
12748 // Attempt to match the insertps pattern.
12749 unsigned InsertPSMask = 0;
12750 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
12751 return SDValue();
12752
12753 // Insert the V2 element into the desired position.
12754 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
12755 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
12756}
12757
12758/// Handle lowering of 2-lane 64-bit floating point shuffles.
12759///
12760/// This is the basis function for the 2-lane 64-bit shuffles as we have full
12761/// support for floating point shuffles but not integer shuffles. These
12762/// instructions will incur a domain crossing penalty on some chips though so
12763/// it is better to avoid lowering through this for integer vectors where
12764/// possible.
12765static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12766 const APInt &Zeroable, SDValue V1, SDValue V2,
12767 const X86Subtarget &Subtarget,
12768 SelectionDAG &DAG) {
12769 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12770 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12771 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12772
12773 if (V2.isUndef()) {
12774 // Check for being able to broadcast a single element.
12775 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
12776 Mask, Subtarget, DAG))
12777 return Broadcast;
12778
12779 // Straight shuffle of a single input vector. Simulate this by using the
12780 // single input as both of the "inputs" to this instruction.
12781 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
12782
12783 if (Subtarget.hasAVX()) {
12784 // If we have AVX, we can use VPERMILPS which will allow folding a load
12785 // into the shuffle.
12786 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
12787 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12788 }
12789
12790 return DAG.getNode(
12791 X86ISD::SHUFP, DL, MVT::v2f64,
12792 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12793 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12794 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12795 }
12796 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12797 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12798 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12799 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12800
12801 if (Subtarget.hasAVX2())
12802 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12803 return Extract;
12804
12805 // When loading a scalar and then shuffling it into a vector we can often do
12806 // the insertion cheaply.
12807 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12808 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12809 return Insertion;
12810 // Try inverting the insertion since for v2 masks it is easy to do and we
12811 // can't reliably sort the mask one way or the other.
12812 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
12813 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
12814 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12815 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12816 return Insertion;
12817
12818 // Try to use one of the special instruction patterns to handle two common
12819 // blend patterns if a zero-blend above didn't work.
12820 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
12821 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
12822 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
12823 // We can either use a special instruction to load over the low double or
12824 // to move just the low double.
12825 return DAG.getNode(
12826 X86ISD::MOVSD, DL, MVT::v2f64, V2,
12827 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
12828
12829 if (Subtarget.hasSSE41())
12830 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
12831 Zeroable, Subtarget, DAG))
12832 return Blend;
12833
12834 // Use dedicated unpack instructions for masks that match their pattern.
12835 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
12836 return V;
12837
12838 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
12839 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
12840 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12841}
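// Illustrative example: for generic (non-extract, non-zeroable) inputs the
// two-source v2f64 mask <1,2> falls through to the final SHUFPD above with
// SHUFPDMask = 1, producing { V1[1], V2[0] }.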
12842
12843/// Handle lowering of 2-lane 64-bit integer shuffles.
12844///
12845/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
12846/// the integer unit to minimize domain crossing penalties. However, for blends
12847/// it falls back to the floating point shuffle operation with appropriate bit
12848/// casting.
12849static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12850 const APInt &Zeroable, SDValue V1, SDValue V2,
12851 const X86Subtarget &Subtarget,
12852 SelectionDAG &DAG) {
12853 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12854 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12855 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12856
12857 if (V2.isUndef()) {
12858 // Check for being able to broadcast a single element.
12859 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
12860 Mask, Subtarget, DAG))
12861 return Broadcast;
12862
12863 // Straight shuffle of a single input vector. For everything from SSE2
12864 // onward this has a single fast instruction with no scary immediates.
12865 // We have to map the mask as it is actually a v4i32 shuffle instruction.
12866 V1 = DAG.getBitcast(MVT::v4i32, V1);
12867 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
12868 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
12869 Mask[1] < 0 ? -1 : (Mask[1] * 2),
12870 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
12871 return DAG.getBitcast(
12872 MVT::v2i64,
12873 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
12874 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
12875 }
12876 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
12877 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
12878 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12879 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12880
12881 if (Subtarget.hasAVX2())
12882 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12883 return Extract;
12884
12885 // Try to use shift instructions.
12886 if (SDValue Shift =
12887 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
12888 DAG, /*BitwiseOnly*/ false))
12889 return Shift;
12890
12891 // When loading a scalar and then shuffling it into a vector we can often do
12892 // the insertion cheaply.
12893 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12894 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12895 return Insertion;
12896 // Try inverting the insertion since for v2 masks it is easy to do and we
12897 // can't reliably sort the mask one way or the other.
12898 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
12899 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12900 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12901 return Insertion;
12902
12903 // We have different paths for blend lowering, but they all must use the
12904 // *exact* same predicate.
12905 bool IsBlendSupported = Subtarget.hasSSE41();
12906 if (IsBlendSupported)
12907 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
12908 Zeroable, Subtarget, DAG))
12909 return Blend;
12910
12911 // Use dedicated unpack instructions for masks that match their pattern.
12912 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
12913 return V;
12914
12915 // Try to use byte rotation instructions.
12916 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
12917 if (Subtarget.hasSSSE3()) {
12918 if (Subtarget.hasVLX())
12919 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
12920 Zeroable, Subtarget, DAG))
12921 return Rotate;
12922
12923 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
12924 Subtarget, DAG))
12925 return Rotate;
12926 }
12927
12928 // If we have direct support for blends, we should lower by decomposing into
12929 // a permute. That will be faster than the domain cross.
12930 if (IsBlendSupported)
12931 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
12932 Subtarget, DAG);
12933
12934 // We implement this with SHUFPD which is pretty lame because it will likely
12935 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
12936 // However, all the alternatives are still more cycles and newer chips don't
12937 // have this problem. It would be really nice if x86 had better shuffles here.
12938 V1 = DAG.getBitcast(MVT::v2f64, V1);
12939 V2 = DAG.getBitcast(MVT::v2f64, V2);
12940 return DAG.getBitcast(MVT::v2i64,
12941 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
12942}
12943
12944/// Lower a vector shuffle using the SHUFPS instruction.
12945///
12946/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
12947/// It makes no assumptions about whether this is the *best* lowering, it simply
12948/// uses it.
12949static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
12950 ArrayRef<int> Mask, SDValue V1,
12951 SDValue V2, SelectionDAG &DAG) {
12952 SDValue LowV = V1, HighV = V2;
12953 SmallVector<int, 4> NewMask(Mask);
12954 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
12955
12956 if (NumV2Elements == 1) {
12957 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
12958
12959 // Compute the index adjacent to V2Index and in the same half by toggling
12960 // the low bit.
12961 int V2AdjIndex = V2Index ^ 1;
12962
12963 if (Mask[V2AdjIndex] < 0) {
12964 // Handles all the cases where we have a single V2 element and an undef.
12965 // This will only ever happen in the high lanes because we commute the
12966 // vector otherwise.
12967 if (V2Index < 2)
12968 std::swap(LowV, HighV);
12969 NewMask[V2Index] -= 4;
12970 } else {
12971 // Handle the case where the V2 element ends up adjacent to a V1 element.
12972 // To make this work, blend them together as the first step.
12973 int V1Index = V2AdjIndex;
12974 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
12975 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
12976 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
12977
12978 // Now proceed to reconstruct the final blend as we have the necessary
12979 // high or low half formed.
12980 if (V2Index < 2) {
12981 LowV = V2;
12982 HighV = V1;
12983 } else {
12984 HighV = V2;
12985 }
12986 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
12987 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
12988 }
12989 } else if (NumV2Elements == 2) {
12990 if (Mask[0] < 4 && Mask[1] < 4) {
12991 // Handle the easy case where we have V1 in the low lanes and V2 in the
12992 // high lanes.
12993 NewMask[2] -= 4;
12994 NewMask[3] -= 4;
12995 } else if (Mask[2] < 4 && Mask[3] < 4) {
12996 // We also handle the reversed case because this utility may get called
12997 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
12998 // arrange things in the right direction.
12999 NewMask[0] -= 4;
13000 NewMask[1] -= 4;
13001 HighV = V1;
13002 LowV = V2;
13003 } else {
13004 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13005 // trying to place elements directly, just blend them and set up the final
13006 // shuffle to place them.
13007
13008 // The first two blend mask elements are for V1, the second two are for
13009 // V2.
13010 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13011 Mask[2] < 4 ? Mask[2] : Mask[3],
13012 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13013 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13014 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13015 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13016
13017 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13018 // a blend.
13019 LowV = HighV = V1;
13020 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13021 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13022 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13023 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13024 }
13025 } else if (NumV2Elements == 3) {
13026 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13027 // we can get here due to other paths (e.g. repeated mask matching) that we
13028 // don't want to do another round of lowerVECTOR_SHUFFLE.
13029 ShuffleVectorSDNode::commuteMask(NewMask);
13030 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13031 }
13032 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13033 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13034}
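As a sanity check on the routine above, the following standalone sketch models plain SHUFPS semantics together with the two-bits-per-lane immediate packing that getV4X86ShuffleImm8ForMask appears to perform. The helper names packShuffleImm8 and shufps are illustrative assumptions, not LLVM APIs.

// Standalone sketch, not part of X86ISelLowering: mirrors SHUFPS semantics
// and the 2-bits-per-lane immediate encoding for a 4-lane mask.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>

// Pack a 4-lane mask (entries 0..3, or -1 for undef) into the 8-bit
// SHUFPS/PSHUFD immediate, two bits per destination lane.
static uint8_t packShuffleImm8(const std::array<int, 4> &Mask) {
  uint8_t Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // undef lanes may pick anything
    assert(M < 4 && "per-lane index must be in [0, 3]");
    Imm |= static_cast<uint8_t>(M << (2 * i));
  }
  return Imm;
}

// SHUFPS takes result lanes 0-1 from the first source and lanes 2-3 from
// the second source, each selected by a 2-bit field of the immediate.
static std::array<float, 4> shufps(const std::array<float, 4> &Lo,
                                   const std::array<float, 4> &Hi,
                                   uint8_t Imm) {
  return {Lo[(Imm >> 0) & 3], Lo[(Imm >> 2) & 3],
          Hi[(Imm >> 4) & 3], Hi[(Imm >> 6) & 3]};
}

int main() {
  std::array<float, 4> V1 = {10, 11, 12, 13}, V2 = {20, 21, 22, 23};
  // The easy NumV2Elements == 2 case: V1 feeds lanes 0-1 and V2 feeds
  // lanes 2-3, so the original mask {0, 2, 5, 7} becomes {0, 2, 1, 3}.
  uint8_t Imm = packShuffleImm8({0, 2, 1, 3});
  std::array<float, 4> R = shufps(V1, V2, Imm);
  std::printf("imm=0x%02x -> %g %g %g %g\n", Imm, R[0], R[1], R[2], R[3]);
  return 0; // prints imm=0xd8 -> 10 12 21 23
}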
13035
13036/// Lower 4-lane 32-bit floating point shuffles.
13037///
13038/// Uses instructions exclusively from the floating point unit to minimize
13039/// domain crossing penalties, as these are sufficient to implement all v4f32
13040/// shuffles.
13041static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13042 const APInt &Zeroable, SDValue V1, SDValue V2,
13043 const X86Subtarget &Subtarget,
13044 SelectionDAG &DAG) {
13045 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13046 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13047 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13048
13049 if (Subtarget.hasSSE41())
13050 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13051 Zeroable, Subtarget, DAG))
13052 return Blend;
13053
13054 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13055
13056 if (NumV2Elements == 0) {
13057 // Check for being able to broadcast a single element.
13058 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13059 Mask, Subtarget, DAG))
13060 return Broadcast;
13061
13062 // Use even/odd duplicate instructions for masks that match their pattern.
13063 if (Subtarget.hasSSE3()) {
13064 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13065 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13066 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13067 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13068 }
13069
13070 if (Subtarget.hasAVX()) {
13071 // If we have AVX, we can use VPERMILPS which will allow folding a load
13072 // into the shuffle.
13073 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13074 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13075 }
13076
13077 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13078 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13079 if (!Subtarget.hasSSE2()) {
13080 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13081 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13082 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13083 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13084 }
13085
13086 // Otherwise, use a straight shuffle of a single input vector. We pass the
13087 // input vector to both operands to simulate this with a SHUFPS.
13088 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13089 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13090 }
13091
13092 if (Subtarget.hasSSE2())
13093 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13094 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13095 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13096 return ZExt;
13097 }
13098
13099 if (Subtarget.hasAVX2())
13100 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13101 return Extract;
13102
13103 // There are special ways we can lower some single-element blends. However, we
13104 // have custom ways we can lower more complex single-element blends below that
13105 // we defer to if both this and BLENDPS fail to match, so restrict this to
13106 // when the V2 input is targeting element 0 of the mask -- that is the fast
13107 // case here.
13108 if (NumV2Elements == 1 && Mask[0] >= 4)
13109 if (SDValue V = lowerShuffleAsElementInsertion(
13110 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13111 return V;
13112
13113 if (Subtarget.hasSSE41()) {
13114 // Use INSERTPS if we can complete the shuffle efficiently.
13115 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13116 return V;
13117
13118 if (!isSingleSHUFPSMask(Mask))
13119 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13120 V2, Mask, DAG))
13121 return BlendPerm;
13122 }
13123
13124 // Use low/high mov instructions. These are only valid in SSE1 because
13125 // otherwise they are widened to v2f64 and never get here.
13126 if (!Subtarget.hasSSE2()) {
13127 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13128 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13129 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13130 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13131 }
13132
13133 // Use dedicated unpack instructions for masks that match their pattern.
13134 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
13135 return V;
13136
13137 // Otherwise fall back to a SHUFPS lowering strategy.
13138 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13139}
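The MOVSLDUP/MOVSHDUP fast paths above hinge on matching the mask against a fixed pattern while letting undef lanes match anything. A loose standalone model of that test (roughly the role isShuffleEquivalent plays in-tree) is sketched below; matchesIgnoringUndef is an illustrative name, not an LLVM API.

// Illustrative sketch only: wildcard-aware pattern matching in the spirit
// of the isShuffleEquivalent checks used for MOVSLDUP/MOVSHDUP above.
#include <array>
#include <cstdio>

static bool matchesIgnoringUndef(const std::array<int, 4> &Mask,
                                 const std::array<int, 4> &Expected) {
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != Expected[i])
      return false; // a defined lane disagrees with the pattern
  return true;      // undef lanes (-1) are free to match anything
}

int main() {
  std::array<int, 4> M = {0, -1, 2, 2};
  std::printf("MOVSLDUP {0,0,2,2}: %d\n",
              matchesIgnoringUndef(M, {0, 0, 2, 2})); // 1
  std::printf("MOVSHDUP {1,1,3,3}: %d\n",
              matchesIgnoringUndef(M, {1, 1, 3, 3})); // 0
  return 0;
}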
13140
13141/// Lower 4-lane i32 vector shuffles.
13142///
13143/// We try to handle these with integer-domain shuffles where we can, but for
13144/// blends we use the floating point domain blend instructions.
13145static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13146 const APInt &Zeroable, SDValue V1, SDValue V2,
13147 const X86Subtarget &Subtarget,
13148 SelectionDAG &DAG) {
13149 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13150 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13151 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13152
13153 // Whenever we can lower this as a zext, that instruction is strictly faster
13154 // than any alternative. It also allows us to fold memory operands into the
13155 // shuffle in many cases.
13156 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13157 Zeroable, Subtarget, DAG))
13158 return ZExt;
13159
13160 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13161
13162 // Try to use shift instructions if fast.
13163 if (Subtarget.preferLowerShuffleAsShift()) {
13164 if (SDValue Shift =
13165 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13166 Subtarget, DAG, /*BitwiseOnly*/ true))
13167 return Shift;
13168 if (NumV2Elements == 0)
13169 if (SDValue Rotate =
13170 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13171 return Rotate;
13172 }
13173
13174 if (NumV2Elements == 0) {
13175 // Try to use broadcast unless the mask only has one non-undef element.
13176 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13177 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13178 Mask, Subtarget, DAG))
13179 return Broadcast;
13180 }
13181
13182 // Straight shuffle of a single input vector. For everything from SSE2
13183 // onward this has a single fast instruction with no scary immediates.
13184 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13185 // but we aren't actually going to use the UNPCK instruction because doing
13186 // so prevents folding a load into this instruction or making a copy.
13187 const int UnpackLoMask[] = {0, 0, 1, 1};
13188 const int UnpackHiMask[] = {2, 2, 3, 3};
13189 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13190 Mask = UnpackLoMask;
13191 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13192 Mask = UnpackHiMask;
13193
13194 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13195 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13196 }
13197
13198 if (Subtarget.hasAVX2())
13199 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13200 return Extract;
13201
13202 // Try to use shift instructions.
13203 if (SDValue Shift =
13204 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13205 DAG, /*BitwiseOnly*/ false))
13206 return Shift;
13207
13208 // There are special ways we can lower some single-element blends.
13209 if (NumV2Elements == 1)
13210 if (SDValue V = lowerShuffleAsElementInsertion(
13211 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13212 return V;
13213
13214 // We have different paths for blend lowering, but they all must use the
13215 // *exact* same predicate.
13216 bool IsBlendSupported = Subtarget.hasSSE41();
13217 if (IsBlendSupported)
13218 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13219 Zeroable, Subtarget, DAG))
13220 return Blend;
13221
13222 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13223 Zeroable, Subtarget, DAG))
13224 return Masked;
13225
13226 // Use dedicated unpack instructions for masks that match their pattern.
13227 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
13228 return V;
13229
13230 // Try to use byte rotation instructions.
13231 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13232 if (Subtarget.hasSSSE3()) {
13233 if (Subtarget.hasVLX())
13234 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13235 Zeroable, Subtarget, DAG))
13236 return Rotate;
13237
13238 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13239 Subtarget, DAG))
13240 return Rotate;
13241 }
13242
13243 // Assume that a single SHUFPS is faster than an alternative sequence of
13244 // multiple instructions (even if the CPU has a domain penalty).
13245 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13246 if (!isSingleSHUFPSMask(Mask)) {
13247 // If we have direct support for blends, we should lower by decomposing into
13248 // a permute. That will be faster than the domain cross.
13249 if (IsBlendSupported)
13250 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13251 Subtarget, DAG);
13252
13253 // Try to lower by permuting the inputs into an unpack instruction.
13254 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13255 Mask, Subtarget, DAG))
13256 return Unpack;
13257 }
13258
13259 // We implement this with SHUFPS because it can blend from two vectors.
13260 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13261 // up the inputs, bypassing domain shift penalties that we would incur if we
13262 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13263 // relevant.
13264 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13265 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13266 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13267 return DAG.getBitcast(MVT::v4i32, ShufPS);
13268}
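For the single-input path above, PSHUFD consumes the same two-bits-per-lane immediate, which is why the UNPCK-compatible masks {0,0,1,1} and {2,2,3,3} can stay as a plain PSHUFD and keep the load-folding and copy-avoidance benefits the comment mentions. A standalone model (illustrative names, not LLVM code):

// Standalone model of PSHUFD: a single-source 4 x i32 permute driven by the
// two-bits-per-lane immediate.
#include <array>
#include <cstdint>
#include <cstdio>

static std::array<unsigned, 4> pshufd(const std::array<unsigned, 4> &V,
                                      uint8_t Imm) {
  std::array<unsigned, 4> R{};
  for (int i = 0; i != 4; ++i)
    R[i] = V[(Imm >> (2 * i)) & 3];
  return R;
}

int main() {
  std::array<unsigned, 4> V = {100, 101, 102, 103};
  // The UNPCK-compatible mask {0, 0, 1, 1} packs to 0b01'01'00'00 == 0x50.
  std::array<unsigned, 4> R = pshufd(V, 0x50);
  std::printf("%u %u %u %u\n", R[0], R[1], R[2], R[3]); // 100 100 101 101
  return 0;
}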
13269
13270/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13271/// shuffle lowering, and the most complex part.
13272///
13273/// The lowering strategy is to try to form pairs of input lanes which are
13274/// targeted at the same half of the final vector, and then use a dword shuffle
13275/// to place them onto the right half, and finally unpack the paired lanes into
13276/// their final position.
13277///
13278/// The exact breakdown of how to form these dword pairs and align them on the
13279/// correct sides is really tricky. See the comments within the function for
13280/// more of the details.
13281///
13282/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13283/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13284/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13285/// vector, form the analogous 128-bit 8-element Mask.
13286static SDValue lowerV8I16GeneralSingleInputShuffle(
13287 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13288 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13289 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13290 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13291
13292 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13293 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13294 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13295
13296 // Attempt to directly match PSHUFLW or PSHUFHW.
13297 if (isUndefOrInRange(LoMask, 0, 4) &&
13298 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13299 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13300 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13301 }
13302 if (isUndefOrInRange(HiMask, 4, 8) &&
13303 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13304 for (int i = 0; i != 4; ++i)
13305 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13306 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13307 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13308 }
13309
13310 SmallVector<int, 4> LoInputs;
13311 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13312 array_pod_sort(LoInputs.begin(), LoInputs.end());
13313 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
13314 SmallVector<int, 4> HiInputs;
13315 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13316 array_pod_sort(HiInputs.begin(), HiInputs.end());
13317 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
13318 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13319 int NumHToL = LoInputs.size() - NumLToL;
13320 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13321 int NumHToH = HiInputs.size() - NumLToH;
13322 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13323 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13324 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13325 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13326
13327 // If we are shuffling values from one half - check how many different DWORD
13328 // pairs we need to create. If only 1 or 2 then we can perform this as a
13329 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13330 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13331 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13332 V = DAG.getNode(ShufWOp, DL, VT, V,
13333 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13334 V = DAG.getBitcast(PSHUFDVT, V);
13335 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13336 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13337 return DAG.getBitcast(VT, V);
13338 };
13339
13340 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13341 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13342 SmallVector<std::pair<int, int>, 4> DWordPairs;
13343 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13344
13345 // Collect the different DWORD pairs.
13346 for (int DWord = 0; DWord != 4; ++DWord) {
13347 int M0 = Mask[2 * DWord + 0];
13348 int M1 = Mask[2 * DWord + 1];
13349 M0 = (M0 >= 0 ? M0 % 4 : M0);
13350 M1 = (M1 >= 0 ? M1 % 4 : M1);
13351 if (M0 < 0 && M1 < 0)
13352 continue;
13353
13354 bool Match = false;
13355 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13356 auto &DWordPair = DWordPairs[j];
13357 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13358 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13359 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13360 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13361 PSHUFDMask[DWord] = DOffset + j;
13362 Match = true;
13363 break;
13364 }
13365 }
13366 if (!Match) {
13367 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13368 DWordPairs.push_back(std::make_pair(M0, M1));
13369 }
13370 }
13371
13372 if (DWordPairs.size() <= 2) {
13373 DWordPairs.resize(2, std::make_pair(-1, -1));
13374 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13375 DWordPairs[1].first, DWordPairs[1].second};
13376 if ((NumHToL + NumHToH) == 0)
13377 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13378 if ((NumLToL + NumLToH) == 0)
13379 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13380 }
13381 }
13382
13383 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13384 // such inputs we can swap two of the dwords across the half mark and end up
13385 // with <=2 inputs to each half in each half. Once there, we can fall through
13386 // to the generic code below. For example:
13387 //
13388 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13389 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13390 //
13391 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13392 // and an existing 2-into-2 on the other half. In this case we may have to
13393 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13394 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13395 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13396 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13397 // half than the one we target for fixing) will be fixed when we re-enter this
13398 // path. We will also combine away any sequence of PSHUFD instructions that
13399 // result into a single instruction. Here is an example of the tricky case:
13400 //
13401 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13402 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13403 //
13404 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13405 //
13406 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13407 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13408 //
13409 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13410 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13411 //
13412 // The result is fine to be handled by the generic logic.
13413 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13414 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13415 int AOffset, int BOffset) {
13416 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13417 "Must call this with A having 3 or 1 inputs from the A half.");
13418 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13419 "Must call this with B having 1 or 3 inputs from the B half.");
13420 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13421 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13422
13423 bool ThreeAInputs = AToAInputs.size() == 3;
13424
13425 // Compute the index of dword with only one word among the three inputs in
13426 // a half by taking the sum of the half with three inputs and subtracting
13427 // the sum of the actual three inputs. The difference is the remaining
13428 // slot.
13429 int ADWord = 0, BDWord = 0;
13430 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13431 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13432 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13433 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13434 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13435 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13436 int TripleNonInputIdx =
13437 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13438 TripleDWord = TripleNonInputIdx / 2;
13439
13440 // We use xor with one to compute the adjacent DWord to whichever one the
13441 // OneInput is in.
13442 OneInputDWord = (OneInput / 2) ^ 1;
13443
13444 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13445 // and BToA inputs. If there is also such a problem with the BToB and AToB
13446 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13447 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13448 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13449 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13450 // Compute how many inputs will be flipped by swapping these DWords. We
13451 // need
13452 // to balance this to ensure we don't form a 3-1 shuffle in the other
13453 // half.
13454 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
13455 llvm::count(AToBInputs, 2 * ADWord + 1);
13456 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
13457 llvm::count(BToBInputs, 2 * BDWord + 1);
13458 if ((NumFlippedAToBInputs == 1 &&
13459 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13460 (NumFlippedBToBInputs == 1 &&
13461 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13462 // We choose whether to fix the A half or B half based on whether that
13463 // half has zero flipped inputs. At zero, we may not be able to fix it
13464 // with that half. We also bias towards fixing the B half because that
13465 // will more commonly be the high half, and we have to bias one way.
13466 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13467 ArrayRef<int> Inputs) {
13468 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13469 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13470 // Determine whether the free index is in the flipped dword or the
13471 // unflipped dword based on where the pinned index is. We use this bit
13472 // in an xor to conditionally select the adjacent dword.
13473 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13474 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13475 if (IsFixIdxInput == IsFixFreeIdxInput)
13476 FixFreeIdx += 1;
13477 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13478 assert(IsFixIdxInput != IsFixFreeIdxInput &&
13479 "We need to be changing the number of flipped inputs!");
13480 int PSHUFHalfMask[] = {0, 1, 2, 3};
13481 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13482 V = DAG.getNode(
13483 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
13484 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
13485 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13486
13487 for (int &M : Mask)
13488 if (M >= 0 && M == FixIdx)
13489 M = FixFreeIdx;
13490 else if (M >= 0 && M == FixFreeIdx)
13491 M = FixIdx;
13492 };
13493 if (NumFlippedBToBInputs != 0) {
13494 int BPinnedIdx =
13495 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13496 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13497 } else {
13498 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13499 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13500 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13501 }
13502 }
13503 }
13504
13505 int PSHUFDMask[] = {0, 1, 2, 3};
13506 PSHUFDMask[ADWord] = BDWord;
13507 PSHUFDMask[BDWord] = ADWord;
13508 V = DAG.getBitcast(
13509 VT,
13510 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13511 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13512
13513 // Adjust the mask to match the new locations of A and B.
13514 for (int &M : Mask)
13515 if (M >= 0 && M/2 == ADWord)
13516 M = 2 * BDWord + M % 2;
13517 else if (M >= 0 && M/2 == BDWord)
13518 M = 2 * ADWord + M % 2;
13519
13520 // Recurse back into this routine to re-compute state now that this isn't
13521 // a 3 and 1 problem.
13522 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
13523 };
13524 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13525 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13526 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13527 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13528
13529 // At this point there are at most two inputs to the low and high halves from
13530 // each half. That means the inputs can always be grouped into dwords and
13531 // those dwords can then be moved to the correct half with a dword shuffle.
13532 // We use at most one low and one high word shuffle to collect these paired
13533 // inputs into dwords, and finally a dword shuffle to place them.
13534 int PSHUFLMask[4] = {-1, -1, -1, -1};
13535 int PSHUFHMask[4] = {-1, -1, -1, -1};
13536 int PSHUFDMask[4] = {-1, -1, -1, -1};
13537
13538 // First fix the masks for all the inputs that are staying in their
13539 // original halves. This will then dictate the targets of the cross-half
13540 // shuffles.
13541 auto fixInPlaceInputs =
13542 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
13543 MutableArrayRef<int> SourceHalfMask,
13544 MutableArrayRef<int> HalfMask, int HalfOffset) {
13545 if (InPlaceInputs.empty())
13546 return;
13547 if (InPlaceInputs.size() == 1) {
13548 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13549 InPlaceInputs[0] - HalfOffset;
13550 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13551 return;
13552 }
13553 if (IncomingInputs.empty()) {
13554 // Just fix all of the in place inputs.
13555 for (int Input : InPlaceInputs) {
13556 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13557 PSHUFDMask[Input / 2] = Input / 2;
13558 }
13559 return;
13560 }
13561
13562 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13563 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13564 InPlaceInputs[0] - HalfOffset;
13565 // Put the second input next to the first so that they are packed into
13566 // a dword. We find the adjacent index by toggling the low bit.
13567 int AdjIndex = InPlaceInputs[0] ^ 1;
13568 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13569 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13570 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13571 };
13572 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13573 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13574
13575 // Now gather the cross-half inputs and place them into a free dword of
13576 // their target half.
13577 // FIXME: This operation could almost certainly be simplified dramatically to
13578 // look more like the 3-1 fixing operation.
13579 auto moveInputsToRightHalf = [&PSHUFDMask](
13580 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
13581 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
13582 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
13583 int DestOffset) {
13584 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13585 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13586 };
13587 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13588 int Word) {
13589 int LowWord = Word & ~1;
13590 int HighWord = Word | 1;
13591 return isWordClobbered(SourceHalfMask, LowWord) ||
13592 isWordClobbered(SourceHalfMask, HighWord);
13593 };
13594
13595 if (IncomingInputs.empty())
13596 return;
13597
13598 if (ExistingInputs.empty()) {
13599 // Map any dwords with inputs from them into the right half.
13600 for (int Input : IncomingInputs) {
13601 // If the source half mask maps over the inputs, turn those into
13602 // swaps and use the swapped lane.
13603 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13604 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13605 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13606 Input - SourceOffset;
13607 // We have to swap the uses in our half mask in one sweep.
13608 for (int &M : HalfMask)
13609 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13610 M = Input;
13611 else if (M == Input)
13612 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13613 } else {
13614 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13615 Input - SourceOffset &&
13616 "Previous placement doesn't match!");
13617 }
13618 // Note that this correctly re-maps both when we do a swap and when
13619 // we observe the other side of the swap above. We rely on that to
13620 // avoid swapping the members of the input list directly.
13621 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13622 }
13623
13624 // Map the input's dword into the correct half.
13625 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13626 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13627 else
13628 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13629 Input / 2 &&
13630 "Previous placement doesn't match!");
13631 }
13632
13633 // And just directly shift any other-half mask elements to be same-half
13634 // as we will have mirrored the dword containing the element into the
13635 // same position within that half.
13636 for (int &M : HalfMask)
13637 if (M >= SourceOffset && M < SourceOffset + 4) {
13638 M = M - SourceOffset + DestOffset;
13639 assert(M >= 0 && "This should never wrap below zero!");
13640 }
13641 return;
13642 }
13643
13644 // Ensure we have the input in a viable dword of its current half. This
13645 // is particularly tricky because the original position may be clobbered
13646 // by inputs being moved and *staying* in that half.
13647 if (IncomingInputs.size() == 1) {
13648 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13649 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13650 SourceOffset;
13651 SourceHalfMask[InputFixed - SourceOffset] =
13652 IncomingInputs[0] - SourceOffset;
13653 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13654 InputFixed);
13655 IncomingInputs[0] = InputFixed;
13656 }
13657 } else if (IncomingInputs.size() == 2) {
13658 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13659 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13660 // We have two non-adjacent or clobbered inputs we need to extract from
13661 // the source half. To do this, we need to map them into some adjacent
13662 // dword slot in the source mask.
13663 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13664 IncomingInputs[1] - SourceOffset};
13665
13666 // If there is a free slot in the source half mask adjacent to one of
13667 // the inputs, place the other input in it. We use (Index XOR 1) to
13668 // compute an adjacent index.
13669 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
13670 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
13671 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
13672 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13673 InputsFixed[1] = InputsFixed[0] ^ 1;
13674 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
13675 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
13676 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
13677 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
13678 InputsFixed[0] = InputsFixed[1] ^ 1;
13679 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
13680 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
13681 // The two inputs are in the same DWord but it is clobbered and the
13682 // adjacent DWord isn't used at all. Move both inputs to the free
13683 // slot.
13684 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
13685 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
13686 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
13687 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
13688 } else {
13689 // The only way we hit this point is if there is no clobbering
13690 // (because there are no off-half inputs to this half) and there is no
13691 // free slot adjacent to one of the inputs. In this case, we have to
13692 // swap an input with a non-input.
13693 for (int i = 0; i < 4; ++i)
13694 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13695 "We can't handle any clobbers here!");
13696 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13697 "Cannot have adjacent inputs here!");
13698
13699 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13700 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
13701
13702 // We also have to update the final source mask in this case because
13703 // it may need to undo the above swap.
13704 for (int &M : FinalSourceHalfMask)
13705 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
13706 M = InputsFixed[1] + SourceOffset;
13707 else if (M == InputsFixed[1] + SourceOffset)
13708 M = (InputsFixed[0] ^ 1) + SourceOffset;
13709
13710 InputsFixed[1] = InputsFixed[0] ^ 1;
13711 }
13712
13713 // Point everything at the fixed inputs.
13714 for (int &M : HalfMask)
13715 if (M == IncomingInputs[0])
13716 M = InputsFixed[0] + SourceOffset;
13717 else if (M == IncomingInputs[1])
13718 M = InputsFixed[1] + SourceOffset;
13719
13720 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
13721 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
13722 }
13723 } else {
13724 llvm_unreachable("Unhandled input size!");
13725 }
13726
13727 // Now hoist the DWord down to the right half.
13728 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
13729 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
13730 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
13731 for (int &M : HalfMask)
13732 for (int Input : IncomingInputs)
13733 if (M == Input)
13734 M = FreeDWord * 2 + Input % 2;
13735 };
13736 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
13737 /*SourceOffset*/ 4, /*DestOffset*/ 0);
13738 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
13739 /*SourceOffset*/ 0, /*DestOffset*/ 4);
13740
13741 // Now enact all the shuffles we've computed to move the inputs into their
13742 // target half.
13743 if (!isNoopShuffleMask(PSHUFLMask))
13744 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13745 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
13746 if (!isNoopShuffleMask(PSHUFHMask))
13747 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13748 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
13749 if (!isNoopShuffleMask(PSHUFDMask))
13750 V = DAG.getBitcast(
13751 VT,
13752 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13753 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13754
13755 // At this point, each half should contain all its inputs, and we can then
13756 // just shuffle them into their final position.
13757 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
13758 "Failed to lift all the high half inputs to the low mask!");
13759 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
13760 "Failed to lift all the low half inputs to the high mask!");
13761
13762 // Do a half shuffle for the low mask.
13763 if (!isNoopShuffleMask(LoMask))
13764 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13765 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13766
13767 // Do a half shuffle with the high mask after shifting its values down.
13768 for (int &M : HiMask)
13769 if (M >= 0)
13770 M -= 4;
13771 if (!isNoopShuffleMask(HiMask))
13772 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13773 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13774
13775 return V;
13776}
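The worked example in the comments above (swap two dwords with PSHUFD[0,2,1,3], then rewrite the mask) can be checked with a small standalone simulation. The helpers below are illustrative only, and ADWord=1/BDWord=2 is simply the pair of dwords that particular PSHUFD swaps.

// Standalone sketch reproducing the Input/Mask example from the comments in
// lowerV8I16GeneralSingleInputShuffle. applyPSHUFD and remapMask are
// illustrative helpers, not LLVM code.
#include <array>
#include <cstdio>

using WordMask = std::array<int, 8>;

// PSHUFD reorders the four 32-bit dwords, i.e. adjacent pairs of words.
static WordMask applyPSHUFD(const WordMask &In, std::array<int, 4> DwordSel) {
  WordMask Out{};
  for (int d = 0; d != 4; ++d) {
    Out[2 * d + 0] = In[2 * DwordSel[d] + 0];
    Out[2 * d + 1] = In[2 * DwordSel[d] + 1];
  }
  return Out;
}

// After dwords ADWord and BDWord swap places, mask entries that referred to
// either dword must be redirected, mirroring the "Adjust the mask to match
// the new locations of A and B" loop in balanceSides.
static WordMask remapMask(WordMask Mask, int ADWord, int BDWord) {
  for (int &M : Mask) {
    if (M < 0)
      continue;
    if (M / 2 == ADWord)
      M = 2 * BDWord + M % 2;
    else if (M / 2 == BDWord)
      M = 2 * ADWord + M % 2;
  }
  return Mask;
}

int main() {
  WordMask Data = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'};
  WordMask Mask = {0, 1, 2, 7, 4, 5, 6, 3}; // the 1-into-3 example above

  WordMask NewData = applyPSHUFD(Data, {0, 2, 1, 3}); // swaps dwords 1 and 2
  WordMask NewMask = remapMask(Mask, /*ADWord=*/1, /*BDWord=*/2);

  for (int C : NewData)
    std::printf("%c ", C); // a b e f c d g h
  std::printf("\n");
  for (int M : NewMask)
    std::printf("%d ", M); // 0 1 4 7 2 3 6 5
  std::printf("\n");
  return 0;
}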
13777
13778/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
13779/// blend if only one input is used.
13780static SDValue lowerShuffleAsBlendOfPSHUFBs(
13781 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13782 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
13784 "Lane crossing shuffle masks not supported");
13785
13786 int NumBytes = VT.getSizeInBits() / 8;
13787 int Size = Mask.size();
13788 int Scale = NumBytes / Size;
13789
13790 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13791 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13792 V1InUse = false;
13793 V2InUse = false;
13794
13795 for (int i = 0; i < NumBytes; ++i) {
13796 int M = Mask[i / Scale];
13797 if (M < 0)
13798 continue;
13799
13800 const int ZeroMask = 0x80;
13801 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
13802 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
13803 if (Zeroable[i / Scale])
13804 V1Idx = V2Idx = ZeroMask;
13805
13806 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
13807 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
13808 V1InUse |= (ZeroMask != V1Idx);
13809 V2InUse |= (ZeroMask != V2Idx);
13810 }
13811
13812 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
13813 if (V1InUse)
13814 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
13815 DAG.getBuildVector(ShufVT, DL, V1Mask));
13816 if (V2InUse)
13817 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
13818 DAG.getBuildVector(ShufVT, DL, V2Mask));
13819
13820 // If we need shuffled inputs from both, blend the two.
13821 SDValue V;
13822 if (V1InUse && V2InUse)
13823 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
13824 else
13825 V = V1InUse ? V1 : V2;
13826
13827 // Cast the result back to the correct type.
13828 return DAG.getBitcast(VT, V);
13829}
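The byte-level trick above is: run each input through a PSHUFB whose 0x80 control bytes zero the lanes the other input will supply, then OR the two results. A standalone model of just that step (hypothetical pshufb helper, toy data):

// Illustrative sketch of the PSHUFB + OR blend; not the LLVM implementation.
#include <array>
#include <cstdint>
#include <cstdio>

using Bytes = std::array<uint8_t, 16>;

// Model of PSHUFB on one 128-bit register: a control byte with bit 7 set
// produces zero, otherwise its low 4 bits index into the source.
static Bytes pshufb(const Bytes &Src, const Bytes &Ctrl) {
  Bytes Out{};
  for (int i = 0; i != 16; ++i)
    Out[i] = (Ctrl[i] & 0x80) ? uint8_t(0) : Src[Ctrl[i] & 0x0F];
  return Out;
}

int main() {
  Bytes V1{}, V2{};
  for (int i = 0; i != 16; ++i) {
    V1[i] = uint8_t(0x10 + i);
    V2[i] = uint8_t(0x20 + i);
  }
  // Interleave the low bytes of V1 and V2: even lanes read V1, odd read V2.
  Bytes V1Ctrl{}, V2Ctrl{};
  for (int i = 0; i != 16; ++i) {
    V1Ctrl[i] = (i % 2 == 0) ? uint8_t(i / 2) : uint8_t(0x80); // 0x80 = zero
    V2Ctrl[i] = (i % 2 == 1) ? uint8_t(i / 2) : uint8_t(0x80);
  }
  Bytes A = pshufb(V1, V1Ctrl), B = pshufb(V2, V2Ctrl), R{};
  for (int i = 0; i != 16; ++i)
    R[i] = A[i] | B[i]; // the ISD::OR blend of the two shuffled inputs
  for (uint8_t Byte : R)
    std::printf("%02x ", Byte); // 10 20 11 21 12 22 ...
  std::printf("\n");
  return 0;
}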
13830
13831/// Generic lowering of 8-lane i16 shuffles.
13832///
13833/// This handles both single-input shuffles and combined shuffle/blends with
13834/// two inputs. The single input shuffles are immediately delegated to
13835/// a dedicated lowering routine.
13836///
13837/// The blends are lowered in one of three fundamental ways. If there are few
13838/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
13839/// of the input is significantly cheaper when lowered as an interleaving of
13840/// the two inputs, try to interleave them. Otherwise, blend the low and high
13841/// halves of the inputs separately (making them have relatively few inputs)
13842/// and then concatenate them.
13843static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13844 const APInt &Zeroable, SDValue V1, SDValue V2,
13845 const X86Subtarget &Subtarget,
13846 SelectionDAG &DAG) {
13847 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13848 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13849 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13850
13851 // Whenever we can lower this as a zext, that instruction is strictly faster
13852 // than any alternative.
13853 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
13854 Zeroable, Subtarget, DAG))
13855 return ZExt;
13856
13857 // Try to lower using a truncation.
13858 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13859 Subtarget, DAG))
13860 return V;
13861
13862 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
13863
13864 if (NumV2Inputs == 0) {
13865 // Try to use shift instructions.
13866 if (SDValue Shift =
13867 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
13868 Subtarget, DAG, /*BitwiseOnly*/ false))
13869 return Shift;
13870
13871 // Check for being able to broadcast a single element.
13872 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
13873 Mask, Subtarget, DAG))
13874 return Broadcast;
13875
13876 // Try to use bit rotation instructions.
13877 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
13878 Subtarget, DAG))
13879 return Rotate;
13880
13881 // Use dedicated unpack instructions for masks that match their pattern.
13882 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13883 return V;
13884
13885 // Use dedicated pack instructions for masks that match their pattern.
13886 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13887 Subtarget))
13888 return V;
13889
13890 // Try to use byte rotation instructions.
13891 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
13892 Subtarget, DAG))
13893 return Rotate;
13894
13895 // Make a copy of the mask so it can be modified.
13896 SmallVector<int, 8> MutableMask(Mask);
13897 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
13898 Subtarget, DAG);
13899 }
13900
13901 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
13902 "All single-input shuffles should be canonicalized to be V1-input "
13903 "shuffles.");
13904
13905 // Try to use shift instructions.
13906 if (SDValue Shift =
13907 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
13908 DAG, /*BitwiseOnly*/ false))
13909 return Shift;
13910
13911 // See if we can use SSE4A Extraction / Insertion.
13912 if (Subtarget.hasSSE4A())
13913 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
13914 Zeroable, DAG))
13915 return V;
13916
13917 // There are special ways we can lower some single-element blends.
13918 if (NumV2Inputs == 1)
13919 if (SDValue V = lowerShuffleAsElementInsertion(
13920 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13921 return V;
13922
13923 // We have different paths for blend lowering, but they all must use the
13924 // *exact* same predicate.
13925 bool IsBlendSupported = Subtarget.hasSSE41();
13926 if (IsBlendSupported)
13927 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
13928 Zeroable, Subtarget, DAG))
13929 return Blend;
13930
13931 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
13932 Zeroable, Subtarget, DAG))
13933 return Masked;
13934
13935 // Use dedicated unpack instructions for masks that match their pattern.
13936 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13937 return V;
13938
13939 // Use dedicated pack instructions for masks that match their pattern.
13940 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13941 Subtarget))
13942 return V;
13943
13944 // Try to lower using a truncation.
13945 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13946 Subtarget, DAG))
13947 return V;
13948
13949 // Try to use byte rotation instructions.
13950 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
13951 Subtarget, DAG))
13952 return Rotate;
13953
13954 if (SDValue BitBlend =
13955 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
13956 return BitBlend;
13957
13958 // Try to use byte shift instructions to mask.
13959 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
13960 Zeroable, Subtarget, DAG))
13961 return V;
13962
13963 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
13964 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
13965 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
13966 !Subtarget.hasVLX()) {
13967 // Check if this is part of a 256-bit vector truncation.
13968 unsigned PackOpc = 0;
13969 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
13970 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13971 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
13972 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
13973 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
13974 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
13975 DAG.getTargetConstant(0xEE, DL, MVT::i8));
13976 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
13977 V1 = extract128BitVector(V1V2, 0, DAG, DL);
13978 V2 = extract128BitVector(V1V2, 4, DAG, DL);
13979 PackOpc = X86ISD::PACKUS;
13980 } else if (Subtarget.hasSSE41()) {
13981 SmallVector<SDValue, 4> DWordClearOps(4,
13982 DAG.getConstant(0, DL, MVT::i32));
13983 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
13984 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
13985 SDValue DWordClearMask =
13986 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
13987 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
13988 DWordClearMask);
13989 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
13990 DWordClearMask);
13991 PackOpc = X86ISD::PACKUS;
13992 } else if (!Subtarget.hasSSSE3()) {
13993 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
13994 V1 = DAG.getBitcast(MVT::v4i32, V1);
13995 V2 = DAG.getBitcast(MVT::v4i32, V2);
13996 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
13997 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
13998 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
13999 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14000 PackOpc = X86ISD::PACKSS;
14001 }
14002 if (PackOpc) {
14003 // Now pack things back together.
14004 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14005 if (NumEvenDrops == 2) {
14006 Result = DAG.getBitcast(MVT::v4i32, Result);
14007 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14008 }
14009 return Result;
14010 }
14011 }
14012
14013 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14014 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14015 if (NumOddDrops == 1) {
14016 bool HasSSE41 = Subtarget.hasSSE41();
14017 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14018 DAG.getBitcast(MVT::v4i32, V1),
14019 DAG.getTargetConstant(16, DL, MVT::i8));
14020 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14021 DAG.getBitcast(MVT::v4i32, V2),
14022 DAG.getTargetConstant(16, DL, MVT::i8));
14023 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14024 MVT::v8i16, V1, V2);
14025 }
14026
14027 // Try to lower by permuting the inputs into an unpack instruction.
14028 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14029 Mask, Subtarget, DAG))
14030 return Unpack;
14031
14032 // If we can't directly blend but can use PSHUFB, that will be better as it
14033 // can both shuffle and set up the inefficient blend.
14034 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14035 bool V1InUse, V2InUse;
14036 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14037 Zeroable, DAG, V1InUse, V2InUse);
14038 }
14039
14040 // We can always bit-blend if we have to, so the fallback strategy is to
14041 // decompose into single-input permutes and blends/unpacks.
14042 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
14043 Mask, Subtarget, DAG);
14044}
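The NumEvenDrops path above relies on PACKUSDW behaving as "take the low word of every dword" once the high halves have been cleared, because unsigned saturation is the identity on values already in [0, 0xFFFF]. The sketch below models just that step; satU16 and packusdw are illustrative names with assumed little-endian lane layout.

// Standalone sketch of the dword-clear + PACKUSDW compaction; not LLVM code.
#include <array>
#include <cstdint>
#include <cstdio>

static uint16_t satU16(int32_t V) {
  if (V < 0)
    return 0;
  if (V > 0xFFFF)
    return 0xFFFF;
  return static_cast<uint16_t>(V);
}

// Model of PACKUSDW: packs two v4i32 inputs into one v8i16 result.
static std::array<uint16_t, 8> packusdw(const std::array<int32_t, 4> &A,
                                        const std::array<int32_t, 4> &B) {
  std::array<uint16_t, 8> R{};
  for (int i = 0; i != 4; ++i) {
    R[i] = satU16(A[i]);
    R[i + 4] = satU16(B[i]);
  }
  return R;
}

int main() {
  // Two v8i16 vectors viewed as v4i32 (little endian): the even words sit
  // in the low half of each dword.
  std::array<int32_t, 4> V1 = {0x00110010, 0x00130012, 0x00150014, 0x00170016};
  std::array<int32_t, 4> V2 = {0x00210020, 0x00230022, 0x00250024, 0x00270026};
  for (int32_t &D : V1)
    D &= 0xFFFF; // the DWordClearMask AND
  for (int32_t &D : V2)
    D &= 0xFFFF;
  std::array<uint16_t, 8> R = packusdw(V1, V2);
  for (uint16_t W : R)
    std::printf("%04x ", W); // 0010 0012 0014 0016 0020 0022 0024 0026
  std::printf("\n");
  return 0;
}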
14045
14046/// Lower 8-lane 16-bit floating point shuffles.
14047static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14048 const APInt &Zeroable, SDValue V1, SDValue V2,
14049 const X86Subtarget &Subtarget,
14050 SelectionDAG &DAG) {
14051 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14052 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14053 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14054 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14055
14056 if (Subtarget.hasFP16()) {
14057 if (NumV2Elements == 0) {
14058 // Check for being able to broadcast a single element.
14059 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14060 Mask, Subtarget, DAG))
14061 return Broadcast;
14062 }
14063 if (NumV2Elements == 1 && Mask[0] >= 8)
14064 if (SDValue V = lowerShuffleAsElementInsertion(
14065 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14066 return V;
14067 }
14068
14069 V1 = DAG.getBitcast(MVT::v8i16, V1);
14070 V2 = DAG.getBitcast(MVT::v8i16, V2);
14071 return DAG.getBitcast(MVT::v8f16,
14072 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14073}
14074
14075// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
14076// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14077// the active subvector is extracted.
14078static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14079 ArrayRef<int> Mask, SDValue V1, SDValue V2,
14080 const X86Subtarget &Subtarget,
14081 SelectionDAG &DAG) {
14082 MVT MaskVT = VT.changeTypeToInteger();
14083 SDValue MaskNode;
14084 MVT ShuffleVT = VT;
14085 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14086 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14087 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14088 ShuffleVT = V1.getSimpleValueType();
14089
14090 // Adjust mask to correct indices for the second input.
14091 int NumElts = VT.getVectorNumElements();
14092 unsigned Scale = 512 / VT.getSizeInBits();
14093 SmallVector<int, 32> AdjustedMask(Mask);
14094 for (int &M : AdjustedMask)
14095 if (NumElts <= M)
14096 M += (Scale - 1) * NumElts;
14097 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14098 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14099 } else {
14100 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14101 }
14102
14103 SDValue Result;
14104 if (V2.isUndef())
14105 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14106 else
14107 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14108
14109 if (VT != ShuffleVT)
14110 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14111
14112 return Result;
14113}
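The mask adjustment above is easier to see with concrete numbers: when a sub-512-bit two-input shuffle is padded to 512 bits, indices that pointed into V2 must skip over the padding appended behind V1. A minimal standalone sketch with assumed element counts:

// Illustrative only: mask fix-up for widening a 256-bit VPERMV3-style
// shuffle to 512 bits, mirroring the AdjustedMask loop above.
#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 8;            // e.g. a v8i32 shuffle
  const unsigned Scale = 512 / 256; // widening a 256-bit op to 512 bits
  std::vector<int> Mask = {0, 9, 2, 11, 4, 13, 6, 15}; // 8..15 reads V2

  // In the narrow shuffle, V2 starts at index NumElts; after widening, V1
  // occupies Scale * NumElts lanes, so V2's indices shift up accordingly.
  for (int &M : Mask)
    if (M >= NumElts)
      M += (Scale - 1) * NumElts;

  for (int M : Mask)
    std::printf("%d ", M); // 0 17 2 19 4 21 6 23
  std::printf("\n");
  return 0;
}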
14114
14115/// Generic lowering of v16i8 shuffles.
14116///
14117/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14118/// detect any complexity reducing interleaving. If that doesn't help, it uses
14119/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14120/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14121/// back together.
14122static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14123 const APInt &Zeroable, SDValue V1, SDValue V2,
14124 const X86Subtarget &Subtarget,
14125 SelectionDAG &DAG) {
14126 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14127 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14128 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14129
14130 // Try to use shift instructions.
14131 if (SDValue Shift =
14132 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14133 DAG, /*BitwiseOnly*/ false))
14134 return Shift;
14135
14136 // Try to use byte rotation instructions.
14137 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14138 Subtarget, DAG))
14139 return Rotate;
14140
14141 // Use dedicated pack instructions for masks that match their pattern.
14142 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
14143 Subtarget))
14144 return V;
14145
14146 // Try to use a zext lowering.
14147 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14148 Zeroable, Subtarget, DAG))
14149 return ZExt;
14150
14151 // Try to lower using a truncation.
14152 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14153 Subtarget, DAG))
14154 return V;
14155
14156 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14157 Subtarget, DAG))
14158 return V;
14159
14160 // See if we can use SSE4A Extraction / Insertion.
14161 if (Subtarget.hasSSE4A())
14162 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14163 Zeroable, DAG))
14164 return V;
14165
14166 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14167
14168 // For single-input shuffles, there are some nicer lowering tricks we can use.
14169 if (NumV2Elements == 0) {
14170 // Check for being able to broadcast a single element.
14171 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14172 Mask, Subtarget, DAG))
14173 return Broadcast;
14174
14175 // Try to use bit rotation instructions.
14176 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14177 Subtarget, DAG))
14178 return Rotate;
14179
14180 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14181 return V;
14182
14183 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14184 // Notably, this handles splat and partial-splat shuffles more efficiently.
14185 // However, it only makes sense if the pre-duplication shuffle simplifies
14186 // things significantly. Currently, this means we need to be able to
14187 // express the pre-duplication shuffle as an i16 shuffle.
14188 //
14189 // FIXME: We should check for other patterns which can be widened into an
14190 // i16 shuffle as well.
14191 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14192 for (int i = 0; i < 16; i += 2)
14193 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14194 return false;
14195
14196 return true;
14197 };
14198 auto tryToWidenViaDuplication = [&]() -> SDValue {
14199 if (!canWidenViaDuplication(Mask))
14200 return SDValue();
14201 SmallVector<int, 4> LoInputs;
14202 copy_if(Mask, std::back_inserter(LoInputs),
14203 [](int M) { return M >= 0 && M < 8; });
14204 array_pod_sort(LoInputs.begin(), LoInputs.end());
14205 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
14206 LoInputs.end());
14207 SmallVector<int, 4> HiInputs;
14208 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14209 array_pod_sort(HiInputs.begin(), HiInputs.end());
14210 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
14211 HiInputs.end());
14212
14213 bool TargetLo = LoInputs.size() >= HiInputs.size();
14214 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14215 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14216
14217 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14218 SmallDenseMap<int, int, 8> LaneMap;
14219 for (int I : InPlaceInputs) {
14220 PreDupI16Shuffle[I/2] = I/2;
14221 LaneMap[I] = I;
14222 }
14223 int j = TargetLo ? 0 : 4, je = j + 4;
14224 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14225 // Check if j is already a shuffle of this input. This happens when
14226 // there are two adjacent bytes after we move the low one.
14227 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14228 // If we haven't yet mapped the input, search for a slot into which
14229 // we can map it.
14230 while (j < je && PreDupI16Shuffle[j] >= 0)
14231 ++j;
14232
14233 if (j == je)
14234 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14235 return SDValue();
14236
14237 // Map this input with the i16 shuffle.
14238 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14239 }
14240
14241 // Update the lane map based on the mapping we ended up with.
14242 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14243 }
14244 V1 = DAG.getBitcast(
14245 MVT::v16i8,
14246 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14247 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14248
14249 // Unpack the bytes to form the i16s that will be shuffled into place.
14250 bool EvenInUse = false, OddInUse = false;
14251 for (int i = 0; i < 16; i += 2) {
14252 EvenInUse |= (Mask[i + 0] >= 0);
14253 OddInUse |= (Mask[i + 1] >= 0);
14254 if (EvenInUse && OddInUse)
14255 break;
14256 }
14257 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14258 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14259 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14260
14261 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14262 for (int i = 0; i < 16; ++i)
14263 if (Mask[i] >= 0) {
14264 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14265 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14266 if (PostDupI16Shuffle[i / 2] < 0)
14267 PostDupI16Shuffle[i / 2] = MappedMask;
14268 else
14269 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14270 "Conflicting entries in the original shuffle!");
14271 }
14272 return DAG.getBitcast(
14273 MVT::v16i8,
14274 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14275 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14276 };
14277 if (SDValue V = tryToWidenViaDuplication())
14278 return V;
14279 }
14280
14281 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14282 Zeroable, Subtarget, DAG))
14283 return Masked;
14284
14285 // Use dedicated unpack instructions for masks that match their pattern.
14286 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14287 return V;
14288
14289 // Try to use byte shift instructions to mask.
14290 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14291 Zeroable, Subtarget, DAG))
14292 return V;
14293
14294 // Check for compaction patterns.
14295 bool IsSingleInput = V2.isUndef();
14296 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14297
14298 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14299 // with PSHUFB. It is important to do this before we attempt to generate any
14300 // blends but after all of the single-input lowerings. If the single input
14301 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14302 // want to preserve that and we can DAG combine any longer sequences into
14303 // a PSHUFB in the end. But once we start blending from multiple inputs,
14304 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14305 // and there are *very* few patterns that would actually be faster than the
14306 // PSHUFB approach because of its ability to zero lanes.
14307 //
14308 // If the mask is a binary compaction, we can more efficiently perform this
14309 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14310 //
14311 // FIXME: The only exceptions to the above are blends which are exact
14312 // interleavings with direct instructions supporting them. We currently don't
14313 // handle those well here.
14314 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14315 bool V1InUse = false;
14316 bool V2InUse = false;
14317
14318 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14319 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14320
14321 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14322 // do so. This avoids using them to handle blends-with-zero which is
14323 // important as a single pshufb is significantly faster for that.
14324 if (V1InUse && V2InUse) {
14325 if (Subtarget.hasSSE41())
14326 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14327 Zeroable, Subtarget, DAG))
14328 return Blend;
14329
14330 // We can use an unpack to do the blending rather than an or in some
14331       // cases. Even though the OR may be (very slightly) more efficient, we
14332       // prefer this lowering because there are common cases where part of
14333 // the complexity of the shuffles goes away when we do the final blend as
14334 // an unpack.
14335 // FIXME: It might be worth trying to detect if the unpack-feeding
14336 // shuffles will both be pshufb, in which case we shouldn't bother with
14337 // this.
14338       if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14339               DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14340 return Unpack;
14341
14342 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14343 if (Subtarget.hasVBMI())
14344 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14345 DAG);
14346
14347 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14348 if (Subtarget.hasXOP()) {
14349 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14350 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14351 }
14352
14353 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14354 // PALIGNR will be cheaper than the second PSHUFB+OR.
14355       if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14356               DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14357 return V;
14358 }
14359
14360 return PSHUFB;
14361 }
14362
14363 // There are special ways we can lower some single-element blends.
14364 if (NumV2Elements == 1)
14365     if (SDValue V = lowerShuffleAsElementInsertion(
14366             DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14367 return V;
14368
14369 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14370 return Blend;
14371
14372 // Check whether a compaction lowering can be done. This handles shuffles
14373 // which take every Nth element for some even N. See the helper function for
14374 // details.
14375 //
14376 // We special case these as they can be particularly efficiently handled with
14377   // the PACKUSWB instruction on x86 and they show up in common patterns of
14378 // rearranging bytes to truncate wide elements.
14379 if (NumEvenDrops) {
14380 // NumEvenDrops is the power of two stride of the elements. Another way of
14381 // thinking about it is that we need to drop the even elements this many
14382 // times to get the original input.
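    // For example, with NumEvenDrops == 1 we keep the low byte of every i16
    // word (AND with 0x00FF) and a single PACKUS packs the surviving bytes of
    // V1 and V2 into the 16 result bytes; with NumEvenDrops == 2 only every
    // other word keeps its low byte and the PACKUS step is applied twice.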
14383
14384 // First we need to zero all the dropped bytes.
14385 assert(NumEvenDrops <= 3 &&
14386 "No support for dropping even elements more than 3 times.");
14387 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14388 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14389 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14390 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14391 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14392 WordClearMask);
14393 if (!IsSingleInput)
14394 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14395 WordClearMask);
14396
14397 // Now pack things back together.
14398 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14399 IsSingleInput ? V1 : V2);
14400 for (int i = 1; i < NumEvenDrops; ++i) {
14401 Result = DAG.getBitcast(MVT::v8i16, Result);
14402 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14403 }
14404 return Result;
14405 }
14406
14407 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14408 if (NumOddDrops == 1) {
14409 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14410 DAG.getBitcast(MVT::v8i16, V1),
14411 DAG.getTargetConstant(8, DL, MVT::i8));
14412 if (!IsSingleInput)
14413 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14414 DAG.getBitcast(MVT::v8i16, V2),
14415 DAG.getTargetConstant(8, DL, MVT::i8));
14416 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14417 IsSingleInput ? V1 : V2);
14418 }
14419
14420 // Handle multi-input cases by blending/unpacking single-input shuffles.
14421 if (NumV2Elements > 0)
14422 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
14423 Subtarget, DAG);
14424
14425 // The fallback path for single-input shuffles widens this into two v8i16
14426 // vectors with unpacks, shuffles those, and then pulls them back together
14427 // with a pack.
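  // For example, if the mask only references even bytes, the odd bytes can be
  // masked off and both halves shuffled directly from VLoHalf; otherwise the
  // zero-unpacks below spread the bytes into i16 lanes first.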
14428 SDValue V = V1;
14429
14430 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14431 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14432 for (int i = 0; i < 16; ++i)
14433 if (Mask[i] >= 0)
14434 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14435
14436 SDValue VLoHalf, VHiHalf;
14437 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14438 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14439 // i16s.
14440 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14441 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14442 // Use a mask to drop the high bytes.
14443 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14444 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14445 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14446
14447 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14448 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14449
14450 // Squash the masks to point directly into VLoHalf.
14451 for (int &M : LoBlendMask)
14452 if (M >= 0)
14453 M /= 2;
14454 for (int &M : HiBlendMask)
14455 if (M >= 0)
14456 M /= 2;
14457 } else {
14458 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14459 // VHiHalf so that we can blend them as i16s.
14460 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14461
14462 VLoHalf = DAG.getBitcast(
14463 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14464 VHiHalf = DAG.getBitcast(
14465 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14466 }
14467
14468 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14469 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14470
14471 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14472}
14473
14474/// Dispatching routine to lower various 128-bit x86 vector shuffles.
14475///
14476/// This routine breaks down the specific type of 128-bit shuffle and
14477/// dispatches to the lowering routines accordingly.
14478 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14479                                   MVT VT, SDValue V1, SDValue V2,
14480 const APInt &Zeroable,
14481 const X86Subtarget &Subtarget,
14482 SelectionDAG &DAG) {
14483 if (VT == MVT::v8bf16) {
14484 V1 = DAG.getBitcast(MVT::v8i16, V1);
14485 V2 = DAG.getBitcast(MVT::v8i16, V2);
14486 return DAG.getBitcast(VT,
14487 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14488 }
14489
14490 switch (VT.SimpleTy) {
14491 case MVT::v2i64:
14492 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14493 case MVT::v2f64:
14494 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14495 case MVT::v4i32:
14496 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14497 case MVT::v4f32:
14498 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14499 case MVT::v8i16:
14500 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14501 case MVT::v8f16:
14502 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14503 case MVT::v16i8:
14504 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14505
14506 default:
14507 llvm_unreachable("Unimplemented!");
14508 }
14509}
14510
14511/// Generic routine to split vector shuffle into half-sized shuffles.
14512///
14513/// This routine just extracts two subvectors, shuffles them independently, and
14514/// then concatenates them back together. This should work effectively with all
14515/// AVX vector shuffle types.
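/// For example, a v8f32 shuffle becomes two v4f32 shuffles, each built from at
/// most a 4-way blend of the lower/upper halves of the inputs, rejoined with a
/// CONCAT_VECTORS.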
14516 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14517                                     SDValue V2, ArrayRef<int> Mask,
14518 SelectionDAG &DAG, bool SimpleOnly) {
14519 assert(VT.getSizeInBits() >= 256 &&
14520 "Only for 256-bit or wider vector shuffles!");
14521 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14522 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14523
14524 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14525 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14526
14527 int NumElements = VT.getVectorNumElements();
14528 int SplitNumElements = NumElements / 2;
14529 MVT ScalarVT = VT.getVectorElementType();
14530 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
14531
14532 // Use splitVector/extractSubVector so that split build-vectors just build two
14533 // narrower build vectors. This helps shuffling with splats and zeros.
14534 auto SplitVector = [&](SDValue V) {
14535 SDValue LoV, HiV;
14536 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
14537 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14538 DAG.getBitcast(SplitVT, HiV));
14539 };
14540
14541 SDValue LoV1, HiV1, LoV2, HiV2;
14542 std::tie(LoV1, HiV1) = SplitVector(V1);
14543 std::tie(LoV2, HiV2) = SplitVector(V2);
14544
14545 // Now create two 4-way blends of these half-width vectors.
14546 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
14547 bool &UseHiV1, bool &UseLoV2,
14548 bool &UseHiV2) {
14549 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
14550 for (int i = 0; i < SplitNumElements; ++i) {
14551 int M = HalfMask[i];
14552 if (M >= NumElements) {
14553 if (M >= NumElements + SplitNumElements)
14554 UseHiV2 = true;
14555 else
14556 UseLoV2 = true;
14557 } else if (M >= 0) {
14558 if (M >= SplitNumElements)
14559 UseHiV1 = true;
14560 else
14561 UseLoV1 = true;
14562 }
14563 }
14564 };
14565
14566 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14567 if (!SimpleOnly)
14568 return true;
14569
14570 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14571 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14572
14573 return !(UseHiV1 || UseHiV2);
14574 };
14575
14576 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14577 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14578 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14579 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14580 for (int i = 0; i < SplitNumElements; ++i) {
14581 int M = HalfMask[i];
14582 if (M >= NumElements) {
14583 V2BlendMask[i] = M - NumElements;
14584 BlendMask[i] = SplitNumElements + i;
14585 } else if (M >= 0) {
14586 V1BlendMask[i] = M;
14587 BlendMask[i] = i;
14588 }
14589 }
14590
14591 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14592 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14593
14594 // Because the lowering happens after all combining takes place, we need to
14595 // manually combine these blend masks as much as possible so that we create
14596 // a minimal number of high-level vector shuffle nodes.
14597 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
14598
14599 // First try just blending the halves of V1 or V2.
14600 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14601 return DAG.getUNDEF(SplitVT);
14602 if (!UseLoV2 && !UseHiV2)
14603 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14604 if (!UseLoV1 && !UseHiV1)
14605 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14606
14607 SDValue V1Blend, V2Blend;
14608 if (UseLoV1 && UseHiV1) {
14609 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14610 } else {
14611 // We only use half of V1 so map the usage down into the final blend mask.
14612 V1Blend = UseLoV1 ? LoV1 : HiV1;
14613 for (int i = 0; i < SplitNumElements; ++i)
14614 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14615 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14616 }
14617 if (UseLoV2 && UseHiV2) {
14618 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14619 } else {
14620 // We only use half of V2 so map the usage down into the final blend mask.
14621 V2Blend = UseLoV2 ? LoV2 : HiV2;
14622 for (int i = 0; i < SplitNumElements; ++i)
14623 if (BlendMask[i] >= SplitNumElements)
14624 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
14625 }
14626 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
14627 };
14628
14629 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
14630 return SDValue();
14631
14632 SDValue Lo = HalfBlend(LoMask);
14633 SDValue Hi = HalfBlend(HiMask);
14634 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14635}
14636
14637/// Either split a vector in halves or decompose the shuffles and the
14638/// blend/unpack.
14639///
14640/// This is provided as a good fallback for many lowerings of non-single-input
14641/// shuffles with more than one 128-bit lane. In those cases, we want to select
14642/// between splitting the shuffle into 128-bit components and stitching those
14643/// back together vs. extracting the single-input shuffles and blending those
14644/// results.
14645 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14646                                           SDValue V2, ArrayRef<int> Mask,
14647 const X86Subtarget &Subtarget,
14648 SelectionDAG &DAG) {
14649 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14650 "shuffles as it could then recurse on itself.");
14651 int Size = Mask.size();
14652
14653 // If this can be modeled as a broadcast of two elements followed by a blend,
14654 // prefer that lowering. This is especially important because broadcasts can
14655 // often fold with memory operands.
14656 auto DoBothBroadcast = [&] {
14657 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14658 for (int M : Mask)
14659 if (M >= Size) {
14660 if (V2BroadcastIdx < 0)
14661 V2BroadcastIdx = M - Size;
14662 else if (M - Size != V2BroadcastIdx)
14663 return false;
14664 } else if (M >= 0) {
14665 if (V1BroadcastIdx < 0)
14666 V1BroadcastIdx = M;
14667 else if (M != V1BroadcastIdx)
14668 return false;
14669 }
14670 return true;
14671 };
14672 if (DoBothBroadcast())
14673 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14674 DAG);
14675
14676 // If the inputs all stem from a single 128-bit lane of each input, then we
14677 // split them rather than blending because the split will decompose to
14678 // unusually few instructions.
14679 int LaneCount = VT.getSizeInBits() / 128;
14680 int LaneSize = Size / LaneCount;
14681 SmallBitVector LaneInputs[2];
14682 LaneInputs[0].resize(LaneCount, false);
14683 LaneInputs[1].resize(LaneCount, false);
14684 for (int i = 0; i < Size; ++i)
14685 if (Mask[i] >= 0)
14686 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
14687 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
14688 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14689 /*SimpleOnly*/ false);
14690
14691 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
14692 // requires that the decomposed single-input shuffles don't end up here.
14693 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14694 DAG);
14695}
14696
14697// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14698// TODO: Extend to support v8f32 (+ 512-bit shuffles).
14699 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
14700                                                  SDValue V1, SDValue V2,
14701 ArrayRef<int> Mask,
14702 SelectionDAG &DAG) {
14703 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
14704
14705 int LHSMask[4] = {-1, -1, -1, -1};
14706 int RHSMask[4] = {-1, -1, -1, -1};
14707 unsigned SHUFPMask = 0;
14708
14709 // As SHUFPD uses a single LHS/RHS element per lane, we can always
14710 // perform the shuffle once the lanes have been shuffled in place.
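  // For example, Mask = <1,5,2,6> builds LHSMask = <-1,1,2,-1> and
  // RHSMask = <-1,5,6,-1> to move each element into the correct 128-bit lane,
  // and SHUFPMask = 0b0011 then selects the odd element for result slots 0/1
  // and the even element for slots 2/3.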
14711 for (int i = 0; i != 4; ++i) {
14712 int M = Mask[i];
14713 if (M < 0)
14714 continue;
14715 int LaneBase = i & ~1;
14716 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
14717 LaneMask[LaneBase + (M & 1)] = M;
14718 SHUFPMask |= (M & 1) << i;
14719 }
14720
14721 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
14722 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
14723 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
14724 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
14725}
14726
14727/// Lower a vector shuffle crossing multiple 128-bit lanes as
14728/// a lane permutation followed by a per-lane permutation.
14729///
14730/// This is mainly for cases where we can have non-repeating permutes
14731/// in each lane.
14732///
14733/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
14734/// we should investigate merging them.
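/// For example, a v8f32 reversal mask <7,6,5,4,3,2,1,0> becomes a cross-lane
/// permute that swaps the two 128-bit lanes followed by an in-lane permute
/// that reverses the elements within each lane.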
14735 static SDValue lowerShuffleAsLanePermuteAndPermute(
14736     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14737 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14738 int NumElts = VT.getVectorNumElements();
14739 int NumLanes = VT.getSizeInBits() / 128;
14740 int NumEltsPerLane = NumElts / NumLanes;
14741 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
14742
14743 /// Attempts to find a sublane permute with the given size
14744 /// that gets all elements into their target lanes.
14745 ///
14746 /// If successful, fills CrossLaneMask and InLaneMask and returns true.
14747 /// If unsuccessful, returns false and may overwrite InLaneMask.
14748 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
14749 int NumSublanesPerLane = NumSublanes / NumLanes;
14750 int NumEltsPerSublane = NumElts / NumSublanes;
14751
14752 SmallVector<int, 16> CrossLaneMask;
14753 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
14754 // CrossLaneMask but one entry == one sublane.
14755 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
14756
14757 for (int i = 0; i != NumElts; ++i) {
14758 int M = Mask[i];
14759 if (M < 0)
14760 continue;
14761
14762 int SrcSublane = M / NumEltsPerSublane;
14763 int DstLane = i / NumEltsPerLane;
14764
14765 // We only need to get the elements into the right lane, not sublane.
14766 // So search all sublanes that make up the destination lane.
14767 bool Found = false;
14768 int DstSubStart = DstLane * NumSublanesPerLane;
14769 int DstSubEnd = DstSubStart + NumSublanesPerLane;
14770 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
14771 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
14772 continue;
14773
14774 Found = true;
14775 CrossLaneMaskLarge[DstSublane] = SrcSublane;
14776 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
14777 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
14778 break;
14779 }
14780 if (!Found)
14781 return SDValue();
14782 }
14783
14784 // Fill CrossLaneMask using CrossLaneMaskLarge.
14785 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
14786
14787 if (!CanUseSublanes) {
14788 // If we're only shuffling a single lowest lane and the rest are identity
14789 // then don't bother.
14790 // TODO - isShuffleMaskInputInPlace could be extended to something like
14791 // this.
14792 int NumIdentityLanes = 0;
14793 bool OnlyShuffleLowestLane = true;
14794 for (int i = 0; i != NumLanes; ++i) {
14795 int LaneOffset = i * NumEltsPerLane;
14796 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
14797 i * NumEltsPerLane))
14798 NumIdentityLanes++;
14799 else if (CrossLaneMask[LaneOffset] != 0)
14800 OnlyShuffleLowestLane = false;
14801 }
14802 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
14803 return SDValue();
14804 }
14805
14806 // Avoid returning the same shuffle operation. For example,
14807 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
14808 // undef:v16i16
14809 if (CrossLaneMask == Mask || InLaneMask == Mask)
14810 return SDValue();
14811
14812 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
14813 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
14814 InLaneMask);
14815 };
14816
14817 // First attempt a solution with full lanes.
14818 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
14819 return V;
14820
14821 // The rest of the solutions use sublanes.
14822 if (!CanUseSublanes)
14823 return SDValue();
14824
14825 // Then attempt a solution with 64-bit sublanes (vpermq).
14826 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
14827 return V;
14828
14829 // If that doesn't work and we have fast variable cross-lane shuffle,
14830 // attempt 32-bit sublanes (vpermd).
14831 if (!Subtarget.hasFastVariableCrossLaneShuffle())
14832 return SDValue();
14833
14834 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
14835}
14836
14837 /// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
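/// Elements that would cross a lane keep their offset within the lane but are
/// redirected to the second operand, e.g. a v4f64 mask <2,3,0,1> with
/// LaneSize 2 becomes <4,5,6,7>, so the caller can supply a lane-swapped copy
/// of the input as that operand.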
14838static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
14839 SmallVector<int> &InLaneMask) {
14840 int Size = Mask.size();
14841 InLaneMask.assign(Mask.begin(), Mask.end());
14842 for (int i = 0; i < Size; ++i) {
14843 int &M = InLaneMask[i];
14844 if (M < 0)
14845 continue;
14846 if (((M % Size) / LaneSize) != (i / LaneSize))
14847 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
14848 }
14849}
14850
14851/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
14852/// source with a lane permutation.
14853///
14854/// This lowering strategy results in four instructions in the worst case for a
14855/// single-input cross lane shuffle which is lower than any other fully general
14856/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
14857/// shuffle pattern should be handled prior to trying this lowering.
14858 static SDValue lowerShuffleAsLanePermuteAndShuffle(
14859     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14860 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14861 // FIXME: This should probably be generalized for 512-bit vectors as well.
14862 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
14863 int Size = Mask.size();
14864 int LaneSize = Size / 2;
14865
14866 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14867 // Only do this if the elements aren't all from the lower lane,
14868 // otherwise we're (probably) better off doing a split.
14869 if (VT == MVT::v4f64 &&
14870 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
14871 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
14872
14873 // If there are only inputs from one 128-bit lane, splitting will in fact be
14874 // less expensive. The flags track whether the given lane contains an element
14875 // that crosses to another lane.
14876 bool AllLanes;
14877 if (!Subtarget.hasAVX2()) {
14878 bool LaneCrossing[2] = {false, false};
14879 for (int i = 0; i < Size; ++i)
14880 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
14881 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
14882 AllLanes = LaneCrossing[0] && LaneCrossing[1];
14883 } else {
14884 bool LaneUsed[2] = {false, false};
14885 for (int i = 0; i < Size; ++i)
14886 if (Mask[i] >= 0)
14887 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
14888 AllLanes = LaneUsed[0] && LaneUsed[1];
14889 }
14890
14891 // TODO - we could support shuffling V2 in the Flipped input.
14892 assert(V2.isUndef() &&
14893          "The last part of this routine only works on single-input shuffles");
14894
14895 SmallVector<int> InLaneMask;
14896 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
14897
14898 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
14899 "In-lane shuffle mask expected");
14900
14901 // If we're not using both lanes in each lane and the inlane mask is not
14902 // repeating, then we're better off splitting.
14903 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
14904 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14905 /*SimpleOnly*/ false);
14906
14907 // Flip the lanes, and shuffle the results which should now be in-lane.
14908 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
14909 SDValue Flipped = DAG.getBitcast(PVT, V1);
14910 Flipped =
14911 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
14912 Flipped = DAG.getBitcast(VT, Flipped);
14913 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
14914}
14915
14916/// Handle lowering 2-lane 128-bit shuffles.
14917 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
14918                                   SDValue V2, ArrayRef<int> Mask,
14919 const APInt &Zeroable,
14920 const X86Subtarget &Subtarget,
14921 SelectionDAG &DAG) {
14922 if (V2.isUndef()) {
14923 // Attempt to match VBROADCAST*128 subvector broadcast load.
14924 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
14925 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
14926 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
14927         X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
14928       MVT MemVT = VT.getHalfNumVectorElementsVT();
14929 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
14930 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
14931       if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
14932                                              VT, MemVT, Ld, Ofs, DAG))
14933 return BcstLd;
14934 }
14935
14936 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
14937 if (Subtarget.hasAVX2())
14938 return SDValue();
14939 }
14940
14941 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
14942
14943 SmallVector<int, 4> WidenedMask;
14944 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
14945 return SDValue();
14946
14947 bool IsLowZero = (Zeroable & 0x3) == 0x3;
14948 bool IsHighZero = (Zeroable & 0xc) == 0xc;
14949
14950 // Try to use an insert into a zero vector.
14951 if (WidenedMask[0] == 0 && IsHighZero) {
14952 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14953 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
14954 DAG.getIntPtrConstant(0, DL));
14955 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14956 getZeroVector(VT, Subtarget, DAG, DL), LoV,
14957 DAG.getIntPtrConstant(0, DL));
14958 }
14959
14960 // TODO: If minimizing size and one of the inputs is a zero vector and the
14961   // zero vector has only one use, we could use a VPERM2X128 to save the
14962 // instruction bytes needed to explicitly generate the zero vector.
14963
14964 // Blends are faster and handle all the non-lane-crossing cases.
14965 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
14966 Subtarget, DAG))
14967 return Blend;
14968
14969 // If either input operand is a zero vector, use VPERM2X128 because its mask
14970 // allows us to replace the zero input with an implicit zero.
14971 if (!IsLowZero && !IsHighZero) {
14972 // Check for patterns which can be matched with a single insert of a 128-bit
14973 // subvector.
14974 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
14975 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
14976
14977 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
14978 // this will likely become vinsertf128 which can't fold a 256-bit memop.
14979 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
14980 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14981 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
14982 OnlyUsesV1 ? V1 : V2,
14983 DAG.getIntPtrConstant(0, DL));
14984 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
14985 DAG.getIntPtrConstant(2, DL));
14986 }
14987 }
14988
14989 // Try to use SHUF128 if possible.
14990 if (Subtarget.hasVLX()) {
14991 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
14992 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
14993 ((WidenedMask[1] % 2) << 1);
14994 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
14995 DAG.getTargetConstant(PermMask, DL, MVT::i8));
14996 }
14997 }
14998 }
14999
15000 // Otherwise form a 128-bit permutation. After accounting for undefs,
15001 // convert the 64-bit shuffle mask selection values into 128-bit
15002 // selection bits by dividing the indexes by 2 and shifting into positions
15003 // defined by a vperm2*128 instruction's immediate control byte.
15004
15005 // The immediate permute control byte looks like this:
15006 // [1:0] - select 128 bits from sources for low half of destination
15007 // [2] - ignore
15008 // [3] - zero low half of destination
15009 // [5:4] - select 128 bits from sources for high half of destination
15010 // [6] - ignore
15011 // [7] - zero high half of destination
15012
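  // For example, a widened mask of <1, 2> (original mask <2, 3, 4, 5>) gives
  // PermMask = 0x21: the low half of the result takes V1's upper 128 bits and
  // the high half takes V2's lower 128 bits.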
15013 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15014 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15015
15016 unsigned PermMask = 0;
15017 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15018 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15019
15020 // Check the immediate mask and replace unused sources with undef.
15021 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15022 V1 = DAG.getUNDEF(VT);
15023 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15024 V2 = DAG.getUNDEF(VT);
15025
15026 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15027 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15028}
15029
15030/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15031/// shuffling each lane.
15032///
15033/// This attempts to create a repeated lane shuffle where each lane uses one
15034/// or two of the lanes of the inputs. The lanes of the input vectors are
15035/// shuffled in one or two independent shuffles to get the lanes into the
15036/// position needed by the final shuffle.
15037 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15038     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15039 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15040 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15041
15042 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15043 return SDValue();
15044
15045 int NumElts = Mask.size();
15046 int NumLanes = VT.getSizeInBits() / 128;
15047 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15048 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15049 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15050
15051 // First pass will try to fill in the RepeatMask from lanes that need two
15052 // sources.
15053 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15054 int Srcs[2] = {-1, -1};
15055 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15056 for (int i = 0; i != NumLaneElts; ++i) {
15057 int M = Mask[(Lane * NumLaneElts) + i];
15058 if (M < 0)
15059 continue;
15060 // Determine which of the possible input lanes (NumLanes from each source)
15061 // this element comes from. Assign that as one of the sources for this
15062 // lane. We can assign up to 2 sources for this lane. If we run out
15063       // of sources we can't do anything.
15064 int LaneSrc = M / NumLaneElts;
15065 int Src;
15066 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15067 Src = 0;
15068 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15069 Src = 1;
15070 else
15071 return SDValue();
15072
15073 Srcs[Src] = LaneSrc;
15074 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15075 }
15076
15077 // If this lane has two sources, see if it fits with the repeat mask so far.
15078 if (Srcs[1] < 0)
15079 continue;
15080
15081 LaneSrcs[Lane][0] = Srcs[0];
15082 LaneSrcs[Lane][1] = Srcs[1];
15083
15084 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15085 assert(M1.size() == M2.size() && "Unexpected mask size");
15086 for (int i = 0, e = M1.size(); i != e; ++i)
15087 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15088 return false;
15089 return true;
15090 };
15091
15092 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15093 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15094 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15095 int M = Mask[i];
15096 if (M < 0)
15097 continue;
15098 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15099 "Unexpected mask element");
15100 MergedMask[i] = M;
15101 }
15102 };
15103
15104 if (MatchMasks(InLaneMask, RepeatMask)) {
15105 // Merge this lane mask into the final repeat mask.
15106 MergeMasks(InLaneMask, RepeatMask);
15107 continue;
15108 }
15109
15110 // Didn't find a match. Swap the operands and try again.
15111 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15112       ShuffleVectorSDNode::commuteShuffleMask(InLaneMask, InLaneMask.size());
15113 
15114 if (MatchMasks(InLaneMask, RepeatMask)) {
15115 // Merge this lane mask into the final repeat mask.
15116 MergeMasks(InLaneMask, RepeatMask);
15117 continue;
15118 }
15119
15120 // Couldn't find a match with the operands in either order.
15121 return SDValue();
15122 }
15123
15124 // Now handle any lanes with only one source.
15125 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15126 // If this lane has already been processed, skip it.
15127 if (LaneSrcs[Lane][0] >= 0)
15128 continue;
15129
15130 for (int i = 0; i != NumLaneElts; ++i) {
15131 int M = Mask[(Lane * NumLaneElts) + i];
15132 if (M < 0)
15133 continue;
15134
15135       // If RepeatMask isn't defined yet we can define it ourselves.
15136 if (RepeatMask[i] < 0)
15137 RepeatMask[i] = M % NumLaneElts;
15138
15139 if (RepeatMask[i] < NumElts) {
15140 if (RepeatMask[i] != M % NumLaneElts)
15141 return SDValue();
15142 LaneSrcs[Lane][0] = M / NumLaneElts;
15143 } else {
15144 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15145 return SDValue();
15146 LaneSrcs[Lane][1] = M / NumLaneElts;
15147 }
15148 }
15149
15150 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15151 return SDValue();
15152 }
15153
15154 SmallVector<int, 16> NewMask(NumElts, -1);
15155 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15156 int Src = LaneSrcs[Lane][0];
15157 for (int i = 0; i != NumLaneElts; ++i) {
15158 int M = -1;
15159 if (Src >= 0)
15160 M = Src * NumLaneElts + i;
15161 NewMask[Lane * NumLaneElts + i] = M;
15162 }
15163 }
15164 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15165 // Ensure we didn't get back the shuffle we started with.
15166 // FIXME: This is a hack to make up for some splat handling code in
15167 // getVectorShuffle.
15168 if (isa<ShuffleVectorSDNode>(NewV1) &&
15169 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15170 return SDValue();
15171
15172 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15173 int Src = LaneSrcs[Lane][1];
15174 for (int i = 0; i != NumLaneElts; ++i) {
15175 int M = -1;
15176 if (Src >= 0)
15177 M = Src * NumLaneElts + i;
15178 NewMask[Lane * NumLaneElts + i] = M;
15179 }
15180 }
15181 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15182 // Ensure we didn't get back the shuffle we started with.
15183 // FIXME: This is a hack to make up for some splat handling code in
15184 // getVectorShuffle.
15185 if (isa<ShuffleVectorSDNode>(NewV2) &&
15186 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15187 return SDValue();
15188
15189 for (int i = 0; i != NumElts; ++i) {
15190 if (Mask[i] < 0) {
15191 NewMask[i] = -1;
15192 continue;
15193 }
15194 NewMask[i] = RepeatMask[i % NumLaneElts];
15195 if (NewMask[i] < 0)
15196 continue;
15197
15198 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15199 }
15200 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15201}
15202
15203/// If the input shuffle mask results in a vector that is undefined in all upper
15204/// or lower half elements and that mask accesses only 2 halves of the
15205/// shuffle's operands, return true. A mask of half the width with mask indexes
15206/// adjusted to access the extracted halves of the original shuffle operands is
15207/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15208/// lower half of each input operand is accessed.
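/// For example, a v8i32 mask <4, 5, 12, 13, u, u, u, u> has an undef upper
/// half and produces HalfMask = <0, 1, 4, 5> with HalfIdx1 = 1 (upper half of
/// V1) and HalfIdx2 = 3 (upper half of V2).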
15209static bool
15210 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15211                    int &HalfIdx1, int &HalfIdx2) {
15212 assert((Mask.size() == HalfMask.size() * 2) &&
15213 "Expected input mask to be twice as long as output");
15214
15215 // Exactly one half of the result must be undef to allow narrowing.
15216 bool UndefLower = isUndefLowerHalf(Mask);
15217 bool UndefUpper = isUndefUpperHalf(Mask);
15218 if (UndefLower == UndefUpper)
15219 return false;
15220
15221 unsigned HalfNumElts = HalfMask.size();
15222 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15223 HalfIdx1 = -1;
15224 HalfIdx2 = -1;
15225 for (unsigned i = 0; i != HalfNumElts; ++i) {
15226 int M = Mask[i + MaskIndexOffset];
15227 if (M < 0) {
15228 HalfMask[i] = M;
15229 continue;
15230 }
15231
15232 // Determine which of the 4 half vectors this element is from.
15233 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15234 int HalfIdx = M / HalfNumElts;
15235
15236 // Determine the element index into its half vector source.
15237 int HalfElt = M % HalfNumElts;
15238
15239 // We can shuffle with up to 2 half vectors, set the new 'half'
15240 // shuffle mask accordingly.
15241 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15242 HalfMask[i] = HalfElt;
15243 HalfIdx1 = HalfIdx;
15244 continue;
15245 }
15246 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15247 HalfMask[i] = HalfElt + HalfNumElts;
15248 HalfIdx2 = HalfIdx;
15249 continue;
15250 }
15251
15252 // Too many half vectors referenced.
15253 return false;
15254 }
15255
15256 return true;
15257}
15258
15259/// Given the output values from getHalfShuffleMask(), create a half width
15260/// shuffle of extracted vectors followed by an insert back to full width.
15261 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15262                                      ArrayRef<int> HalfMask, int HalfIdx1,
15263 int HalfIdx2, bool UndefLower,
15264 SelectionDAG &DAG, bool UseConcat = false) {
15265 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15266 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15267
15268 MVT VT = V1.getSimpleValueType();
15269 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15270 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15271
15272 auto getHalfVector = [&](int HalfIdx) {
15273 if (HalfIdx < 0)
15274 return DAG.getUNDEF(HalfVT);
15275 SDValue V = (HalfIdx < 2 ? V1 : V2);
15276 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15277 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15278 DAG.getIntPtrConstant(HalfIdx, DL));
15279 };
15280
15281 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15282 SDValue Half1 = getHalfVector(HalfIdx1);
15283 SDValue Half2 = getHalfVector(HalfIdx2);
15284 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15285 if (UseConcat) {
15286 SDValue Op0 = V;
15287 SDValue Op1 = DAG.getUNDEF(HalfVT);
15288 if (UndefLower)
15289 std::swap(Op0, Op1);
15290 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15291 }
15292
15293 unsigned Offset = UndefLower ? HalfNumElts : 0;
15294 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15296}
15297
15298/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15299/// This allows for fast cases such as subvector extraction/insertion
15300/// or shuffling smaller vector types which can lower more efficiently.
15301 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15302                                          SDValue V2, ArrayRef<int> Mask,
15303 const X86Subtarget &Subtarget,
15304 SelectionDAG &DAG) {
15305 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15306 "Expected 256-bit or 512-bit vector");
15307
15308 bool UndefLower = isUndefLowerHalf(Mask);
15309 if (!UndefLower && !isUndefUpperHalf(Mask))
15310 return SDValue();
15311
15312 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15313 "Completely undef shuffle mask should have been simplified already");
15314
15315 // Upper half is undef and lower half is whole upper subvector.
15316 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15317 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15318 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15319 if (!UndefLower &&
15320 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15321 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15322 DAG.getIntPtrConstant(HalfNumElts, DL));
15323 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15324 DAG.getIntPtrConstant(0, DL));
15325 }
15326
15327 // Lower half is undef and upper half is whole lower subvector.
15328 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15329 if (UndefLower &&
15330 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15331 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15332 DAG.getIntPtrConstant(0, DL));
15333 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15334 DAG.getIntPtrConstant(HalfNumElts, DL));
15335 }
15336
15337 int HalfIdx1, HalfIdx2;
15338 SmallVector<int, 8> HalfMask(HalfNumElts);
15339 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15340 return SDValue();
15341
15342 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15343
15344 // Only shuffle the halves of the inputs when useful.
15345 unsigned NumLowerHalves =
15346 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15347 unsigned NumUpperHalves =
15348 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15349 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15350
15351 // Determine the larger pattern of undef/halves, then decide if it's worth
15352 // splitting the shuffle based on subtarget capabilities and types.
15353 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15354 if (!UndefLower) {
15355 // XXXXuuuu: no insert is needed.
15356 // Always extract lowers when setting lower - these are all free subreg ops.
15357 if (NumUpperHalves == 0)
15358 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15359 UndefLower, DAG);
15360
15361 if (NumUpperHalves == 1) {
15362 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15363 if (Subtarget.hasAVX2()) {
15364         // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
15365 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15366 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15367 (!isSingleSHUFPSMask(HalfMask) ||
15368 Subtarget.hasFastVariableCrossLaneShuffle()))
15369 return SDValue();
15370 // If this is a unary shuffle (assume that the 2nd operand is
15371 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15372 // are better off extracting the upper half of 1 operand and using a
15373 // narrow shuffle.
15374 if (EltWidth == 64 && V2.isUndef())
15375 return SDValue();
15376 }
15377 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15378 if (Subtarget.hasAVX512() && VT.is512BitVector())
15379 return SDValue();
15380 // Extract + narrow shuffle is better than the wide alternative.
15381 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15382 UndefLower, DAG);
15383 }
15384
15385 // Don't extract both uppers, instead shuffle and then extract.
15386 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15387 return SDValue();
15388 }
15389
15390 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15391 if (NumUpperHalves == 0) {
15392 // AVX2 has efficient 64-bit element cross-lane shuffles.
15393 // TODO: Refine to account for unary shuffle, splat, and other masks?
15394 if (Subtarget.hasAVX2() && EltWidth == 64)
15395 return SDValue();
15396 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15397 if (Subtarget.hasAVX512() && VT.is512BitVector())
15398 return SDValue();
15399 // Narrow shuffle + insert is better than the wide alternative.
15400 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15401 UndefLower, DAG);
15402 }
15403
15404 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
15405 return SDValue();
15406}
15407
15408/// Handle case where shuffle sources are coming from the same 128-bit lane and
15409/// every lane can be represented as the same repeating mask - allowing us to
15410/// shuffle the sources with the repeating shuffle and then permute the result
15411/// to the destination lanes.
15412 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
15413     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15414 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15415 int NumElts = VT.getVectorNumElements();
15416 int NumLanes = VT.getSizeInBits() / 128;
15417 int NumLaneElts = NumElts / NumLanes;
15418
15419 // On AVX2 we may be able to just shuffle the lowest elements and then
15420 // broadcast the result.
15421 if (Subtarget.hasAVX2()) {
15422 for (unsigned BroadcastSize : {16, 32, 64}) {
15423 if (BroadcastSize <= VT.getScalarSizeInBits())
15424 continue;
15425 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
15426
15427 // Attempt to match a repeating pattern every NumBroadcastElts,
15428         // accounting for UNDEFs, but only referencing the lowest 128-bit
15429 // lane of the inputs.
15430 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
15431 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15432 for (int j = 0; j != NumBroadcastElts; ++j) {
15433 int M = Mask[i + j];
15434 if (M < 0)
15435 continue;
15436 int &R = RepeatMask[j];
15437 if (0 != ((M % NumElts) / NumLaneElts))
15438 return false;
15439 if (0 <= R && R != M)
15440 return false;
15441 R = M;
15442 }
15443 return true;
15444 };
15445
15446 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15447 if (!FindRepeatingBroadcastMask(RepeatMask))
15448 continue;
15449
15450 // Shuffle the (lowest) repeated elements in place for broadcast.
15451 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
15452
15453 // Shuffle the actual broadcast.
15454 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15455 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15456 for (int j = 0; j != NumBroadcastElts; ++j)
15457 BroadcastMask[i + j] = j;
15458
15459 // Avoid returning the same shuffle operation. For example,
15460 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
15461 if (BroadcastMask == Mask)
15462 return SDValue();
15463
15464 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
15465 BroadcastMask);
15466 }
15467 }
15468
15469 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15470 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
15471 return SDValue();
15472
15473 // Bail if we already have a repeated lane shuffle mask.
15474 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15475 return SDValue();
15476
15477   // Helper to look for a repeated mask in each split sublane, checking that
15478   // those sublanes can then be permuted into place.
15479 auto ShuffleSubLanes = [&](int SubLaneScale) {
15480 int NumSubLanes = NumLanes * SubLaneScale;
15481 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15482
15483 // Check that all the sources are coming from the same lane and see if we
15484 // can form a repeating shuffle mask (local to each sub-lane). At the same
15485 // time, determine the source sub-lane for each destination sub-lane.
15486 int TopSrcSubLane = -1;
15487 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15488 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
15489 SubLaneScale,
15490 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
15491
15492 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15493 // Extract the sub-lane mask, check that it all comes from the same lane
15494 // and normalize the mask entries to come from the first lane.
15495 int SrcLane = -1;
15496 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15497 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15498 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15499 if (M < 0)
15500 continue;
15501 int Lane = (M % NumElts) / NumLaneElts;
15502 if ((0 <= SrcLane) && (SrcLane != Lane))
15503 return SDValue();
15504 SrcLane = Lane;
15505 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15506 SubLaneMask[Elt] = LocalM;
15507 }
15508
15509 // Whole sub-lane is UNDEF.
15510 if (SrcLane < 0)
15511 continue;
15512
15513 // Attempt to match against the candidate repeated sub-lane masks.
15514 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15515 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
15516 for (int i = 0; i != NumSubLaneElts; ++i) {
15517 if (M1[i] < 0 || M2[i] < 0)
15518 continue;
15519 if (M1[i] != M2[i])
15520 return false;
15521 }
15522 return true;
15523 };
15524
15525 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15526 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15527 continue;
15528
15529 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15530 for (int i = 0; i != NumSubLaneElts; ++i) {
15531 int M = SubLaneMask[i];
15532 if (M < 0)
15533 continue;
15534 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15535 "Unexpected mask element");
15536 RepeatedSubLaneMask[i] = M;
15537 }
15538
15539 // Track the top most source sub-lane - by setting the remaining to
15540 // UNDEF we can greatly simplify shuffle matching.
15541 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15542 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15543 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15544 break;
15545 }
15546
15547 // Bail if we failed to find a matching repeated sub-lane mask.
15548 if (Dst2SrcSubLanes[DstSubLane] < 0)
15549 return SDValue();
15550 }
15551 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15552 "Unexpected source lane");
15553
15554 // Create a repeating shuffle mask for the entire vector.
15555 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15556 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15557 int Lane = SubLane / SubLaneScale;
15558 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15559 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15560 int M = RepeatedSubLaneMask[Elt];
15561 if (M < 0)
15562 continue;
15563 int Idx = (SubLane * NumSubLaneElts) + Elt;
15564 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15565 }
15566 }
15567
15568 // Shuffle each source sub-lane to its destination.
15569 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15570 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15571 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15572 if (SrcSubLane < 0)
15573 continue;
15574 for (int j = 0; j != NumSubLaneElts; ++j)
15575 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15576 }
15577
15578 // Avoid returning the same shuffle operation.
15579 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
15580 if (RepeatedMask == Mask || SubLaneMask == Mask)
15581 return SDValue();
15582
15583 SDValue RepeatedShuffle =
15584 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
15585
15586 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
15587 SubLaneMask);
15588 };
15589
15590 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15591 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
15592 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
15593 // Otherwise we can only permute whole 128-bit lanes.
15594 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
15595 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
15596 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
15597 MinSubLaneScale = 2;
15598 MaxSubLaneScale =
15599 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
15600 }
15601 if (Subtarget.hasBWI() && VT == MVT::v64i8)
15602 MinSubLaneScale = MaxSubLaneScale = 4;
15603
15604 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
15605 if (SDValue Shuffle = ShuffleSubLanes(Scale))
15606 return Shuffle;
15607
15608 return SDValue();
15609}
15610
15611 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
15612                                    bool &ForceV1Zero, bool &ForceV2Zero,
15613 unsigned &ShuffleImm, ArrayRef<int> Mask,
15614 const APInt &Zeroable) {
15615 int NumElts = VT.getVectorNumElements();
15616 assert(VT.getScalarSizeInBits() == 64 &&
15617 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15618 "Unexpected data type for VSHUFPD");
15619 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
15620 "Illegal shuffle mask");
15621
15622 bool ZeroLane[2] = { true, true };
15623 for (int i = 0; i < NumElts; ++i)
15624 ZeroLane[i & 1] &= Zeroable[i];
15625
15626 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
15627   // Mask for V4F64: 0/1, 4/5, 2/3, 6/7.
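  // For example, the v4f64 mask <1, 5, 2, 7> matches the SHUFPD pattern and
  // yields ShuffleImm = 0b1011 (odd element for result slots 0, 1 and 3, even
  // element for slot 2).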
15628 ShuffleImm = 0;
15629 bool ShufpdMask = true;
15630 bool CommutableMask = true;
15631 for (int i = 0; i < NumElts; ++i) {
15632 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
15633 continue;
15634 if (Mask[i] < 0)
15635 return false;
15636 int Val = (i & 6) + NumElts * (i & 1);
15637 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15638 if (Mask[i] < Val || Mask[i] > Val + 1)
15639 ShufpdMask = false;
15640 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15641 CommutableMask = false;
15642 ShuffleImm |= (Mask[i] % 2) << i;
15643 }
15644
15645 if (!ShufpdMask && !CommutableMask)
15646 return false;
15647
15648 if (!ShufpdMask && CommutableMask)
15649 std::swap(V1, V2);
15650
15651 ForceV1Zero = ZeroLane[0];
15652 ForceV2Zero = ZeroLane[1];
15653 return true;
15654}
15655
15656 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
15657                                       SDValue V2, ArrayRef<int> Mask,
15658 const APInt &Zeroable,
15659 const X86Subtarget &Subtarget,
15660 SelectionDAG &DAG) {
15661 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15662 "Unexpected data type for VSHUFPD");
15663
15664 unsigned Immediate = 0;
15665 bool ForceV1Zero = false, ForceV2Zero = false;
15666 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
15667 Mask, Zeroable))
15668 return SDValue();
15669
15670 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15671 if (ForceV1Zero)
15672 V1 = getZeroVector(VT, Subtarget, DAG, DL);
15673 if (ForceV2Zero)
15674 V2 = getZeroVector(VT, Subtarget, DAG, DL);
15675
15676 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15677 DAG.getTargetConstant(Immediate, DL, MVT::i8));
15678}
15679
15680 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
15681 // by zeroable elements in the remaining 24 elements. Turn this into two
15682// vmovqb instructions shuffled together.
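// Each VTRUNC keeps the low byte of every 64-bit element (4 bytes per input,
// with the upper 12 bytes zeroed), and the v16i8 shuffle below interleaves
// those two 4-byte groups ahead of the zeros, producing the 8 requested bytes
// followed by 24 zeros.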
15683 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
15684                                              SDValue V1, SDValue V2,
15685 ArrayRef<int> Mask,
15686 const APInt &Zeroable,
15687 SelectionDAG &DAG) {
15688 assert(VT == MVT::v32i8 && "Unexpected type!");
15689
15690 // The first 8 indices should be every 8th element.
15691 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
15692 return SDValue();
15693
15694 // Remaining elements need to be zeroable.
15695 if (Zeroable.countl_one() < (Mask.size() - 8))
15696 return SDValue();
15697
15698 V1 = DAG.getBitcast(MVT::v4i64, V1);
15699 V2 = DAG.getBitcast(MVT::v4i64, V2);
15700
15701 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
15702 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
15703
15704 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
15705 // the upper bits of the result using an unpckldq.
15706 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
15707 { 0, 1, 2, 3, 16, 17, 18, 19,
15708 4, 5, 6, 7, 20, 21, 22, 23 });
15709 // Insert the unpckldq into a zero vector to widen to v32i8.
15710 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
15711 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
15712 DAG.getIntPtrConstant(0, DL));
15713}
15714
15715// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
15716// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
15717// =>
15718// ul = unpckl v1, v2
15719// uh = unpckh v1, v2
15720// a = vperm ul, uh
15721// b = vperm ul, uh
15722//
15723// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
15724// and permute. We cannot directly match v3 because it is split into two
15725// 256-bit vectors in earlier isel stages. Therefore, this function matches a
15726// pair of 256-bit shuffles and makes sure the masks are consecutive.
15727//
15728// Once unpck and permute nodes are created, the permute corresponding to this
15729// shuffle is returned, while the other permute replaces the other half of the
15730// shuffle in the selection dag.
15731 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
15732                                                  SDValue V1, SDValue V2,
15733 ArrayRef<int> Mask,
15734 SelectionDAG &DAG) {
15735 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
15736 VT != MVT::v32i8)
15737 return SDValue();
15738 // <B0, B1, B0+1, B1+1, ..., >
15739 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
15740 unsigned Begin1) {
15741 size_t Size = Mask.size();
15742 assert(Size % 2 == 0 && "Expected even mask size");
15743 for (unsigned I = 0; I < Size; I += 2) {
15744 if (Mask[I] != (int)(Begin0 + I / 2) ||
15745 Mask[I + 1] != (int)(Begin1 + I / 2))
15746 return false;
15747 }
15748 return true;
15749 };
15750   // Check which half this shuffle node is.
15751 int NumElts = VT.getVectorNumElements();
15752 size_t FirstQtr = NumElts / 2;
15753 size_t ThirdQtr = NumElts + NumElts / 2;
15754 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
15755 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
15756 if (!IsFirstHalf && !IsSecondHalf)
15757 return SDValue();
15758
15759 // Find the intersection between shuffle users of V1 and V2.
15760 SmallVector<SDNode *, 2> Shuffles;
15761 for (SDNode *User : V1->uses())
15762 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
15763 User->getOperand(1) == V2)
15764 Shuffles.push_back(User);
15765 // Limit user size to two for now.
15766 if (Shuffles.size() != 2)
15767 return SDValue();
15768 // Find out which half of the 512-bit shuffle each smaller shuffle covers
15769 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
15770 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
15771 SDNode *FirstHalf;
15772 SDNode *SecondHalf;
15773 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
15774 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
15775 FirstHalf = Shuffles[0];
15776 SecondHalf = Shuffles[1];
15777 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
15778 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
15779 FirstHalf = Shuffles[1];
15780 SecondHalf = Shuffles[0];
15781 } else {
15782 return SDValue();
15783 }
15784 // Lower into unpck and perm. Return the perm of this shuffle and replace
15785 // the other.
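// VPERM2X128 immediate 0x20 concatenates the low 128-bit halves of the two
// unpack results; 0x31 concatenates their high halves.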
15786 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
15787 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
15788 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15789 DAG.getTargetConstant(0x20, DL, MVT::i8));
15790 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15791 DAG.getTargetConstant(0x31, DL, MVT::i8));
15792 if (IsFirstHalf) {
15793 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
15794 return Perm1;
15795 }
15796 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
15797 return Perm2;
15798}
15799
15800/// Handle lowering of 4-lane 64-bit floating point shuffles.
15801///
15802/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
15803/// isn't available.
15804static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15805 const APInt &Zeroable, SDValue V1, SDValue V2,
15806 const X86Subtarget &Subtarget,
15807 SelectionDAG &DAG) {
15808 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15809 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15810 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15811
15812 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
15813 Subtarget, DAG))
15814 return V;
15815
15816 if (V2.isUndef()) {
15817 // Check for being able to broadcast a single element.
15818 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
15819 Mask, Subtarget, DAG))
15820 return Broadcast;
15821
15822 // Use low duplicate instructions for masks that match their pattern.
15823 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15824 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
15825
15826 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
15827 // Non-half-crossing single input shuffles can be lowered with an
15828 // interleaved permutation.
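// For VPERMILPD each immediate bit selects the odd (1) or even (0) element
// within its own 128-bit lane.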
15829 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
15830 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
15831 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
15832 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
15833 }
15834
15835 // With AVX2 we have direct support for this permutation.
15836 if (Subtarget.hasAVX2())
15837 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
15838 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15839
15840 // Try to create an in-lane repeating shuffle mask and then shuffle the
15841 // results into the target lanes.
15842 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15843 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15844 return V;
15845
15846 // Try to permute the lanes and then use a per-lane permute.
15847 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
15848 Mask, DAG, Subtarget))
15849 return V;
15850
15851 // Otherwise, fall back.
15852 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
15853 DAG, Subtarget);
15854 }
15855
15856 // Use dedicated unpack instructions for masks that match their pattern.
15857 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
15858 return V;
15859
15860 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
15861 Zeroable, Subtarget, DAG))
15862 return Blend;
15863
15864 // Check if the blend happens to exactly fit that of SHUFPD.
15865 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
15866 Zeroable, Subtarget, DAG))
15867 return Op;
15868
15869 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
15870 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
15871
15872 // If we have lane crossing shuffles AND they don't all come from the lower
15873 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15874 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
15875 // canonicalizes to a blend of splat which isn't necessary for this combine.
15876 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
15877 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
15878 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
15879 (V2.getOpcode() != ISD::BUILD_VECTOR))
15880 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
15881
15882 // If we have one input in place, then we can permute the other input and
15883 // blend the result.
15884 if (V1IsInPlace || V2IsInPlace)
15885 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15886 Subtarget, DAG);
15887
15888 // Try to create an in-lane repeating shuffle mask and then shuffle the
15889 // results into the target lanes.
15891 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15892 return V;
15893
15894 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15895 // shuffle. However, if we have AVX2 and either input is already in place, we
15896 // will be able to shuffle the other input even across lanes in a single
15897 // instruction, so skip this pattern.
15898 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
15899 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
15900 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15901 return V;
15902
15903 // If we have VLX support, we can use VEXPAND.
15904 if (Subtarget.hasVLX())
15905 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
15906 DAG, Subtarget))
15907 return V;
15908
15909 // If we have AVX2 then we always want to lower with a blend because at v4 we
15910 // can fully permute the elements.
15911 if (Subtarget.hasAVX2())
15912 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15913 Subtarget, DAG);
15914
15915 // Otherwise fall back on generic lowering.
15916 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
15917 Subtarget, DAG);
15918}
15919
15920/// Handle lowering of 4-lane 64-bit integer shuffles.
15921///
15922/// This routine is only called when we have AVX2 and thus a reasonable
15923/// instruction set for v4i64 shuffling.
15924static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15925 const APInt &Zeroable, SDValue V1, SDValue V2,
15926 const X86Subtarget &Subtarget,
15927 SelectionDAG &DAG) {
15928 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15929 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15930 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15931 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
15932
15933 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15934 Subtarget, DAG))
15935 return V;
15936
15937 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
15938 Zeroable, Subtarget, DAG))
15939 return Blend;
15940
15941 // Check for being able to broadcast a single element.
15942 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
15943 Subtarget, DAG))
15944 return Broadcast;
15945
15946 // Try to use shift instructions if fast.
15947 if (Subtarget.preferLowerShuffleAsShift())
15948 if (SDValue Shift =
15949 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15950 Subtarget, DAG, /*BitwiseOnly*/ true))
15951 return Shift;
15952
15953 if (V2.isUndef()) {
15954 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
15955 // can use lower latency instructions that will operate on both lanes.
15956 SmallVector<int, 2> RepeatedMask;
15957 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
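// Expand each 64-bit mask index into a pair of 32-bit indices so the repeated
// pattern can be expressed as a single PSHUFD immediate.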
15958 SmallVector<int, 4> PSHUFDMask;
15959 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
15960 return DAG.getBitcast(
15961 MVT::v4i64,
15962 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
15963 DAG.getBitcast(MVT::v8i32, V1),
15964 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15965 }
15966
15967 // AVX2 provides a direct instruction for permuting a single input across
15968 // lanes.
15969 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
15970 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15971 }
15972
15973 // Try to use shift instructions.
15974 if (SDValue Shift =
15975 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
15976 DAG, /*BitwiseOnly*/ false))
15977 return Shift;
15978
15979 // If we have VLX support, we can use VALIGN or VEXPAND.
15980 if (Subtarget.hasVLX()) {
15981 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
15982 Zeroable, Subtarget, DAG))
15983 return Rotate;
15984
15985 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
15986 DAG, Subtarget))
15987 return V;
15988 }
15989
15990 // Try to use PALIGNR.
15991 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
15992 Subtarget, DAG))
15993 return Rotate;
15994
15995 // Use dedicated unpack instructions for masks that match their pattern.
15996 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
15997 return V;
15998
15999 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16000 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16001
16002 // If we have one input in place, then we can permute the other input and
16003 // blend the result.
16004 if (V1IsInPlace || V2IsInPlace)
16005 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16006 Subtarget, DAG);
16007
16008 // Try to create an in-lane repeating shuffle mask and then shuffle the
16009 // results into the target lanes.
16010 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16011 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16012 return V;
16013
16014 // Try to lower to PERMQ(BLENDD(V1,V2)).
16015 if (SDValue V =
16016 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16017 return V;
16018
16019 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16020 // shuffle. However, if we have AVX2 and either input is already in place, we
16021 // will be able to shuffle the other input even across lanes in a single
16022 // instruction, so skip this pattern.
16023 if (!V1IsInPlace && !V2IsInPlace)
16024 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16025 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16026 return Result;
16027
16028 // Otherwise fall back on generic blend lowering.
16029 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16030 Subtarget, DAG);
16031}
16032
16033/// Handle lowering of 8-lane 32-bit floating point shuffles.
16034///
16035/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16036/// isn't available.
16037static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16038 const APInt &Zeroable, SDValue V1, SDValue V2,
16039 const X86Subtarget &Subtarget,
16040 SelectionDAG &DAG) {
16041 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16042 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16043 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16044
16045 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16046 Zeroable, Subtarget, DAG))
16047 return Blend;
16048
16049 // Check for being able to broadcast a single element.
16050 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16051 Subtarget, DAG))
16052 return Broadcast;
16053
16054 if (!Subtarget.hasAVX2()) {
16055 SmallVector<int> InLaneMask;
16056 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16057
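// If even the in-lane equivalent of the mask is not a repeated 128-bit
// pattern, a simple split into two 128-bit shuffles is likely cheaper.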
16058 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16059 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16060 /*SimpleOnly*/ true))
16061 return R;
16062 }
16063 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16064 Zeroable, Subtarget, DAG))
16065 return DAG.getBitcast(MVT::v8f32, ZExt);
16066
16067 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16068 // options to efficiently lower the shuffle.
16069 SmallVector<int, 4> RepeatedMask;
16070 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16071 assert(RepeatedMask.size() == 4 &&
16072 "Repeated masks must be half the mask width!");
16073
16074 // Use even/odd duplicate instructions for masks that match their pattern.
16075 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16076 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16077 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16078 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16079
16080 if (V2.isUndef())
16081 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16082 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16083
16084 // Use dedicated unpack instructions for masks that match their pattern.
16085 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
16086 return V;
16087
16088 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16089 // have already handled any direct blends.
16090 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16091 }
16092
16093 // Try to create an in-lane repeating shuffle mask and then shuffle the
16094 // results into the target lanes.
16095 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16096 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16097 return V;
16098
16099 // If we have a single input shuffle with different shuffle patterns in the
16100 // two 128-bit lanes, use a variable-mask VPERMILPS.
16101 if (V2.isUndef()) {
16102 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16103 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16104 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16105 }
16106 if (Subtarget.hasAVX2()) {
16107 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16108 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16109 }
16110 // Otherwise, fall back.
16111 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16112 DAG, Subtarget);
16113 }
16114
16115 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16116 // shuffle.
16117 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16118 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16119 return Result;
16120
16121 // If we have VLX support, we can use VEXPAND.
16122 if (Subtarget.hasVLX())
16123 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
16124 DAG, Subtarget))
16125 return V;
16126
16127 // Try to match an interleave of two v8f32s and lower them as unpck and
16128 // permutes using ymms. This needs to go before we try to split the vectors.
16129 //
16130 // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
16131 // this path inadvertently.
16132 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
16133 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16134 Mask, DAG))
16135 return V;
16136
16137 // For non-AVX512, if the mask is of 16-bit elements in lane then try to split
16138 // since after the split we get more efficient code than vblend by using
16139 // vpunpcklwd and vpunpckhwd instructions.
16140 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16141 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
16142 DAG);
16143
16144 // If we have AVX2 then we always want to lower with a blend because at v8 we
16145 // can fully permute the elements.
16146 if (Subtarget.hasAVX2())
16147 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16148 Subtarget, DAG);
16149
16150 // Otherwise fall back on generic lowering.
16151 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
16152 Subtarget, DAG);
16153}
16154
16155/// Handle lowering of 8-lane 32-bit integer shuffles.
16156///
16157/// This routine is only called when we have AVX2 and thus a reasonable
16158/// instruction set for v8i32 shuffling.
16159static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16160 const APInt &Zeroable, SDValue V1, SDValue V2,
16161 const X86Subtarget &Subtarget,
16162 SelectionDAG &DAG) {
16163 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16164 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16165 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16166 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16167
16168 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16169
16170 // Whenever we can lower this as a zext, that instruction is strictly faster
16171 // than any alternative. It also allows us to fold memory operands into the
16172 // shuffle in many cases.
16173 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16174 Zeroable, Subtarget, DAG))
16175 return ZExt;
16176
16177 // Try to match an interleave of two v8i32s and lower them as unpck and
16178 // permutes using ymms. This needs to go before we try to split the vectors.
16179 if (!Subtarget.hasAVX512())
16180 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16181 Mask, DAG))
16182 return V;
16183
16184 // For non-AVX512, if the mask is of 16-bit elements in lane then try to split
16185 // since after the split we get more efficient code than vblend by using
16186 // vpunpcklwd and vpunpckhwd instructions.
16187 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16188 !Subtarget.hasAVX512())
16189 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
16190 DAG);
16191
16192 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16193 Zeroable, Subtarget, DAG))
16194 return Blend;
16195
16196 // Check for being able to broadcast a single element.
16197 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16198 Subtarget, DAG))
16199 return Broadcast;
16200
16201 // Try to use shift instructions if fast.
16202 if (Subtarget.preferLowerShuffleAsShift()) {
16203 if (SDValue Shift =
16204 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16205 Subtarget, DAG, /*BitwiseOnly*/ true))
16206 return Shift;
16207 if (NumV2Elements == 0)
16208 if (SDValue Rotate =
16209 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16210 return Rotate;
16211 }
16212
16213 // If the shuffle mask is repeated in each 128-bit lane we can use more
16214 // efficient instructions that mirror the shuffles across the two 128-bit
16215 // lanes.
16216 SmallVector<int, 4> RepeatedMask;
16217 bool Is128BitLaneRepeatedShuffle =
16218 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16219 if (Is128BitLaneRepeatedShuffle) {
16220 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16221 if (V2.isUndef())
16222 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16223 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16224
16225 // Use dedicated unpack instructions for masks that match their pattern.
16226 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
16227 return V;
16228 }
16229
16230 // Try to use shift instructions.
16231 if (SDValue Shift =
16232 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16233 DAG, /*BitwiseOnly*/ false))
16234 return Shift;
16235
16236 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16237 if (SDValue Rotate =
16238 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16239 return Rotate;
16240
16241 // If we have VLX support, we can use VALIGN or EXPAND.
16242 if (Subtarget.hasVLX()) {
16243 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16244 Zeroable, Subtarget, DAG))
16245 return Rotate;
16246
16247 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
16248 DAG, Subtarget))
16249 return V;
16250 }
16251
16252 // Try to use byte rotation instructions.
16253 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16254 Subtarget, DAG))
16255 return Rotate;
16256
16257 // Try to create an in-lane repeating shuffle mask and then shuffle the
16258 // results into the target lanes.
16259 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16260 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16261 return V;
16262
16263 if (V2.isUndef()) {
16264 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16265 // because that should be faster than the variable permute alternatives.
16266 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
16267 return V;
16268
16269 // If the shuffle patterns aren't repeated but it's a single input, directly
16270 // generate a cross-lane VPERMD instruction.
16271 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16272 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16273 }
16274
16275 // Assume that a single SHUFPS is faster than an alternative sequence of
16276 // multiple instructions (even if the CPU has a domain penalty).
16277 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16278 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16279 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16280 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16281 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16282 CastV1, CastV2, DAG);
16283 return DAG.getBitcast(MVT::v8i32, ShufPS);
16284 }
16285
16286 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16287 // shuffle.
16288 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16289 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16290 return Result;
16291
16292 // Otherwise fall back on generic blend lowering.
16293 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16294 Subtarget, DAG);
16295}
16296
16297/// Handle lowering of 16-lane 16-bit integer shuffles.
16298///
16299/// This routine is only called when we have AVX2 and thus a reasonable
16300/// instruction set for v16i16 shuffling.
16301static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16302 const APInt &Zeroable, SDValue V1, SDValue V2,
16303 const X86Subtarget &Subtarget,
16304 SelectionDAG &DAG) {
16305 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16306 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16307 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16308 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16309
16310 // Whenever we can lower this as a zext, that instruction is strictly faster
16311 // than any alternative. It also allows us to fold memory operands into the
16312 // shuffle in many cases.
16313 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16314 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16315 return ZExt;
16316
16317 // Check for being able to broadcast a single element.
16318 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16319 Subtarget, DAG))
16320 return Broadcast;
16321
16322 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16323 Zeroable, Subtarget, DAG))
16324 return Blend;
16325
16326 // Use dedicated unpack instructions for masks that match their pattern.
16327 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
16328 return V;
16329
16330 // Use dedicated pack instructions for masks that match their pattern.
16331 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
16332 Subtarget))
16333 return V;
16334
16335 // Try to lower using a truncation.
16336 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16337 Subtarget, DAG))
16338 return V;
16339
16340 // Try to use shift instructions.
16341 if (SDValue Shift =
16342 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16343 Subtarget, DAG, /*BitwiseOnly*/ false))
16344 return Shift;
16345
16346 // Try to use byte rotation instructions.
16347 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16348 Subtarget, DAG))
16349 return Rotate;
16350
16351 // Try to create an in-lane repeating shuffle mask and then shuffle the
16352 // results into the target lanes.
16353 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16354 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16355 return V;
16356
16357 if (V2.isUndef()) {
16358 // Try to use bit rotation instructions.
16359 if (SDValue Rotate =
16360 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16361 return Rotate;
16362
16363 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16364 // because that should be faster than the variable permute alternatives.
16365 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
16366 return V;
16367
16368 // There are no generalized cross-lane shuffle operations available on i16
16369 // element types.
16370 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16371 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16372 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16373 return V;
16374
16375 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16376 DAG, Subtarget);
16377 }
16378
16379 SmallVector<int, 8> RepeatedMask;
16380 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16381 // As this is a single-input shuffle, the repeated mask should be
16382 // a strictly valid v8i16 mask that we can pass through to the v8i16
16383 // lowering to handle even the v16 case.
16384 return lowerV8I16GeneralSingleInputShuffle(
16385 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16386 }
16387 }
16388
16389 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16390 Zeroable, Subtarget, DAG))
16391 return PSHUFB;
16392
16393 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16394 if (Subtarget.hasBWI())
16395 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
16396
16397 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16398 // shuffle.
16399 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16400 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16401 return Result;
16402
16403 // Try to permute the lanes and then use a per-lane permute.
16404 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16405 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16406 return V;
16407
16408 // Try to match an interleave of two v16i16s and lower them as unpck and
16409 // permutes using ymms.
16410 if (!Subtarget.hasAVX512())
16411 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
16412 Mask, DAG))
16413 return V;
16414
16415 // Otherwise fall back on generic lowering.
16416 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
16417 Subtarget, DAG);
16418}
16419
16420/// Handle lowering of 32-lane 8-bit integer shuffles.
16421///
16422/// This routine is only called when we have AVX2 and thus a reasonable
16423/// instruction set for v32i8 shuffling.
16424static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16425 const APInt &Zeroable, SDValue V1, SDValue V2,
16426 const X86Subtarget &Subtarget,
16427 SelectionDAG &DAG) {
16428 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16429 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16430 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16431 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16432
16433 // Whenever we can lower this as a zext, that instruction is strictly faster
16434 // than any alternative. It also allows us to fold memory operands into the
16435 // shuffle in many cases.
16436 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16437 Zeroable, Subtarget, DAG))
16438 return ZExt;
16439
16440 // Check for being able to broadcast a single element.
16441 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16442 Subtarget, DAG))
16443 return Broadcast;
16444
16445 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16446 Zeroable, Subtarget, DAG))
16447 return Blend;
16448
16449 // Use dedicated unpack instructions for masks that match their pattern.
16450 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
16451 return V;
16452
16453 // Use dedicated pack instructions for masks that match their pattern.
16454 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
16455 Subtarget))
16456 return V;
16457
16458 // Try to lower using a truncation.
16459 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16460 Subtarget, DAG))
16461 return V;
16462
16463 // Try to use shift instructions.
16464 if (SDValue Shift =
16465 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
16466 DAG, /*BitwiseOnly*/ false))
16467 return Shift;
16468
16469 // Try to use byte rotation instructions.
16470 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16471 Subtarget, DAG))
16472 return Rotate;
16473
16474 // Try to use bit rotation instructions.
16475 if (V2.isUndef())
16476 if (SDValue Rotate =
16477 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16478 return Rotate;
16479
16480 // Try to create an in-lane repeating shuffle mask and then shuffle the
16481 // results into the target lanes.
16482 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16483 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16484 return V;
16485
16486 // There are no generalized cross-lane shuffle operations available on i8
16487 // element types.
16488 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16489 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16490 // because that should be faster than the variable permute alternatives.
16491 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
16492 return V;
16493
16494 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16495 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16496 return V;
16497
16498 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16499 DAG, Subtarget);
16500 }
16501
16502 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16503 Zeroable, Subtarget, DAG))
16504 return PSHUFB;
16505
16506 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16507 if (Subtarget.hasVBMI())
16508 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
16509
16510 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16511 // shuffle.
16512 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16513 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16514 return Result;
16515
16516 // Try to permute the lanes and then use a per-lane permute.
16517 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16518 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16519 return V;
16520
16521 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16522 // by zeroable elements in the remaining 24 elements. Turn this into two
16523 // vmovqb instructions shuffled together.
16524 if (Subtarget.hasVLX())
16525 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16526 Mask, Zeroable, DAG))
16527 return V;
16528
16529 // Try to match an interleave of two v32i8s and lower them as unpck and
16530 // permutes using ymms.
16531 if (!Subtarget.hasAVX512())
16532 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
16533 Mask, DAG))
16534 return V;
16535
16536 // Otherwise fall back on generic lowering.
16537 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
16538 Subtarget, DAG);
16539}
16540
16541/// High-level routine to lower various 256-bit x86 vector shuffles.
16542///
16543/// This routine either breaks down the specific type of a 256-bit x86 vector
16544/// shuffle or splits it into two 128-bit shuffles and fuses the results back
16545/// together based on the available instructions.
16546static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16547 SDValue V1, SDValue V2, const APInt &Zeroable,
16548 const X86Subtarget &Subtarget,
16549 SelectionDAG &DAG) {
16550 // If we have a single input to the zero element, insert that into V1 if we
16551 // can do so cheaply.
16552 int NumElts = VT.getVectorNumElements();
16553 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16554
16555 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16556 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16557 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16558 return Insertion;
16559
16560 // Handle special cases where the lower or upper half is UNDEF.
16561 if (SDValue V =
16562 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16563 return V;
16564
16565 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16566 // can check for those subtargets here and avoid much of the subtarget
16567 // querying in the per-vector-type lowering routines. With AVX1 we have
16568 // essentially *zero* ability to manipulate a 256-bit vector with integer
16569 // types. Since we'll use floating point types there eventually, just
16570 // immediately cast everything to a float and operate entirely in that domain.
16571 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16572 int ElementBits = VT.getScalarSizeInBits();
16573 if (ElementBits < 32) {
16574 // No floating point type available, if we can't use the bit operations
16575 // for masking/blending then decompose into 128-bit vectors.
16576 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16577 Subtarget, DAG))
16578 return V;
16579 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16580 return V;
16581 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16582 }
16583
16584 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16585 VT.getVectorNumElements());
16586 V1 = DAG.getBitcast(FpVT, V1);
16587 V2 = DAG.getBitcast(FpVT, V2);
16588 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16589 }
16590
16591 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
16592 V1 = DAG.getBitcast(MVT::v16i16, V1);
16593 V2 = DAG.getBitcast(MVT::v16i16, V2);
16594 return DAG.getBitcast(VT,
16595 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
16596 }
16597
16598 switch (VT.SimpleTy) {
16599 case MVT::v4f64:
16600 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16601 case MVT::v4i64:
16602 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16603 case MVT::v8f32:
16604 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16605 case MVT::v8i32:
16606 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16607 case MVT::v16i16:
16608 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16609 case MVT::v32i8:
16610 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16611
16612 default:
16613 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16614 }
16615}
16616
16617/// Try to lower a vector shuffle as a 128-bit shuffles.
16618static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
16619 const APInt &Zeroable, SDValue V1, SDValue V2,
16620 const X86Subtarget &Subtarget,
16621 SelectionDAG &DAG) {
16622 assert(VT.getScalarSizeInBits() == 64 &&
16623 "Unexpected element type size for 128bit shuffle.");
16624
16625 // Handling a 256-bit vector requires VLX; lowerV2X128Shuffle() is most
16626 // probably the better solution for that case.
16627 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16628
16629 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
16630 SmallVector<int, 4> Widened128Mask;
16631 if (!canWidenShuffleElements(Mask, Widened128Mask))
16632 return SDValue();
16633 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
16634
16635 // Try to use an insert into a zero vector.
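// Zeroable has one bit per 64-bit element: 0xf0 means the upper 256 bits are
// zeroable, and 0x0c means elements 2-3 are as well, in which case only the
// low 128 bits of V1 need to survive.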
16636 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16637 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16638 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16639 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16640 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16641 DAG.getIntPtrConstant(0, DL));
16642 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16643 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16644 DAG.getIntPtrConstant(0, DL));
16645 }
16646
16647 // Check for patterns which can be matched with a single insert of a 256-bit
16648 // subvector.
16649 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
16650 if (OnlyUsesV1 ||
16651 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
16652 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
16653 SDValue SubVec =
16654 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
16655 DAG.getIntPtrConstant(0, DL));
16656 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16657 DAG.getIntPtrConstant(4, DL));
16658 }
16659
16660 // See if this is an insertion of the lower 128-bits of V2 into V1.
16661 bool IsInsert = true;
16662 int V2Index = -1;
16663 for (int i = 0; i < 4; ++i) {
16664 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16665 if (Widened128Mask[i] < 0)
16666 continue;
16667
16668 // Make sure all V1 subvectors are in place.
16669 if (Widened128Mask[i] < 4) {
16670 if (Widened128Mask[i] != i) {
16671 IsInsert = false;
16672 break;
16673 }
16674 } else {
16675 // Make sure we only have a single V2 index and it's the lowest 128 bits.
16676 if (V2Index >= 0 || Widened128Mask[i] != 4) {
16677 IsInsert = false;
16678 break;
16679 }
16680 V2Index = i;
16681 }
16682 }
16683 if (IsInsert && V2Index >= 0) {
16684 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16685 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
16686 DAG.getIntPtrConstant(0, DL));
16687 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
16688 }
16689
16690 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
16691 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
16692 // possible we at least ensure the lanes stay sequential to help later
16693 // combines.
16694 SmallVector<int, 2> Widened256Mask;
16695 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
16696 Widened128Mask.clear();
16697 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
16698 }
16699
16700 // Try to lower to vshuf64x2/vshuf32x4.
16701 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
16702 int PermMask[4] = {-1, -1, -1, -1};
16703 // Ensure elements came from the same Op.
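// Each 256-bit half of the result may only draw from a single source; the
// modulo below rebases V2 indices (4-7) into that operand's own 0-3 range.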
16704 for (int i = 0; i < 4; ++i) {
16705 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16706 if (Widened128Mask[i] < 0)
16707 continue;
16708
16709 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
16710 unsigned OpIndex = i / 2;
16711 if (Ops[OpIndex].isUndef())
16712 Ops[OpIndex] = Op;
16713 else if (Ops[OpIndex] != Op)
16714 return SDValue();
16715
16716 PermMask[i] = Widened128Mask[i] % 4;
16717 }
16718
16719 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
16720 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
16721}
16722
16723/// Handle lowering of 8-lane 64-bit floating point shuffles.
16724static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16725 const APInt &Zeroable, SDValue V1, SDValue V2,
16726 const X86Subtarget &Subtarget,
16727 SelectionDAG &DAG) {
16728 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16729 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16730 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16731
16732 if (V2.isUndef()) {
16733 // Use low duplicate instructions for masks that match their pattern.
16734 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
16735 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
16736
16737 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
16738 // Non-half-crossing single input shuffles can be lowered with an
16739 // interleaved permutation.
16740 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16741 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
16742 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
16743 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
16744 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
16745 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16746 }
16747
16748 SmallVector<int, 4> RepeatedMask;
16749 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
16750 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
16751 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16752 }
16753
16754 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
16755 V2, Subtarget, DAG))
16756 return Shuf128;
16757
16758 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
16759 return Unpck;
16760
16761 // Check if the blend happens to exactly fit that of SHUFPD.
16762 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
16763 Zeroable, Subtarget, DAG))
16764 return Op;
16765
16766 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
16767 DAG, Subtarget))
16768 return V;
16769
16770 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
16771 Zeroable, Subtarget, DAG))
16772 return Blend;
16773
16774 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
16775}
16776
16777/// Handle lowering of 16-lane 32-bit floating point shuffles.
16778static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16779 const APInt &Zeroable, SDValue V1, SDValue V2,
16780 const X86Subtarget &Subtarget,
16781 SelectionDAG &DAG) {
16782 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16783 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16784 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16785
16786 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16787 // options to efficiently lower the shuffle.
16788 SmallVector<int, 4> RepeatedMask;
16789 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
16790 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16791
16792 // Use even/odd duplicate instructions for masks that match their pattern.
16793 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16794 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
16795 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16796 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
16797
16798 if (V2.isUndef())
16799 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
16800 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16801
16802 // Use dedicated unpack instructions for masks that match their pattern.
16803 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
16804 return V;
16805
16806 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16807 Zeroable, Subtarget, DAG))
16808 return Blend;
16809
16810 // Otherwise, fall back to a SHUFPS sequence.
16811 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
16812 }
16813
16814 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16815 Zeroable, Subtarget, DAG))
16816 return Blend;
16817
16818 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16819 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16820 return DAG.getBitcast(MVT::v16f32, ZExt);
16821
16822 // Try to create an in-lane repeating shuffle mask and then shuffle the
16823 // results into the target lanes.
16824 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16825 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
16826 return V;
16827
16828 // If we have a single input shuffle with different shuffle patterns in the
16829 // 128-bit lanes and no lane crossing, use a variable-mask VPERMILPS.
16830 if (V2.isUndef() &&
16831 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
16832 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
16833 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
16834 }
16835
16836 // If we have AVX512F support, we can use VEXPAND.
16837 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
16838 V1, V2, DAG, Subtarget))
16839 return V;
16840
16841 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
16842}
16843
16844/// Handle lowering of 8-lane 64-bit integer shuffles.
16845static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16846 const APInt &Zeroable, SDValue V1, SDValue V2,
16847 const X86Subtarget &Subtarget,
16848 SelectionDAG &DAG) {
16849 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16850 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16851 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16852
16853 // Try to use shift instructions if fast.
16854 if (Subtarget.preferLowerShuffleAsShift())
16855 if (SDValue Shift =
16856 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
16857 Subtarget, DAG, /*BitwiseOnly*/ true))
16858 return Shift;
16859
16860 if (V2.isUndef()) {
16861 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16862 // can use lower latency instructions that will operate on all four
16863 // 128-bit lanes.
16864 SmallVector<int, 2> Repeated128Mask;
16865 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
16866 SmallVector<int, 4> PSHUFDMask;
16867 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
16868 return DAG.getBitcast(
16869 MVT::v8i64,
16870 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
16871 DAG.getBitcast(MVT::v16i32, V1),
16872 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16873 }
16874
16875 SmallVector<int, 4> Repeated256Mask;
16876 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
16877 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
16878 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
16879 }
16880
16881 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
16882 V2, Subtarget, DAG))
16883 return Shuf128;
16884
16885 // Try to use shift instructions.
16886 if (SDValue Shift =
16887 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
16888 DAG, /*BitwiseOnly*/ false))
16889 return Shift;
16890
16891 // Try to use VALIGN.
16892 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
16893 Zeroable, Subtarget, DAG))
16894 return Rotate;
16895
16896 // Try to use PALIGNR.
16897 if (Subtarget.hasBWI())
16898 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
16899 Subtarget, DAG))
16900 return Rotate;
16901
16902 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
16903 return Unpck;
16904
16905 // If we have AVX512F support, we can use VEXPAND.
16906 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
16907 DAG, Subtarget))
16908 return V;
16909
16910 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
16911 Zeroable, Subtarget, DAG))
16912 return Blend;
16913
16914 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
16915}
16916
16917/// Handle lowering of 16-lane 32-bit integer shuffles.
16918static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16919 const APInt &Zeroable, SDValue V1, SDValue V2,
16920 const X86Subtarget &Subtarget,
16921 SelectionDAG &DAG) {
16922 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16923 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16924 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16925
16926 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16927
16928 // Whenever we can lower this as a zext, that instruction is strictly faster
16929 // than any alternative. It also allows us to fold memory operands into the
16930 // shuffle in many cases.
16931 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16932 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16933 return ZExt;
16934
16935 // Try to use shift instructions if fast.
16936 if (Subtarget.preferLowerShuffleAsShift()) {
16937 if (SDValue Shift =
16938 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
16939 Subtarget, DAG, /*BitwiseOnly*/ true))
16940 return Shift;
16941 if (NumV2Elements == 0)
16942 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
16943 Subtarget, DAG))
16944 return Rotate;
16945 }
16946
16947 // If the shuffle mask is repeated in each 128-bit lane we can use more
16948 // efficient instructions that mirror the shuffles across the four 128-bit
16949 // lanes.
16950 SmallVector<int, 4> RepeatedMask;
16951 bool Is128BitLaneRepeatedShuffle =
16952 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
16953 if (Is128BitLaneRepeatedShuffle) {
16954 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16955 if (V2.isUndef())
16956 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
16957 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16958
16959 // Use dedicated unpack instructions for masks that match their pattern.
16960 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
16961 return V;
16962 }
16963
16964 // Try to use shift instructions.
16965 if (SDValue Shift =
16966 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
16967 Subtarget, DAG, /*BitwiseOnly*/ false))
16968 return Shift;
16969
16970 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
16971 if (SDValue Rotate =
16972 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
16973 return Rotate;
16974
16975 // Try to use VALIGN.
16976 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
16977 Zeroable, Subtarget, DAG))
16978 return Rotate;
16979
16980 // Try to use byte rotation instructions.
16981 if (Subtarget.hasBWI())
16982 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
16983 Subtarget, DAG))
16984 return Rotate;
16985
16986 // Assume that a single SHUFPS is faster than using a permv shuffle.
16987 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16988 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16989 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
16990 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
16991 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
16992 CastV1, CastV2, DAG);
16993 return DAG.getBitcast(MVT::v16i32, ShufPS);
16994 }
16995
16996 // Try to create an in-lane repeating shuffle mask and then shuffle the
16997 // results into the target lanes.
16998 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16999 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17000 return V;
17001
17002 // If we have AVX512F support, we can use VEXPAND.
17003 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
17004 DAG, Subtarget))
17005 return V;
17006
17007 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17008 Zeroable, Subtarget, DAG))
17009 return Blend;
17010
17011 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17012}
17013
17014/// Handle lowering of 32-lane 16-bit integer shuffles.
17015static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17016 const APInt &Zeroable, SDValue V1, SDValue V2,
17017 const X86Subtarget &Subtarget,
17018 SelectionDAG &DAG) {
17019 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17020 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17021 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17022 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17023
17024 // Whenever we can lower this as a zext, that instruction is strictly faster
17025 // than any alternative. It also allows us to fold memory operands into the
17026 // shuffle in many cases.
17027 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17028 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17029 return ZExt;
17030
17031 // Use dedicated unpack instructions for masks that match their pattern.
17032 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17033 return V;
17034
17035 // Use dedicated pack instructions for masks that match their pattern.
17036 if (SDValue V =
17037 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
17038 return V;
17039
17040 // Try to use shift instructions.
17041 if (SDValue Shift =
17042 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17043 Subtarget, DAG, /*BitwiseOnly*/ false))
17044 return Shift;
17045
17046 // Try to use byte rotation instructions.
17047 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17048 Subtarget, DAG))
17049 return Rotate;
17050
17051 if (V2.isUndef()) {
17052 // Try to use bit rotation instructions.
17053 if (SDValue Rotate =
17054 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17055 return Rotate;
17056
17057 SmallVector<int, 8> RepeatedMask;
17058 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17059 // As this is a single-input shuffle, the repeated mask should be
17060 // a strictly valid v8i16 mask that we can pass through to the v8i16
17061 // lowering to handle even the v32 case.
17062 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17063 RepeatedMask, Subtarget, DAG);
17064 }
17065 }
17066
17067 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17068 Zeroable, Subtarget, DAG))
17069 return Blend;
17070
17071 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17072 Zeroable, Subtarget, DAG))
17073 return PSHUFB;
17074
17075 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17076}
17077
17078/// Handle lowering of 64-lane 8-bit integer shuffles.
17079static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17080 const APInt &Zeroable, SDValue V1, SDValue V2,
17081 const X86Subtarget &Subtarget,
17082 SelectionDAG &DAG) {
17083 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17084 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17085 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17086 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17087
17088 // Whenever we can lower this as a zext, that instruction is strictly faster
17089 // than any alternative. It also allows us to fold memory operands into the
17090 // shuffle in many cases.
17091 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17092 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17093 return ZExt;
17094
17095 // Use dedicated unpack instructions for masks that match their pattern.
17096 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
17097 return V;
17098
17099 // Use dedicated pack instructions for masks that match their pattern.
17100 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
17101 Subtarget))
17102 return V;
17103
17104 // Try to use shift instructions.
17105 if (SDValue Shift =
17106 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17107 DAG, /*BitwiseOnly*/ false))
17108 return Shift;
17109
17110 // Try to use byte rotation instructions.
17111 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17112 Subtarget, DAG))
17113 return Rotate;
17114
17115 // Try to use bit rotation instructions.
17116 if (V2.isUndef())
17117 if (SDValue Rotate =
17118 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17119 return Rotate;
17120
17121 // Lower as AND if possible.
17122 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17123 Zeroable, Subtarget, DAG))
17124 return Masked;
17125
17126 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17127 Zeroable, Subtarget, DAG))
17128 return PSHUFB;
17129
17130 // Try to create an in-lane repeating shuffle mask and then shuffle the
17131 // results into the target lanes.
17132 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17133 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17134 return V;
17135
17136 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17137 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17138 return Result;
17139
17140 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17141 Zeroable, Subtarget, DAG))
17142 return Blend;
17143
17144 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17145 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17146 // PALIGNR will be cheaper than the second PSHUFB+OR.
17147 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17148 Mask, Subtarget, DAG))
17149 return V;
17150
17151 // If we can't directly blend but can use PSHUFB, that will be better as it
17152 // can both shuffle and set up the inefficient blend.
17153 bool V1InUse, V2InUse;
17154 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17155 DAG, V1InUse, V2InUse);
17156 }
17157
17158 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17159 // shuffle.
17160 if (!V2.isUndef())
17161 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17162 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17163 return Result;
17164
17165 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17166 if (Subtarget.hasVBMI())
17167 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17168
17169 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17170}
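// Illustrative sketch, not part of X86ISelLowering.cpp: several of the
// strategies above (byte rotation, PALIGNR) boil down to recognizing that a
// mask is a rotation of its source. A much-simplified, standalone model of
// that check for a single input (the real helpers also handle two inputs and
// match per 128-bit lane), with -1 encoding an undef lane:
#include <cstdio>
#include <vector>

static int matchRotation(const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  int Rot = -1;
  for (int i = 0; i != Size; ++i) {
    if (Mask[i] < 0)
      continue;                              // undef lane, ignore
    int r = (Mask[i] - i + Size) % Size;     // rotation implied by this lane
    if (Rot < 0)
      Rot = r;
    else if (Rot != r)
      return -1;                             // lanes disagree, not a rotation
  }
  return Rot;
}

int main() {
  std::printf("%d\n", matchRotation({2, 3, 0, 1}));  // prints 2
  std::printf("%d\n", matchRotation({2, 0, 3, 1}));  // prints -1
}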
17171
17172/// High-level routine to lower various 512-bit x86 vector shuffles.
17173///
17174/// This routine either breaks down the specific type of a 512-bit x86 vector
17175/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17176/// together based on the available instructions.
17177static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17178 MVT VT, SDValue V1, SDValue V2,
17179 const APInt &Zeroable,
17180 const X86Subtarget &Subtarget,
17181 SelectionDAG &DAG) {
17182 assert(Subtarget.hasAVX512() &&
17183 "Cannot lower 512-bit vectors w/ basic ISA!");
17184
17185 // If we have a single input to the zero element, insert that into V1 if we
17186 // can do so cheaply.
17187 int NumElts = Mask.size();
17188 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17189
17190 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17191 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17192 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17193 return Insertion;
17194
17195 // Handle special cases where the lower or upper half is UNDEF.
17196 if (SDValue V =
17197 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17198 return V;
17199
17200 // Check for being able to broadcast a single element.
17201 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17202 Subtarget, DAG))
17203 return Broadcast;
17204
17205 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17206 // Try using bit ops for masking and blending before falling back to
17207 // splitting.
17208 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17209 Subtarget, DAG))
17210 return V;
17211 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17212 return V;
17213
17214 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17215 }
17216
17217 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17218 if (!Subtarget.hasBWI())
17219 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17220 /*SimpleOnly*/ false);
17221
17222 V1 = DAG.getBitcast(MVT::v32i16, V1);
17223 V2 = DAG.getBitcast(MVT::v32i16, V2);
17224 return DAG.getBitcast(VT,
17225 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17226 }
17227
17228 // Dispatch to each element type for lowering. If we don't have support for
17229 // specific element type shuffles at 512 bits, immediately split them and
17230 // lower them. Each lowering routine of a given type is allowed to assume that
17231 // the requisite ISA extensions for that element type are available.
17232 switch (VT.SimpleTy) {
17233 case MVT::v8f64:
17234 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17235 case MVT::v16f32:
17236 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17237 case MVT::v8i64:
17238 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17239 case MVT::v16i32:
17240 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17241 case MVT::v32i16:
17242 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17243 case MVT::v64i8:
17244 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17245
17246 default:
17247 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17248 }
17249}
17250
17251static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17252 MVT VT, SDValue V1, SDValue V2,
17253 const X86Subtarget &Subtarget,
17254 SelectionDAG &DAG) {
17255 // Shuffle should be unary.
17256 if (!V2.isUndef())
17257 return SDValue();
17258
17259 int ShiftAmt = -1;
17260 int NumElts = Mask.size();
17261 for (int i = 0; i != NumElts; ++i) {
17262 int M = Mask[i];
17263 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17264 "Unexpected mask index.");
17265 if (M < 0)
17266 continue;
17267
17268 // The first non-undef element determines our shift amount.
17269 if (ShiftAmt < 0) {
17270 ShiftAmt = M - i;
17271 // Need to be shifting right.
17272 if (ShiftAmt <= 0)
17273 return SDValue();
17274 }
17275 // All non-undef elements must shift by the same amount.
17276 if (ShiftAmt != M - i)
17277 return SDValue();
17278 }
17279 assert(ShiftAmt >= 0 && "All undef?");
17280
17281 // Great, we found a right shift.
17282 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17283 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17284 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17285 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17286 DAG.getIntPtrConstant(0, DL));
17287}
17288
17289// Determine if this shuffle can be implemented with a KSHIFT instruction.
17290// Returns the shift amount if possible or -1 if not. This is a simplified
17291// version of matchShuffleAsShift.
17292static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17293 int MaskOffset, const APInt &Zeroable) {
17294 int Size = Mask.size();
17295
17296 auto CheckZeros = [&](int Shift, bool Left) {
17297 for (int j = 0; j < Shift; ++j)
17298 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17299 return false;
17300
17301 return true;
17302 };
17303
17304 auto MatchShift = [&](int Shift, bool Left) {
17305 unsigned Pos = Left ? Shift : 0;
17306 unsigned Low = Left ? 0 : Shift;
17307 unsigned Len = Size - Shift;
17308 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17309 };
17310
17311 for (int Shift = 1; Shift != Size; ++Shift)
17312 for (bool Left : {true, false})
17313 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17314 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17315 return Shift;
17316 }
17317
17318 return -1;
17319}
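// Illustrative sketch, not part of X86ISelLowering.cpp: what the matcher
// above accepts, shown for the KSHIFTR direction only. A right shift by S
// makes lanes [0, Size-S) a sequential run starting at S and requires the top
// S lanes to be zeroable. Standalone single-input model with an example:
#include <cstdio>
#include <vector>

static int matchRightShift(const std::vector<int> &Mask,
                           const std::vector<bool> &Zeroable) {
  int Size = (int)Mask.size();
  for (int Shift = 1; Shift != Size; ++Shift) {
    bool Ok = true;
    for (int i = 0; i != Size - Shift && Ok; ++i)
      Ok = Mask[i] < 0 || Mask[i] == i + Shift;   // sequential or undef
    for (int i = Size - Shift; i != Size && Ok; ++i)
      Ok = Zeroable[i];                           // shifted-in lanes are zero
    if (Ok)
      return Shift;
  }
  return -1;
}

int main() {
  // A v8i1 mask {2,3,4,5,6,7,zero,zero} is a KSHIFTR by 2.
  std::printf("%d\n", matchRightShift({2, 3, 4, 5, 6, 7, -1, -1},
                                      {false, false, false, false, false,
                                       false, true, true}));  // prints 2
}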
17320
17321
17322// Lower vXi1 vector shuffles.
17323// There is no dedicated instruction on AVX-512 that shuffles the masks.
17324// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17325// vector, shuffle, and then truncate it back.
17326static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17327 MVT VT, SDValue V1, SDValue V2,
17328 const APInt &Zeroable,
17329 const X86Subtarget &Subtarget,
17330 SelectionDAG &DAG) {
17331 assert(Subtarget.hasAVX512() &&
17332 "Cannot lower 512-bit vectors w/o basic ISA!");
17333
17334 int NumElts = Mask.size();
17335 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17336
17337 // Try to recognize shuffles that are just padding a subvector with zeros.
17338 int SubvecElts = 0;
17339 int Src = -1;
17340 for (int i = 0; i != NumElts; ++i) {
17341 if (Mask[i] >= 0) {
17342 // Grab the source from the first valid mask. All subsequent elements need
17343 // to use this same source.
17344 if (Src < 0)
17345 Src = Mask[i] / NumElts;
17346 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17347 break;
17348 }
17349
17350 ++SubvecElts;
17351 }
17352 assert(SubvecElts != NumElts && "Identity shuffle?");
17353
17354 // Clip to a power of 2.
17355 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17356
17357 // Make sure the number of zeroable bits in the top at least covers the bits
17358 // not covered by the subvector.
17359 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17360 assert(Src >= 0 && "Expected a source!");
17361 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17362 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
17363 Src == 0 ? V1 : V2,
17364 DAG.getIntPtrConstant(0, DL));
17365 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17366 DAG.getConstant(0, DL, VT),
17367 Extract, DAG.getIntPtrConstant(0, DL));
17368 }
17369
17370 // Try a simple shift right with undef elements. Later we'll try with zeros.
17371 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
17372 DAG))
17373 return Shift;
17374
17375 // Try to match KSHIFTs.
17376 unsigned Offset = 0;
17377 for (SDValue V : { V1, V2 }) {
17378 unsigned Opcode;
17379 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
17380 if (ShiftAmt >= 0) {
17381 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
17382 MVT WideVT = Res.getSimpleValueType();
17383 // Widened right shifts need two shifts to ensure we shift in zeroes.
17384 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
17385 int WideElts = WideVT.getVectorNumElements();
17386 // Shift left to put the original vector in the MSBs of the new size.
17387 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
17388 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17389 // Increase the shift amount to account for the left shift.
17390 ShiftAmt += WideElts - NumElts;
17391 }
17392
17393 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17394 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17395 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17396 DAG.getIntPtrConstant(0, DL));
17397 }
17398 Offset += NumElts; // Increment for next iteration.
17399 }
17400
17401 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
17402 // ops instead.
17403 // TODO: What other unary shuffles would benefit from this?
17404 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
17405 SDValue Op0 = V1.getOperand(0);
17406 SDValue Op1 = V1.getOperand(1);
17407 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17408 EVT OpVT = Op0.getValueType();
17409 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
17410 return DAG.getSetCC(
17411 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
17412 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
17413 }
17414
17415 MVT ExtVT;
17416 switch (VT.SimpleTy) {
17417 default:
17418 llvm_unreachable("Expected a vector of i1 elements");
17419 case MVT::v2i1:
17420 ExtVT = MVT::v2i64;
17421 break;
17422 case MVT::v4i1:
17423 ExtVT = MVT::v4i32;
17424 break;
17425 case MVT::v8i1:
17426 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
17427 // shuffle.
17428 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17429 break;
17430 case MVT::v16i1:
17431 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17432 // 256-bit operation available.
17433 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
17434 break;
17435 case MVT::v32i1:
17436 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17437 // 256-bit operation available.
17438 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17439 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
17440 break;
17441 case MVT::v64i1:
17442 // Fall back to scalarization. FIXME: We can do better if the shuffle
17443 // can be partitioned cleanly.
17444 if (!Subtarget.useBWIRegs())
17445 return SDValue();
17446 ExtVT = MVT::v64i8;
17447 break;
17448 }
17449
17450 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
17451 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
17452
17453 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
17454 // i1 was sign-extended, so we can rebuild the mask with a signed compare.
17455 int NumElems = VT.getVectorNumElements();
17456 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17457 (Subtarget.hasDQI() && (NumElems < 32)))
17458 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
17459 Shuffle, ISD::SETGT);
17460
17461 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
17462}
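// Illustrative sketch, not part of X86ISelLowering.cpp: the fallback at the
// end of lower1BitShuffle widens each i1 lane to an integer lane by
// sign-extension (true -> -1, false -> 0), shuffles the wide lanes, then
// rebuilds the mask with a sign test, which is what the final
// getSetCC(0, Shuffle, SETGT) models. Scalar sketch of the same dataflow:
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>

template <std::size_t N>
static std::array<bool, N> shuffleMaskBits(const std::array<bool, N> &V,
                                           const std::array<int, N> &Mask) {
  std::array<int32_t, N> Ext{}, Shuf{};
  for (std::size_t i = 0; i != N; ++i)
    Ext[i] = V[i] ? -1 : 0;                               // ISD::SIGN_EXTEND
  for (std::size_t i = 0; i != N; ++i)
    Shuf[i] = Mask[i] < 0 ? 0 : Ext[(std::size_t)Mask[i]]; // wide shuffle
  std::array<bool, N> Res{};
  for (std::size_t i = 0; i != N; ++i)
    Res[i] = Shuf[i] < 0;                                 // 0 > x, sign test
  return Res;
}

int main() {
  auto R = shuffleMaskBits<4>({true, false, true, false}, {3, 2, 1, 0});
  for (bool B : R)
    std::printf("%d", B ? 1 : 0);                         // prints 0101
  std::printf("\n");
}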
17463
17464/// Helper function that returns true if the shuffle mask should be
17465/// commuted to improve canonicalization.
17466static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
17467 int NumElements = Mask.size();
17468
17469 int NumV1Elements = 0, NumV2Elements = 0;
17470 for (int M : Mask)
17471 if (M < 0)
17472 continue;
17473 else if (M < NumElements)
17474 ++NumV1Elements;
17475 else
17476 ++NumV2Elements;
17477
17478 // Commute the shuffle as needed such that more elements come from V1 than
17479 // V2. This allows us to match the shuffle pattern strictly on how many
17480 // elements come from V1 without handling the symmetric cases.
17481 if (NumV2Elements > NumV1Elements)
17482 return true;
17483
17484 assert(NumV1Elements > 0 && "No V1 indices");
17485
17486 if (NumV2Elements == 0)
17487 return false;
17488
17489 // When the number of V1 and V2 elements is the same, try to minimize the
17490 // number of uses of V2 in the low half of the vector. When that is tied,
17491 // ensure that the sum of indices for V1 is equal to or lower than the sum of
17492 // indices for V2. When those are equal, try to ensure that the number of odd
17493 // indices for V1 is lower than the number of odd indices for V2.
17494 if (NumV1Elements == NumV2Elements) {
17495 int LowV1Elements = 0, LowV2Elements = 0;
17496 for (int M : Mask.slice(0, NumElements / 2))
17497 if (M >= NumElements)
17498 ++LowV2Elements;
17499 else if (M >= 0)
17500 ++LowV1Elements;
17501 if (LowV2Elements > LowV1Elements)
17502 return true;
17503 if (LowV2Elements == LowV1Elements) {
17504 int SumV1Indices = 0, SumV2Indices = 0;
17505 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17506 if (Mask[i] >= NumElements)
17507 SumV2Indices += i;
17508 else if (Mask[i] >= 0)
17509 SumV1Indices += i;
17510 if (SumV2Indices < SumV1Indices)
17511 return true;
17512 if (SumV2Indices == SumV1Indices) {
17513 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17514 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17515 if (Mask[i] >= NumElements)
17516 NumV2OddIndices += i % 2;
17517 else if (Mask[i] >= 0)
17518 NumV1OddIndices += i % 2;
17519 if (NumV2OddIndices < NumV1OddIndices)
17520 return true;
17521 }
17522 }
17523 }
17524
17525 return false;
17526}
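// Illustrative sketch, not part of X86ISelLowering.cpp: when the helper above
// returns true, the caller swaps V1/V2 and commutes the mask the way
// ShuffleVectorSDNode::commuteMask does. Standalone model plus a worked
// example:
#include <cstdio>
#include <vector>

static void commuteShuffleMask(std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  for (int &M : Mask)
    if (M >= NumElts)
      M -= NumElts;          // a V2 index becomes the matching V1 index
    else if (M >= 0)
      M += NumElts;          // a V1 index becomes the matching V2 index
}

int main() {
  std::vector<int> Mask = {4, 5, 6, 0};  // three V2 elements, one V1 element
  commuteShuffleMask(Mask);              // after the swap: {0, 1, 2, 4}
  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n");
}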
17527
17528static bool canCombineAsMaskOperation(SDValue V,
17529 const X86Subtarget &Subtarget) {
17530 if (!Subtarget.hasAVX512())
17531 return false;
17532
17533 if (!V.getValueType().isSimple())
17534 return false;
17535
17536 MVT VT = V.getSimpleValueType().getScalarType();
17537 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
17538 return false;
17539
17540 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
17541 // are preferable to blendw/blendvb/masked-mov.
17542 if ((VT == MVT::i16 || VT == MVT::i8) &&
17543 V.getSimpleValueType().getSizeInBits() < 512)
17544 return false;
17545
17546 auto HasMaskOperation = [&](SDValue V) {
17547 // TODO: Currently we only check a limited set of opcodes. We could probably
17548 // extend it to all binary operations by checking TLI.isBinOp().
17549 switch (V->getOpcode()) {
17550 default:
17551 return false;
17552 case ISD::ADD:
17553 case ISD::SUB:
17554 case ISD::AND:
17555 case ISD::XOR:
17556 case ISD::OR:
17557 case ISD::SMAX:
17558 case ISD::SMIN:
17559 case ISD::UMAX:
17560 case ISD::UMIN:
17561 case ISD::ABS:
17562 case ISD::SHL:
17563 case ISD::SRL:
17564 case ISD::SRA:
17565 case ISD::MUL:
17566 break;
17567 }
17568 if (!V->hasOneUse())
17569 return false;
17570
17571 return true;
17572 };
17573
17574 if (HasMaskOperation(V))
17575 return true;
17576
17577 return false;
17578}
17579
17580// Forward declaration.
17581static SDValue canonicalizeShuffleMaskWithHorizOp(
17582 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
17583 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
17584 const X86Subtarget &Subtarget);
17585
17586/// Top-level lowering for x86 vector shuffles.
17587///
17588/// This handles decomposition, canonicalization, and lowering of all x86
17589/// vector shuffles. Most of the specific lowering strategies are encapsulated
17590/// above in helper routines. The canonicalization attempts to widen shuffles
17591/// to involve fewer lanes of wider elements, consolidate symmetric patterns
17592/// s.t. only one of the two inputs needs to be tested, etc.
17593static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
17594 SelectionDAG &DAG) {
17595 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17596 ArrayRef<int> OrigMask = SVOp->getMask();
17597 SDValue V1 = Op.getOperand(0);
17598 SDValue V2 = Op.getOperand(1);
17599 MVT VT = Op.getSimpleValueType();
17600 int NumElements = VT.getVectorNumElements();
17601 SDLoc DL(Op);
17602 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17603
17604 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17605 "Can't lower MMX shuffles");
17606
17607 bool V1IsUndef = V1.isUndef();
17608 bool V2IsUndef = V2.isUndef();
17609 if (V1IsUndef && V2IsUndef)
17610 return DAG.getUNDEF(VT);
17611
17612 // When we create a shuffle node we put the UNDEF node as the second operand,
17613 // but in some cases the first operand may be transformed to UNDEF.
17614 // In this case we should just commute the node.
17615 if (V1IsUndef)
17616 return DAG.getCommutedVectorShuffle(*SVOp);
17617
17618 // Check for non-undef masks pointing at an undef vector and make the masks
17619 // undef as well. This makes it easier to match the shuffle based solely on
17620 // the mask.
17621 if (V2IsUndef &&
17622 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17623 SmallVector<int, 8> NewMask(OrigMask);
17624 for (int &M : NewMask)
17625 if (M >= NumElements)
17626 M = -1;
17627 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17628 }
17629
17630 // Check for illegal shuffle mask element index values.
17631 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17632 (void)MaskUpperLimit;
17633 assert(llvm::all_of(OrigMask,
17634 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17635 "Out of bounds shuffle index");
17636
17637 // We actually see shuffles that are entirely re-arrangements of a set of
17638 // zero inputs. This mostly happens while decomposing complex shuffles into
17639 // simple ones. Directly lower these as a buildvector of zeros.
17640 APInt KnownUndef, KnownZero;
17641 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17642
17643 APInt Zeroable = KnownUndef | KnownZero;
17644 if (Zeroable.isAllOnes())
17645 return getZeroVector(VT, Subtarget, DAG, DL);
17646
17647 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
17648
17649 // Try to collapse shuffles into using a vector type with fewer elements but
17650 // wider element types. We cap this to not form integers or floating point
17651 // elements wider than 64 bits. It does not seem beneficial to form i128
17652 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
17653 SmallVector<int, 16> WidenedMask;
17654 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17655 !canCombineAsMaskOperation(V1, Subtarget) &&
17656 !canCombineAsMaskOperation(V2, Subtarget) &&
17657 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
17658 // Shuffle mask widening should not interfere with a broadcast opportunity
17659 // by obfuscating the operands with bitcasts.
17660 // TODO: Avoid lowering directly from this top-level function: make this
17661 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17662 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
17663 Subtarget, DAG))
17664 return Broadcast;
17665
17666 MVT NewEltVT = VT.isFloatingPoint()
17667 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
17668 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
17669 int NewNumElts = NumElements / 2;
17670 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
17671 // Make sure that the new vector type is legal. For example, v2f64 isn't
17672 // legal on SSE1.
17673 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
17674 if (V2IsZero) {
17675 // Modify the new Mask to take all zeros from the all-zero vector.
17676 // Choose indices that are blend-friendly.
17677 bool UsedZeroVector = false;
17678 assert(is_contained(WidenedMask, SM_SentinelZero) &&
17679 "V2's non-undef elements are used?!");
17680 for (int i = 0; i != NewNumElts; ++i)
17681 if (WidenedMask[i] == SM_SentinelZero) {
17682 WidenedMask[i] = i + NewNumElts;
17683 UsedZeroVector = true;
17684 }
17685 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
17686 // some elements to be undef.
17687 if (UsedZeroVector)
17688 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
17689 }
17690 V1 = DAG.getBitcast(NewVT, V1);
17691 V2 = DAG.getBitcast(NewVT, V2);
17692 return DAG.getBitcast(
17693 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
17694 }
17695 }
17696
17697 SmallVector<SDValue> Ops = {V1, V2};
17698 SmallVector<int> Mask(OrigMask);
17699
17700 // Canonicalize the shuffle with any horizontal ops inputs.
17701 // NOTE: This may update Ops and Mask.
17702 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
17703 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
17704 return DAG.getBitcast(VT, HOp);
17705
17706 V1 = DAG.getBitcast(VT, Ops[0]);
17707 V2 = DAG.getBitcast(VT, Ops[1]);
17708 assert(NumElements == (int)Mask.size() &&
17709 "canonicalizeShuffleMaskWithHorizOp "
17710 "shouldn't alter the shuffle mask size");
17711
17712 // Commute the shuffle if it will improve canonicalization.
17713 if (canonicalizeShuffleMaskWithCommute(Mask)) {
17714 ShuffleVectorSDNode::commuteMask(Mask);
17715 std::swap(V1, V2);
17716 }
17717
17718 // For each vector width, delegate to a specialized lowering routine.
17719 if (VT.is128BitVector())
17720 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17721
17722 if (VT.is256BitVector())
17723 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17724
17725 if (VT.is512BitVector())
17726 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17727
17728 if (Is1BitVector)
17729 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17730
17731 llvm_unreachable("Unimplemented!");
17732}
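// Illustrative sketch, not part of X86ISelLowering.cpp: the element-widening
// step in lowerVECTOR_SHUFFLE only fires when every pair of adjacent mask
// entries is either entirely undef or an even-aligned, consecutive pair from
// one source. Simplified model of that legality check (the real
// canWidenShuffleElements also handles zeroable and half-undef pairs):
#include <cstddef>
#include <cstdio>
#include <optional>
#include <vector>

static std::optional<std::vector<int>>
tryWidenMask(const std::vector<int> &Mask) {
  std::vector<int> Wide;
  for (std::size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo < 0 && Hi < 0)
      Wide.push_back(-1);                      // both halves undef
    else if (Lo >= 0 && Lo % 2 == 0 && Hi == Lo + 1)
      Wide.push_back(Lo / 2);                  // aligned consecutive pair
    else
      return std::nullopt;                     // cannot widen this mask
  }
  return Wide;
}

int main() {
  if (auto W = tryWidenMask({0, 1, 6, 7}))     // widens to {0, 3}
    std::printf("%d %d\n", (*W)[0], (*W)[1]);
  if (!tryWidenMask({1, 2, 4, 5}))             // not even-aligned pairs
    std::printf("no widening\n");
}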
17733
17734/// Try to lower a VSELECT instruction to a vector shuffle.
17735static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
17736 const X86Subtarget &Subtarget,
17737 SelectionDAG &DAG) {
17738 SDValue Cond = Op.getOperand(0);
17739 SDValue LHS = Op.getOperand(1);
17740 SDValue RHS = Op.getOperand(2);
17741 MVT VT = Op.getSimpleValueType();
17742
17743 // Only non-legal VSELECTs reach this lowering, convert those into generic
17744 // shuffles and re-use the shuffle lowering path for blends.
17745 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
17746 SmallVector<int, 32> Mask;
17747 if (createShuffleMaskFromVSELECT(Mask, Cond))
17748 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
17749 }
17750
17751 return SDValue();
17752}
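// Illustrative sketch, not part of X86ISelLowering.cpp: with an all-constant
// condition a VSELECT is a fixed per-lane choice, so it can be encoded as a
// shuffle over the concatenation of LHS and RHS. Lane i selects LHS[i]
// (index i) when the condition is true and RHS[i] (index i + NumElts)
// otherwise.
#include <cstdio>
#include <vector>

static std::vector<int> maskFromConstantCondition(const std::vector<bool> &C) {
  int NumElts = (int)C.size();
  std::vector<int> Mask(NumElts);
  for (int i = 0; i != NumElts; ++i)
    Mask[i] = C[i] ? i : i + NumElts;  // true -> LHS lane, false -> RHS lane
  return Mask;
}

int main() {
  for (int M : maskFromConstantCondition({true, false, false, true}))
    std::printf("%d ", M);             // prints 0 5 6 3
  std::printf("\n");
}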
17753
17754SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
17755 SDValue Cond = Op.getOperand(0);
17756 SDValue LHS = Op.getOperand(1);
17757 SDValue RHS = Op.getOperand(2);
17758
17759 SDLoc dl(Op);
17760 MVT VT = Op.getSimpleValueType();
17761 if (isSoftF16(VT, Subtarget)) {
17762 MVT NVT = VT.changeVectorElementTypeToInteger();
17763 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
17764 DAG.getBitcast(NVT, LHS),
17765 DAG.getBitcast(NVT, RHS)));
17766 }
17767
17768 // A vselect where all conditions and data are constants can be optimized into
17769 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
17770 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
17771 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
17772 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
17773 return SDValue();
17774
17775 // Try to lower this to a blend-style vector shuffle. This can handle all
17776 // constant condition cases.
17777 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
17778 return BlendOp;
17779
17780 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
17781 // with patterns on the mask registers on AVX-512.
17782 MVT CondVT = Cond.getSimpleValueType();
17783 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
17784 if (CondEltSize == 1)
17785 return Op;
17786
17787 // Variable blends are only legal from SSE4.1 onward.
17788 if (!Subtarget.hasSSE41())
17789 return SDValue();
17790
17791 unsigned EltSize = VT.getScalarSizeInBits();
17792 unsigned NumElts = VT.getVectorNumElements();
17793
17794 // Expand v32i16/v64i8 without BWI.
17795 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
17796 return SDValue();
17797
17798 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
17799 // into an i1 condition so that we can use the mask-based 512-bit blend
17800 // instructions.
17801 if (VT.getSizeInBits() == 512) {
17802 // Build a mask by testing the condition against zero.
17803 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
17804 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
17805 DAG.getConstant(0, dl, CondVT),
17806 ISD::SETNE);
17807 // Now return a new VSELECT using the mask.
17808 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
17809 }
17810
17811 // SEXT/TRUNC cases where the mask doesn't match the destination size.
17812 if (CondEltSize != EltSize) {
17813 // If we don't have a sign splat, rely on the expansion.
17814 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
17815 return SDValue();
17816
17817 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
17818 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
17819 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
17820 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
17821 }
17822
17823 // Only some types will be legal on some subtargets. If we can emit a legal
17824 // VSELECT-matching blend, return Op; but if we need to expand, return
17825 // a null value.
17826 switch (VT.SimpleTy) {
17827 default:
17828 // Most of the vector types have blends past SSE4.1.
17829 return Op;
17830
17831 case MVT::v32i8:
17832 // The byte blends for AVX vectors were introduced only in AVX2.
17833 if (Subtarget.hasAVX2())
17834 return Op;
17835
17836 return SDValue();
17837
17838 case MVT::v8i16:
17839 case MVT::v16i16: {
17840 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
17841 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
17842 Cond = DAG.getBitcast(CastVT, Cond);
17843 LHS = DAG.getBitcast(CastVT, LHS);
17844 RHS = DAG.getBitcast(CastVT, RHS);
17845 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
17846 return DAG.getBitcast(VT, Select);
17847 }
17848 }
17849}
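// Illustrative sketch, not part of X86ISelLowering.cpp: the v8i16/v16i16 case
// above can bitcast the whole select to vXi8 because, by x86's vector-boolean
// convention, each 16-bit condition lane reaching this point is all-zeros or
// all-ones, so both bytes of a lane still make the same LHS/RHS choice under
// a byte-wise blend. Scalar model for one lane:
#include <cassert>
#include <cstdint>
#include <cstdio>

static void splitCondLane(uint16_t Cond16, uint8_t &Lo, uint8_t &Hi) {
  assert(Cond16 == 0x0000 || Cond16 == 0xFFFF); // sign-splat precondition
  Lo = (uint8_t)(Cond16 & 0xFF);                // low byte of the condition
  Hi = (uint8_t)(Cond16 >> 8);                  // high byte of the condition
}

int main() {
  uint8_t Lo, Hi;
  splitCondLane(0xFFFF, Lo, Hi);
  std::printf("%02x %02x\n", Lo, Hi);           // ff ff: both bytes pick LHS
}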
17850
17851static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
17852 MVT VT = Op.getSimpleValueType();
17853 SDValue Vec = Op.getOperand(0);
17854 SDValue Idx = Op.getOperand(1);
17855 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
17856 SDLoc dl(Op);
17857
17858 if (!Vec.getSimpleValueType().is128BitVector())
17859 return SDValue();
17860
17861 if (VT.getSizeInBits() == 8) {
17862 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
17863 // we're going to zero extend the register or fold the store.
17864 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
17865 !X86::mayFoldIntoStore(Op))
17866 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
17867 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17868 DAG.getBitcast(MVT::v4i32, Vec), Idx));
17869
17870 unsigned IdxVal = Idx->getAsZExtVal();
17871 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
17872 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17873 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17874 }
17875
17876 if (VT == MVT::f32) {
17877 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
17878 // the result back to FR32 register. It's only worth matching if the
17879 // result has a single use which is a store or a bitcast to i32. And in
17880 // the case of a store, it's not worth it if the index is a constant 0,
17881 // because a MOVSSmr can be used instead, which is smaller and faster.
17882 if (!Op.hasOneUse())
17883 return SDValue();
17884 SDNode *User = *Op.getNode()->use_begin();
17885 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
17886 (User->getOpcode() != ISD::BITCAST ||
17887 User->getValueType(0) != MVT::i32))
17888 return SDValue();
17889 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17890 DAG.getBitcast(MVT::v4i32, Vec), Idx);
17891 return DAG.getBitcast(MVT::f32, Extract);
17892 }
17893
17894 if (VT == MVT::i32 || VT == MVT::i64)
17895 return Op;
17896
17897 return SDValue();
17898}
17899
17900/// Extract one bit from mask vector, like v16i1 or v8i1.
17901/// AVX-512 feature.
17902static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
17903 const X86Subtarget &Subtarget) {
17904 SDValue Vec = Op.getOperand(0);
17905 SDLoc dl(Vec);
17906 MVT VecVT = Vec.getSimpleValueType();
17907 SDValue Idx = Op.getOperand(1);
17908 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
17909 MVT EltVT = Op.getSimpleValueType();
17910
17911 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
17912 "Unexpected vector type in ExtractBitFromMaskVector");
17913
17914 // A variable index can't be handled in mask registers,
17915 // so extend the vector to VR512/VR128.
17916 if (!IdxC) {
17917 unsigned NumElts = VecVT.getVectorNumElements();
17918 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
17919 // than extending to 128/256-bit.
17920 if (NumElts == 1) {
17921 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
17922 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
17923 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
17924 }
17925 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
17926 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
17927 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
17928 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
17929 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
17930 }
17931
17932 unsigned IdxVal = IdxC->getZExtValue();
17933 if (IdxVal == 0) // the operation is legal
17934 return Op;
17935
17936 // Extend to natively supported kshift.
17937 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
17938
17939 // Use kshiftr instruction to move to the lower element.
17940 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
17941 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17942
17943 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
17944 DAG.getIntPtrConstant(0, dl));
17945}
17946
17947// Helper to find all the extracted elements from a vector.
17948static APInt getExtractedDemandedElts(SDNode *N) {
17949 MVT VT = N->getSimpleValueType(0);
17950 unsigned NumElts = VT.getVectorNumElements();
17951 APInt DemandedElts = APInt::getZero(NumElts);
17952 for (SDNode *User : N->uses()) {
17953 switch (User->getOpcode()) {
17954 case X86ISD::PEXTRB:
17955 case X86ISD::PEXTRW:
17956 case ISD::EXTRACT_VECTOR_ELT:
17957 if (!isa<ConstantSDNode>(User->getOperand(1))) {
17958 DemandedElts.setAllBits();
17959 return DemandedElts;
17960 }
17961 DemandedElts.setBit(User->getConstantOperandVal(1));
17962 break;
17963 case ISD::BITCAST: {
17964 if (!User->getValueType(0).isSimple() ||
17965 !User->getValueType(0).isVector()) {
17966 DemandedElts.setAllBits();
17967 return DemandedElts;
17968 }
17969 APInt DemandedSrcElts = getExtractedDemandedElts(User);
17970 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
17971 break;
17972 }
17973 default:
17974 DemandedElts.setAllBits();
17975 return DemandedElts;
17976 }
17977 }
17978 return DemandedElts;
17979}
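// Illustrative sketch, not part of X86ISelLowering.cpp: for the BITCAST case
// above, a user's demanded elements have to be rescaled to this vector's lane
// granularity, which is what APIntOps::ScaleBitMask does. Simplified model
// for the direction where the user sees fewer, wider lanes:
#include <cstddef>
#include <cstdio>
#include <vector>

static std::vector<bool> scaleDemanded(const std::vector<bool> &WideDemanded,
                                       std::size_t NarrowCount) {
  std::size_t Ratio = NarrowCount / WideDemanded.size();
  std::vector<bool> Narrow(NarrowCount, false);
  for (std::size_t i = 0; i != NarrowCount; ++i)
    Narrow[i] = WideDemanded[i / Ratio];   // one wide lane covers Ratio lanes
  return Narrow;
}

int main() {
  // Demanding dword 1 of a v4i32 view demands bytes 4..7 of the v16i8 source.
  std::vector<bool> Bytes = scaleDemanded({false, true, false, false}, 16);
  for (std::size_t i = 0; i != Bytes.size(); ++i)
    if (Bytes[i])
      std::printf("%zu ", i);              // prints 4 5 6 7
  std::printf("\n");
}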
17980
17981SDValue
17982X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
17983 SelectionDAG &DAG) const {
17984 SDLoc dl(Op);
17985 SDValue Vec = Op.getOperand(0);
17986 MVT VecVT = Vec.getSimpleValueType();
17987 SDValue Idx = Op.getOperand(1);
17988 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
17989
17990 if (VecVT.getVectorElementType() == MVT::i1)
17991 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
17992
17993 if (!IdxC) {
17994 // It's more profitable to go through memory (1 cycle throughput)
17995 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
17996 // The IACA tool was used to get the performance estimate
17997 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
17998 //
17999 // example : extractelement <16 x i8> %a, i32 %i
18000 //
18001 // Block Throughput: 3.00 Cycles
18002 // Throughput Bottleneck: Port5
18003 //
18004 // | Num Of | Ports pressure in cycles | |
18005 // | Uops | 0 - DV | 5 | 6 | 7 | |
18006 // ---------------------------------------------
18007 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18008 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18009 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18010 // Total Num Of Uops: 4
18011 //
18012 //
18013 // Block Throughput: 1.00 Cycles
18014 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18015 //
18016 // | | Ports pressure in cycles | |
18017 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18018 // ---------------------------------------------------------
18019 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18020 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18021 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18022 // Total Num Of Uops: 4
18023
18024 return SDValue();
18025 }
18026
18027 unsigned IdxVal = IdxC->getZExtValue();
18028
18029 // If this is a 256-bit vector result, first extract the 128-bit vector and
18030 // then extract the element from the 128-bit vector.
18031 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18032 // Get the 128-bit vector.
18033 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18034 MVT EltVT = VecVT.getVectorElementType();
18035
18036 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18037 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18038
18039 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18040 // this can be done with a mask.
18041 IdxVal &= ElemsPerChunk - 1;
18042 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18043 DAG.getIntPtrConstant(IdxVal, dl));
18044 }
18045
18046 assert(VecVT.is128BitVector() && "Unexpected vector length");
18047
18048 MVT VT = Op.getSimpleValueType();
18049
18050 if (VT == MVT::i16) {
18051 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18052 // we're going to zero extend the register or fold the store (SSE41 only).
18053 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18054 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18055 if (Subtarget.hasFP16())
18056 return Op;
18057
18058 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18059 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18060 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18061 }
18062
18063 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18064 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18065 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18066 }
18067
18068 if (Subtarget.hasSSE41())
18069 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18070 return Res;
18071
18072 // Only extract a single element from a v16i8 source - determine the common
18073 // DWORD/WORD that all extractions share, and extract the sub-byte.
18074 // TODO: Add QWORD MOVQ extraction?
18075 if (VT == MVT::i8) {
18076 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18077 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18078
18079 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18080 int DWordIdx = IdxVal / 4;
18081 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18082 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18083 DAG.getBitcast(MVT::v4i32, Vec),
18084 DAG.getIntPtrConstant(DWordIdx, dl));
18085 int ShiftVal = (IdxVal % 4) * 8;
18086 if (ShiftVal != 0)
18087 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18088 DAG.getConstant(ShiftVal, dl, MVT::i8));
18089 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18090 }
18091
18092 int WordIdx = IdxVal / 2;
18093 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18094 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18095 DAG.getBitcast(MVT::v8i16, Vec),
18096 DAG.getIntPtrConstant(WordIdx, dl));
18097 int ShiftVal = (IdxVal % 2) * 8;
18098 if (ShiftVal != 0)
18099 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18100 DAG.getConstant(ShiftVal, dl, MVT::i8));
18101 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18102 }
18103 }
18104
18105 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18106 if (IdxVal == 0)
18107 return Op;
18108
18109 // Shuffle the element to the lowest element, then movss or movsh.
18110 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18111 Mask[0] = static_cast<int>(IdxVal);
18112 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18113 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18114 DAG.getIntPtrConstant(0, dl));
18115 }
18116
18117 if (VT.getSizeInBits() == 64) {
18118 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18119 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18120 // to match extract_elt for f64.
18121 if (IdxVal == 0)
18122 return Op;
18123
18124 // UNPCKHPD the element to the lowest double word, then movsd.
18125 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18126 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18127 int Mask[2] = { 1, -1 };
18128 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18129 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18130 DAG.getIntPtrConstant(0, dl));
18131 }
18132
18133 return SDValue();
18134}
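// Illustrative sketch, not part of X86ISelLowering.cpp: the v16i8 path above
// avoids PEXTRB by extracting a containing 32-bit (or 16-bit) chunk and
// shifting the requested byte down; the SRL + TRUNCATE nodes model exactly
// this. Scalar equivalent for the DWORD case on a little-endian layout:
#include <cstdint>
#include <cstdio>

static uint8_t extractByteViaDword(const uint32_t Dwords[4], unsigned IdxVal) {
  uint32_t Chunk = Dwords[IdxVal / 4];   // EXTRACT_VECTOR_ELT of the i32 view
  unsigned ShiftVal = (IdxVal % 4) * 8;  // byte position inside the dword
  return (uint8_t)(Chunk >> ShiftVal);   // SRL + TRUNCATE
}

int main() {
  // A v16i8 vector {0, 1, 2, ..., 15} viewed as v4i32 on x86 (little-endian).
  const uint32_t V[4] = {0x03020100u, 0x07060504u, 0x0B0A0908u, 0x0F0E0D0Cu};
  std::printf("%u\n", extractByteViaDword(V, 6));  // prints 6
}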
18135
18136/// Insert one bit to mask vector, like v16i1 or v8i1.
18137/// AVX-512 feature.
18138static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18139 const X86Subtarget &Subtarget) {
18140 SDLoc dl(Op);
18141 SDValue Vec = Op.getOperand(0);
18142 SDValue Elt = Op.getOperand(1);
18143 SDValue Idx = Op.getOperand(2);
18144 MVT VecVT = Vec.getSimpleValueType();
18145
18146 if (!isa<ConstantSDNode>(Idx)) {
18147 // Non-constant index. Extend the source and destination,
18148 // insert the element, and then truncate the result.
18149 unsigned NumElts = VecVT.getVectorNumElements();
18150 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18151 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18152 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18153 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18154 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18155 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18156 }
18157
18158 // Copy into a k-register, extract to v1i1 and insert_subvector.
18159 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18160 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18161}
18162
18163SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18164 SelectionDAG &DAG) const {
18165 MVT VT = Op.getSimpleValueType();
18166 MVT EltVT = VT.getVectorElementType();
18167 unsigned NumElts = VT.getVectorNumElements();
18168 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18169
18170 if (EltVT == MVT::i1)
18171 return InsertBitToMaskVector(Op, DAG, Subtarget);
18172
18173 SDLoc dl(Op);
18174 SDValue N0 = Op.getOperand(0);
18175 SDValue N1 = Op.getOperand(1);
18176 SDValue N2 = Op.getOperand(2);
18177 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18178
18179 if (EltVT == MVT::bf16) {
18180 MVT IVT = VT.changeVectorElementTypeToInteger();
18181 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18182 DAG.getBitcast(IVT, N0),
18183 DAG.getBitcast(MVT::i16, N1), N2);
18184 return DAG.getBitcast(VT, Res);
18185 }
18186
18187 if (!N2C) {
18188 // Variable insertion indices, usually we're better off spilling to stack,
18189 // but AVX512 can use a variable compare+select by comparing against all
18190 // possible vector indices, and FP insertion has less gpr->simd traffic.
18191 if (!(Subtarget.hasBWI() ||
18192 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18193 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18194 return SDValue();
18195
18196 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18197 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18198 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18199 return SDValue();
18200
18201 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18202 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18203 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18204
18205 SmallVector<SDValue, 16> RawIndices;
18206 for (unsigned I = 0; I != NumElts; ++I)
18207 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18208 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18209
18210 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18211 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18212 ISD::CondCode::SETEQ);
18213 }
18214
18215 if (N2C->getAPIntValue().uge(NumElts))
18216 return SDValue();
18217 uint64_t IdxVal = N2C->getZExtValue();
18218
18219 bool IsZeroElt = X86::isZeroNode(N1);
18220 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18221
18222 if (IsZeroElt || IsAllOnesElt) {
18223 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18224 // We don't deal with i8 0 since it appears to be handled elsewhere.
18225 if (IsAllOnesElt &&
18226 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18227 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18228 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18229 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18230 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18231 CstVectorElts[IdxVal] = OnesCst;
18232 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18233 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18234 }
18235 // See if we can do this more efficiently with a blend shuffle with a
18236 // rematerializable vector.
18237 if (Subtarget.hasSSE41() &&
18238 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18239 SmallVector<int, 8> BlendMask;
18240 for (unsigned i = 0; i != NumElts; ++i)
18241 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18242 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18243 : getOnesVector(VT, DAG, dl);
18244 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18245 }
18246 }
18247
18248 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18249 // into that, and then insert the subvector back into the result.
18250 if (VT.is256BitVector() || VT.is512BitVector()) {
18251 // With a 256-bit vector, we can insert into the zero element efficiently
18252 // using a blend if we have AVX or AVX2 and the right data type.
18253 if (VT.is256BitVector() && IdxVal == 0) {
18254 // TODO: It is worthwhile to cast integer to floating point and back
18255 // and incur a domain crossing penalty if that's what we'll end up
18256 // doing anyway after extracting to a 128-bit vector.
18257 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18258 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18259 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18260 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18261 DAG.getTargetConstant(1, dl, MVT::i8));
18262 }
18263 }
18264
18265 unsigned NumEltsIn128 = 128 / EltSizeInBits;
18266 assert(isPowerOf2_32(NumEltsIn128) &&
18267 "Vectors will always have power-of-two number of elements.");
18268
18269 // If we are not inserting into the low 128-bit vector chunk,
18270 // then prefer the broadcast+blend sequence.
18271 // FIXME: relax the profitability check iff all N1 uses are insertions.
18272 if (IdxVal >= NumEltsIn128 &&
18273 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
18274 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
18275 X86::mayFoldLoad(N1, Subtarget)))) {
18276 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
18277 SmallVector<int, 8> BlendMask;
18278 for (unsigned i = 0; i != NumElts; ++i)
18279 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18280 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
18281 }
18282
18283 // Get the desired 128-bit vector chunk.
18284 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18285
18286 // Insert the element into the desired chunk.
18287 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18288 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18289
18290 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18291 DAG.getIntPtrConstant(IdxIn128, dl));
18292
18293 // Insert the changed part back into the bigger vector
18294 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18295 }
18296 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18297
18298 // This will be just movw/movd/movq/movsh/movss/movsd.
18299 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18300 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18301 EltVT == MVT::f16 || EltVT == MVT::i64) {
18302 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18303 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18304 }
18305
18306 // We can't directly insert an i8 or i16 into a vector, so zero extend
18307 // it to i32 first.
18308 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18309 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18310 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
18311 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18312 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18313 return DAG.getBitcast(VT, N1);
18314 }
18315 }
18316
18317 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
18318 // argument. SSE41 is required for pinsrb.
18319 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18320 unsigned Opc;
18321 if (VT == MVT::v8i16) {
18322 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18323 Opc = X86ISD::PINSRW;
18324 } else {
18325 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18326 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18327 Opc = X86ISD::PINSRB;
18328 }
18329
18330 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18331 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18332 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
18333 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18334 }
18335
18336 if (Subtarget.hasSSE41()) {
18337 if (EltVT == MVT::f32) {
18338 // Bits [7:6] of the constant are the source select. This will always be
18339 // zero here. The DAG Combiner may combine an extract_elt index into
18340 // these bits. For example (insert (extract, 3), 2) could be matched by
18341 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18342 // Bits [5:4] of the constant are the destination select. This is the
18343 // value of the incoming immediate.
18344 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18345 // combine either bitwise AND or insert of float 0.0 to set these bits.
18346
18347 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18348 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
18349 // If this is an insertion of 32-bits into the low 32-bits of
18350 // a vector, we prefer to generate a blend with immediate rather
18351 // than an insertps. Blends are simpler operations in hardware and so
18352 // will always have equal or better performance than insertps.
18353 // But if optimizing for size and there's a load folding opportunity,
18354 // generate insertps because blendps does not have a 32-bit memory
18355 // operand form.
18356 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18357 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18358 DAG.getTargetConstant(1, dl, MVT::i8));
18359 }
18360 // Create this as a scalar to vector.
18361 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18362 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18363 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18364 }
18365
18366 // PINSR* works with constant index.
18367 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18368 return Op;
18369 }
18370
18371 return SDValue();
18372}
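// Illustrative sketch, not part of X86ISelLowering.cpp: the variable-index
// path above rewrites
//   inselt N0, N1, N2  -->  select (SplatN2 == {0,1,2,...}) ? SplatN1 : N0,
// i.e. every lane compares its own index against the splatted insertion index
// and picks either the new element or the old lane. Scalar model:
#include <cstddef>
#include <cstdio>
#include <vector>

template <typename T>
static void insertVariableIndex(std::vector<T> &Vec, T Elt, std::size_t Idx) {
  for (std::size_t i = 0; i != Vec.size(); ++i)
    Vec[i] = (i == Idx) ? Elt : Vec[i];  // per-lane compare + select
}

int main() {
  std::vector<int> V = {10, 20, 30, 40};
  insertVariableIndex(V, 99, 2);
  for (int X : V)
    std::printf("%d ", X);               // prints 10 20 99 40
  std::printf("\n");
}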
18373
18374static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
18375 SelectionDAG &DAG) {
18376 SDLoc dl(Op);
18377 MVT OpVT = Op.getSimpleValueType();
18378
18379 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
18380 // further combines.
18381 if (X86::isZeroNode(Op.getOperand(0)))
18382 return getZeroVector(OpVT, Subtarget, DAG, dl);
18383
18384 // If this is a 256-bit vector result, first insert into a 128-bit
18385 // vector and then insert into the 256-bit vector.
18386 if (!OpVT.is128BitVector()) {
18387 // Insert into a 128-bit vector.
18388 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18389 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
18390 OpVT.getVectorNumElements() / SizeFactor);
18391
18392 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18393
18394 // Insert the 128-bit vector.
18395 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18396 }
18397 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18398 "Expected an SSE type!");
18399
18400 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
18401 // tblgen.
18402 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
18403 return Op;
18404
18405 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18406 return DAG.getBitcast(
18407 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18408}
18409
18410// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18411// simple superregister reference or explicit instructions to insert
18412// the upper bits of a vector.
18413static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18414 SelectionDAG &DAG) {
18415 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18416
18417 return insert1BitVector(Op, DAG, Subtarget);
18418}
18419
18420static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18421 SelectionDAG &DAG) {
18422 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18423 "Only vXi1 extract_subvectors need custom lowering");
18424
18425 SDLoc dl(Op);
18426 SDValue Vec = Op.getOperand(0);
18427 uint64_t IdxVal = Op.getConstantOperandVal(1);
18428
18429 if (IdxVal == 0) // the operation is legal
18430 return Op;
18431
18432 // Extend to natively supported kshift.
18433 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18434
18435 // Shift to the LSB.
18436 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18437 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18438
18439 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18440 DAG.getIntPtrConstant(0, dl));
18441}
18442
18443// Returns the appropriate wrapper opcode for a global reference.
18444unsigned X86TargetLowering::getGlobalWrapperKind(
18445 const GlobalValue *GV, const unsigned char OpFlags) const {
18446 // References to absolute symbols are never PC-relative.
18447 if (GV && GV->isAbsoluteSymbolRef())
18448 return X86ISD::Wrapper;
18449
18450 // The following OpFlags under RIP-rel PIC use RIP.
18451 if (Subtarget.isPICStyleRIPRel() &&
18452 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
18453 OpFlags == X86II::MO_DLLIMPORT))
18454 return X86ISD::WrapperRIP;
18455
18456 // GOTPCREL references must always use RIP.
18457 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
18458 return X86ISD::WrapperRIP;
18459
18460 return X86ISD::Wrapper;
18461}
18462
18463// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18464// their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
18465// one of the above-mentioned nodes. It has to be wrapped because otherwise
18466// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18467// be used to form an addressing mode. These wrapped nodes will be selected
18468// into MOV32ri.
18469SDValue
18470X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18471 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18472
18473 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18474 // global base reg.
18475 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18476
18477 auto PtrVT = getPointerTy(DAG.getDataLayout());
18478 SDValue Result = DAG.getTargetConstantPool(
18479 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18480 SDLoc DL(CP);
18481 Result =
18482 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18483 // With PIC, the address is actually $g + Offset.
18484 if (OpFlag) {
18485 Result =
18486 DAG.getNode(ISD::ADD, DL, PtrVT,
18487 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18488 }
18489
18490 return Result;
18491}
18492
18493SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18494 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18495
18496 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18497 // global base reg.
18498 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18499
18500 auto PtrVT = getPointerTy(DAG.getDataLayout());
18501 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18502 SDLoc DL(JT);
18503 Result =
18504 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18505
18506 // With PIC, the address is actually $g + Offset.
18507 if (OpFlag)
18508 Result =
18509 DAG.getNode(ISD::ADD, DL, PtrVT,
18510 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18511
18512 return Result;
18513}
18514
18515SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
18516 SelectionDAG &DAG) const {
18517 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18518}
18519
18520SDValue
18521X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
18522 // Create the TargetBlockAddressAddress node.
18523 unsigned char OpFlags =
18524 Subtarget.classifyBlockAddressReference();
18525 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18526 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18527 SDLoc dl(Op);
18528 auto PtrVT = getPointerTy(DAG.getDataLayout());
18529 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
18530 Result =
18531 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
18532
18533 // With PIC, the address is actually $g + Offset.
18534 if (isGlobalRelativeToPICBase(OpFlags)) {
18535 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18536 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18537 }
18538
18539 return Result;
18540}
18541
18542/// Creates target global address or external symbol nodes for calls or
18543/// other uses.
18544SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
18545 bool ForCall) const {
18546 // Unpack the global address or external symbol.
18547 const SDLoc &dl = SDLoc(Op);
18548 const GlobalValue *GV = nullptr;
18549 int64_t Offset = 0;
18550 const char *ExternalSym = nullptr;
18551 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18552 GV = G->getGlobal();
18553 Offset = G->getOffset();
18554 } else {
18555 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18556 ExternalSym = ES->getSymbol();
18557 }
18558
18559 // Calculate some flags for address lowering.
18560 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
18561 unsigned char OpFlags;
18562 if (ForCall)
18563 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
18564 else
18565 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
18566 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
18567 bool NeedsLoad = isGlobalStubReference(OpFlags);
18568
18569 CodeModel::Model M = DAG.getTarget().getCodeModel();
18570 auto PtrVT = getPointerTy(DAG.getDataLayout());
18571 SDValue Result;
18572
18573 if (GV) {
18574 // Create a target global address if this is a global. If possible, fold the
18575 // offset into the global address reference. Otherwise, ADD it on later.
18576 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
18577 // allowed because if the address of foo is 0, the ELF R_X86_64_32
18578 // relocation will compute to a negative value, which is invalid.
18579 int64_t GlobalOffset = 0;
18580 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
18581 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
18582 std::swap(GlobalOffset, Offset);
18583 }
18584 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
18585 } else {
18586 // If this is not a global address, this must be an external symbol.
18587 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
18588 }
18589
18590 // If this is a direct call, avoid the wrapper if we don't need to do any
18591 // loads or adds. This allows SDAG ISel to match direct calls.
18592 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
18593 return Result;
18594
18595 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
18596
18597 // With PIC, the address is actually $g + Offset.
18598 if (HasPICReg) {
18599 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18600 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18601 }
18602
18603 // For globals that require a load from a stub to get the address, emit the
18604 // load.
18605 if (NeedsLoad)
18606 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
18607 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18608
18609 // If there was a non-zero offset that we didn't fold, create an explicit
18610 // addition for it.
18611 if (Offset != 0)
18612 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
18613 DAG.getConstant(Offset, dl, PtrVT));
18614
18615 return Result;
18616}
18617
18618SDValue
18619X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
18620 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18621}
18622
18623static SDValue
18624 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
18625 SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
18626 unsigned char OperandFlags, bool LocalDynamic = false) {
18627 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18628 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18629 SDLoc dl(GA);
18630 SDValue TGA;
18631 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
18632 if (LocalDynamic && UseTLSDESC) {
18633 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
18634 auto UI = TGA->use_begin();
18635 // Reuse existing GetTLSADDR node if we can find it.
18636 if (UI != TGA->use_end())
18637 return SDValue(*UI->use_begin()->use_begin(), 0);
18638 } else {
18639 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18640 GA->getOffset(), OperandFlags);
18641 }
18642
18643 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
18644 : LocalDynamic ? X86ISD::TLSBASEADDR
18645 : X86ISD::TLSADDR;
18646
18647 if (InGlue) {
18648 SDValue Ops[] = { Chain, TGA, *InGlue };
18649 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18650 } else {
18651 SDValue Ops[] = { Chain, TGA };
18652 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18653 }
18654
18655 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
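// For reference (illustrative; the pseudos are expanded after isel and the
// exact sequence depends on the target): on x86-64 ELF, TLSADDR for the
// general-dynamic model becomes roughly
//   leaq  x@tlsgd(%rip), %rdi
//   call  __tls_get_addr@PLT        // address returned in %rax
// while TLSDESC becomes roughly
//   leaq  x@tlsdesc(%rip), %rax
//   call  *x@tlscall(%rax)          // returns an offset from the thread base
// which is why the TLSDESC path below also adds the %fs/%gs thread pointer.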
18656 MFI.setAdjustsStack(true);
18657 MFI.setHasCalls(true);
18658
18659 SDValue Glue = Chain.getValue(1);
18660 SDValue Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
18661
18662 if (!UseTLSDESC)
18663 return Ret;
18664
18665 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
18666 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
18667
18668 Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
18669 SDValue Offset =
18670 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18671 MachinePointerInfo(Ptr));
18672 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
18673}
18674
18675// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
18676static SDValue
18677 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18678 const EVT PtrVT) {
18679 SDValue InGlue;
18680 SDLoc dl(GA); // ? function entry point might be better
18681 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18682 DAG.getNode(X86ISD::GlobalBaseReg,
18683 SDLoc(), PtrVT), InGlue);
18684 InGlue = Chain.getValue(1);
18685
18686 return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
18687}
18688
18689// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
18690static SDValue
18691 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18692 const EVT PtrVT) {
18693 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18694 X86::RAX, X86II::MO_TLSGD);
18695}
18696
18697// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
18698static SDValue
18699 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18700 const EVT PtrVT) {
18701 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18702 X86::EAX, X86II::MO_TLSGD);
18703}
18704
18705 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
18706 SelectionDAG &DAG, const EVT PtrVT,
18707 bool Is64Bit, bool Is64BitLP64) {
18708 SDLoc dl(GA);
18709
18710 // Get the start address of the TLS block for this module.
18711 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
18712 .getInfo<X86MachineFunctionInfo>();
18713 MFI->incNumLocalDynamicTLSAccesses();
18714
18715 SDValue Base;
18716 if (Is64Bit) {
18717 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
18718 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
18719 X86II::MO_TLSLD, /*LocalDynamic=*/true);
18720 } else {
18721 SDValue InGlue;
18722 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18723 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
18724 InGlue = Chain.getValue(1);
18725 Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
18726 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
18727 }
18728
18729 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
18730 // of Base.
18731
18732 // Build x@dtpoff.
18733 unsigned char OperandFlags = X86II::MO_DTPOFF;
18734 unsigned WrapperKind = X86ISD::Wrapper;
18735 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18736 GA->getValueType(0),
18737 GA->getOffset(), OperandFlags);
18738 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18739
18740 // Add x@dtpoff with the base.
18741 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
18742}
18743
18744// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
18745 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18746 const EVT PtrVT, TLSModel::Model model,
18747 bool is64Bit, bool isPIC) {
18748 SDLoc dl(GA);
18749
18750 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
18751 Value *Ptr = Constant::getNullValue(
18752 PointerType::get(*DAG.getContext(), is64Bit ? 257 : 256));
18753
18754 SDValue ThreadPointer =
18755 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18756 MachinePointerInfo(Ptr));
18757
18758 unsigned char OperandFlags = 0;
18759 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
18760 // initialexec.
18761 unsigned WrapperKind = X86ISD::Wrapper;
18762 if (model == TLSModel::LocalExec) {
18763 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
18764 } else if (model == TLSModel::InitialExec) {
18765 if (is64Bit) {
18766 OperandFlags = X86II::MO_GOTTPOFF;
18767 WrapperKind = X86ISD::WrapperRIP;
18768 } else {
18769 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
18770 }
18771 } else {
18772 llvm_unreachable("Unexpected model");
18773 }
18774
18775 // emit "addl x@ntpoff,%eax" (local exec)
18776 // or "addl x@indntpoff,%eax" (initial exec)
18777 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
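// and, for the 64-bit counterparts (illustrative):
//   "movq %fs:0, %rax; leaq x@tpoff(%rax), %rax"      (local exec)
//   "movq %fs:0, %rax; addq x@gottpoff(%rip), %rax"   (initial exec)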
18778 SDValue TGA =
18779 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18780 GA->getOffset(), OperandFlags);
18781 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18782
18783 if (model == TLSModel::InitialExec) {
18784 if (isPIC && !is64Bit) {
18785 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
18786 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18787 Offset);
18788 }
18789
18790 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
18791 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18792 }
18793
18794 // The address of the thread local variable is the add of the thread
18795 // pointer with the offset of the variable.
18796 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
18797}
18798
18799SDValue
18800X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
18801
18802 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
18803
18804 if (DAG.getTarget().useEmulatedTLS())
18805 return LowerToTLSEmulatedModel(GA, DAG);
18806
18807 const GlobalValue *GV = GA->getGlobal();
18808 auto PtrVT = getPointerTy(DAG.getDataLayout());
18809 bool PositionIndependent = isPositionIndependent();
18810
18811 if (Subtarget.isTargetELF()) {
18812 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
18813 switch (model) {
18814 case TLSModel::GeneralDynamic:
18815 if (Subtarget.is64Bit()) {
18816 if (Subtarget.isTarget64BitLP64())
18817 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
18818 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
18819 }
18820 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
18821 case TLSModel::LocalDynamic:
18822 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
18823 Subtarget.isTarget64BitLP64());
18824 case TLSModel::InitialExec:
18825 case TLSModel::LocalExec:
18826 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
18827 PositionIndependent);
18828 }
18829 llvm_unreachable("Unknown TLS model.");
18830 }
18831
18832 if (Subtarget.isTargetDarwin()) {
18833 // Darwin only has one model of TLS. Lower to that.
18834 unsigned char OpFlag = 0;
18835 unsigned WrapperKind = 0;
18836
18837 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18838 // global base reg.
18839 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
18840 if (PIC32) {
18841 OpFlag = X86II::MO_TLVP_PIC_BASE;
18842 WrapperKind = X86ISD::Wrapper;
18843 } else {
18844 OpFlag = X86II::MO_TLVP;
18845 WrapperKind = X86ISD::WrapperRIP;
18846 }
18847 SDLoc DL(Op);
18848 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
18849 GA->getValueType(0),
18850 GA->getOffset(), OpFlag);
18851 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
18852
18853 // With PIC32, the address is actually $g + Offset.
18854 if (PIC32)
18855 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
18856 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18857 Offset);
18858
18859 // Lowering the machine isd will make sure everything is in the right
18860 // location.
18861 SDValue Chain = DAG.getEntryNode();
18862 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18863 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
18864 SDValue Args[] = { Chain, Offset };
18865 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
18866 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
18867
18868 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
18869 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18870 MFI.setAdjustsStack(true);
18871
18872 // And our return value (tls address) is in the standard call return value
18873 // location.
18874 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
18875 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
18876 }
18877
18878 if (Subtarget.isOSWindows()) {
18879 // Just use the implicit TLS architecture
18880 // Need to generate something similar to:
18881 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
18882 // ; from TEB
18883 // mov ecx, dword [rel _tls_index] ; Load index (from C runtime)
18884 // mov rcx, qword [rdx+rcx*8]
18885 // mov eax, .tls$:tlsvar
18886 // [rax+rcx] contains the address
18887 // Windows 64bit: gs:0x58
18888 // Windows 32bit: fs:__tls_array
18889
18890 SDLoc dl(GA);
18891 SDValue Chain = DAG.getEntryNode();
18892
18893 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
18894 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
18895 // use its literal value of 0x2C.
18896 Value *Ptr = Constant::getNullValue(
18897 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), 256)
18898 : PointerType::get(*DAG.getContext(), 257));
18899
18900 SDValue TlsArray = Subtarget.is64Bit()
18901 ? DAG.getIntPtrConstant(0x58, dl)
18902 : (Subtarget.isTargetWindowsGNU()
18903 ? DAG.getIntPtrConstant(0x2C, dl)
18904 : DAG.getExternalSymbol("_tls_array", PtrVT));
18905
18906 SDValue ThreadPointer =
18907 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
18908
18909 SDValue res;
18910 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
18911 res = ThreadPointer;
18912 } else {
18913 // Load the _tls_index variable
18914 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
18915 if (Subtarget.is64Bit())
18916 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
18917 MachinePointerInfo(), MVT::i32);
18918 else
18919 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
18920
18921 const DataLayout &DL = DAG.getDataLayout();
18922 SDValue Scale =
18923 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
18924 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
18925
18926 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
18927 }
18928
18929 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
18930
18931 // Get the offset of start of .tls section
18932 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18933 GA->getValueType(0),
18934 GA->getOffset(), X86II::MO_SECREL);
18935 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
18936
18937 // The address of the thread local variable is the add of the thread
18938 // pointer with the offset of the variable.
18939 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
18940 }
18941
18942 llvm_unreachable("TLS not implemented for this target.");
18943}
18944
18945 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
18946 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
18947 const TargetMachine &TM = getTargetMachine();
18948 TLSModel::Model Model = TM.getTLSModel(&GV);
18949 switch (Model) {
18950 case TLSModel::LocalExec:
18951 case TLSModel::InitialExec:
18952 // We can include the %fs segment register in addressing modes.
18953 return true;
18954 case TLSModel::GeneralDynamic:
18955 case TLSModel::LocalDynamic:
18956 // These models do not result in %fs relative addresses unless
18957 // TLS descriptors are used.
18958 //
18959 // Even in the case of TLS descriptors we currently have no way to model
18960 // the difference between %fs access and the computations needed for the
18961 // offset, and returning `true` for TLS-desc currently duplicates both,
18962 // which is detrimental :-/
18963 return false;
18964 }
18965 }
18966 return false;
18967}
18968
18969/// Lower SRA_PARTS and friends, which return two i32 values
18970/// and take a 2 x i32 value to shift plus a shift amount.
18971/// TODO: Can this be moved to general expansion code?
18972 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
18973 SDValue Lo, Hi;
18974 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
18975 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
18976}
18977
18978// Try to use a packed vector operation to handle i64 on 32-bit targets when
18979// AVX512DQ is enabled.
18980 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
18981 SelectionDAG &DAG,
18982 const X86Subtarget &Subtarget) {
18983 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
18984 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
18985 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
18986 Op.getOpcode() == ISD::UINT_TO_FP) &&
18987 "Unexpected opcode!");
18988 bool IsStrict = Op->isStrictFPOpcode();
18989 unsigned OpNo = IsStrict ? 1 : 0;
18990 SDValue Src = Op.getOperand(OpNo);
18991 MVT SrcVT = Src.getSimpleValueType();
18992 MVT VT = Op.getSimpleValueType();
18993
18994 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
18995 (VT != MVT::f32 && VT != MVT::f64))
18996 return SDValue();
18997
18998 // Pack the i64 into a vector, do the operation and extract.
18999
19000 // Using 256-bit to ensure result is 128-bits for f32 case.
19001 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19002 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19003 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19004
19005 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19006 if (IsStrict) {
19007 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19008 {Op.getOperand(0), InVec});
19009 SDValue Chain = CvtVec.getValue(1);
19010 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19011 DAG.getIntPtrConstant(0, dl));
19012 return DAG.getMergeValues({Value, Chain}, dl);
19013 }
19014
19015 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19016
19017 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19018 DAG.getIntPtrConstant(0, dl));
19019}
19020
19021// Try to use a packed vector operation to handle i64 on 32-bit targets.
19022 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19023 const X86Subtarget &Subtarget) {
19024 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19025 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19026 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19027 Op.getOpcode() == ISD::UINT_TO_FP) &&
19028 "Unexpected opcode!");
19029 bool IsStrict = Op->isStrictFPOpcode();
19030 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19031 MVT SrcVT = Src.getSimpleValueType();
19032 MVT VT = Op.getSimpleValueType();
19033
19034 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19035 return SDValue();
19036
19037 // Pack the i64 into a vector, do the operation and extract.
19038
19039 assert(Subtarget.hasFP16() && "Expected FP16");
19040
19041 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19042 if (IsStrict) {
19043 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19044 {Op.getOperand(0), InVec});
19045 SDValue Chain = CvtVec.getValue(1);
19046 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19047 DAG.getIntPtrConstant(0, dl));
19048 return DAG.getMergeValues({Value, Chain}, dl);
19049 }
19050
19051 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19052
19053 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19054 DAG.getIntPtrConstant(0, dl));
19055}
19056
19057static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19058 const X86Subtarget &Subtarget) {
19059 switch (Opcode) {
19060 case ISD::SINT_TO_FP:
19061 // TODO: Handle wider types with AVX/AVX512.
19062 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19063 return false;
19064 // CVTDQ2PS or (V)CVTDQ2PD
19065 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19066
19067 case ISD::UINT_TO_FP:
19068 // TODO: Handle wider types and i64 elements.
19069 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19070 return false;
19071 // VCVTUDQ2PS or VCVTUDQ2PD
19072 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19073
19074 default:
19075 return false;
19076 }
19077}
19078
19079/// Given a scalar cast operation that is extracted from a vector, try to
19080/// vectorize the cast op followed by extraction. This will avoid an expensive
19081/// round-trip between XMM and GPR.
19082 static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19083 SelectionDAG &DAG,
19084 const X86Subtarget &Subtarget) {
19085 // TODO: This could be enhanced to handle smaller integer types by peeking
19086 // through an extend.
19087 SDValue Extract = Cast.getOperand(0);
19088 MVT DestVT = Cast.getSimpleValueType();
19089 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19090 !isa<ConstantSDNode>(Extract.getOperand(1)))
19091 return SDValue();
19092
19093 // See if we have a 128-bit vector cast op for this type of cast.
19094 SDValue VecOp = Extract.getOperand(0);
19095 MVT FromVT = VecOp.getSimpleValueType();
19096 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19097 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19098 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19099 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19100 return SDValue();
19101
19102 // If we are extracting from a non-zero element, first shuffle the source
19103 // vector to allow extracting from element zero.
19104 if (!isNullConstant(Extract.getOperand(1))) {
19105 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19106 Mask[0] = Extract.getConstantOperandVal(1);
19107 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19108 }
19109 // If the source vector is wider than 128-bits, extract the low part. Do not
19110 // create an unnecessarily wide vector cast op.
19111 if (FromVT != Vec128VT)
19112 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19113
19114 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19115 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
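// E.g. (illustrative IR, assuming SSE2):
//   %e = extractelement <4 x i32> %v, i32 0
//   %f = sitofp i32 %e to float
// becomes a single cvtdq2ps feeding the extract:
//   %c = sitofp <4 x i32> %v to <4 x float>
//   %f = extractelement <4 x float> %c, i32 0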
19116 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19117 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19118 DAG.getIntPtrConstant(0, DL));
19119}
19120
19121/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19122/// try to vectorize the cast ops. This will avoid an expensive round-trip
19123/// between XMM and GPR.
19124static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19125 SelectionDAG &DAG,
19126 const X86Subtarget &Subtarget) {
19127 // TODO: Allow FP_TO_UINT.
19128 SDValue CastToInt = CastToFP.getOperand(0);
19129 MVT VT = CastToFP.getSimpleValueType();
19130 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19131 return SDValue();
19132
19133 MVT IntVT = CastToInt.getSimpleValueType();
19134 SDValue X = CastToInt.getOperand(0);
19135 MVT SrcVT = X.getSimpleValueType();
19136 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19137 return SDValue();
19138
19139 // See if we have 128-bit vector cast instructions for this type of cast.
19140 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19141 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19142 IntVT != MVT::i32)
19143 return SDValue();
19144
19145 unsigned SrcSize = SrcVT.getSizeInBits();
19146 unsigned IntSize = IntVT.getSizeInBits();
19147 unsigned VTSize = VT.getSizeInBits();
19148 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19149 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19150 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19151
19152 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19153 unsigned ToIntOpcode =
19154 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19155 unsigned ToFPOpcode =
19156 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19157
19158 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19159 //
19160 // We are not defining the high elements (for example, zero them) because
19161 // that could nullify any performance advantage that we hoped to gain from
19162 // this vector op hack. We do not expect any adverse effects (like denorm
19163 // penalties) with cast ops.
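// For example (illustrative): a scalar round-trip such as
//   double t = (double)(int)x;   // x is a double
// becomes cvttpd2dq + cvtdq2pd on the low vector lane, so the value never
// leaves the XMM register file.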
19164 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
19165 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19166 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19167 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19168 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19169}
19170
19171 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19172 SelectionDAG &DAG,
19173 const X86Subtarget &Subtarget) {
19174 bool IsStrict = Op->isStrictFPOpcode();
19175 MVT VT = Op->getSimpleValueType(0);
19176 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19177
19178 if (Subtarget.hasDQI()) {
19179 assert(!Subtarget.hasVLX() && "Unexpected features");
19180
19181 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19182 Src.getSimpleValueType() == MVT::v4i64) &&
19183 "Unsupported custom type");
19184
19185 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19186 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19187 "Unexpected VT!");
19188 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19189
19190 // Need to concat with zero vector for strict fp to avoid spurious
19191 // exceptions.
19192 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19193 : DAG.getUNDEF(MVT::v8i64);
19194 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19195 DAG.getIntPtrConstant(0, DL));
19196 SDValue Res, Chain;
19197 if (IsStrict) {
19198 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19199 {Op->getOperand(0), Src});
19200 Chain = Res.getValue(1);
19201 } else {
19202 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19203 }
19204
19205 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19206 DAG.getIntPtrConstant(0, DL));
19207
19208 if (IsStrict)
19209 return DAG.getMergeValues({Res, Chain}, DL);
19210 return Res;
19211 }
19212
19213 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19214 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19215 if (VT != MVT::v4f32 || IsSigned)
19216 return SDValue();
19217
19218 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19219 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19220 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19221 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19222 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19223 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19224 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19225 SmallVector<SDValue, 4> SignCvts(4);
19226 SmallVector<SDValue, 4> Chains(4);
19227 for (int i = 0; i != 4; ++i) {
19228 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19229 DAG.getIntPtrConstant(i, DL));
19230 if (IsStrict) {
19231 SignCvts[i] =
19232 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19233 {Op.getOperand(0), Elt});
19234 Chains[i] = SignCvts[i].getValue(1);
19235 } else {
19236 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19237 }
19238 }
19239 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19240
19241 SDValue Slow, Chain;
19242 if (IsStrict) {
19243 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
19244 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
19245 {Chain, SignCvt, SignCvt});
19246 Chain = Slow.getValue(1);
19247 } else {
19248 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
19249 }
19250
19251 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
19252 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
19253
19254 if (IsStrict)
19255 return DAG.getMergeValues({Cvt, Chain}, DL);
19256
19257 return Cvt;
19258}
19259
19260 static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
19261 SelectionDAG &DAG) {
19262 bool IsStrict = Op->isStrictFPOpcode();
19263 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19264 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19265 MVT VT = Op.getSimpleValueType();
19266 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
19267
19268 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
19269 if (IsStrict)
19270 return DAG.getNode(
19271 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
19272 {Chain,
19273 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
19274 Rnd});
19275 return DAG.getNode(ISD::FP_ROUND, dl, VT,
19276 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
19277}
19278
19279static bool isLegalConversion(MVT VT, bool IsSigned,
19280 const X86Subtarget &Subtarget) {
19281 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
19282 return true;
19283 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
19284 return true;
19285 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
19286 return true;
19287 if (Subtarget.useAVX512Regs()) {
19288 if (VT == MVT::v16i32)
19289 return true;
19290 if (VT == MVT::v8i64 && Subtarget.hasDQI())
19291 return true;
19292 }
19293 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
19294 (VT == MVT::v2i64 || VT == MVT::v4i64))
19295 return true;
19296 return false;
19297}
19298
19299SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19300 SelectionDAG &DAG) const {
19301 bool IsStrict = Op->isStrictFPOpcode();
19302 unsigned OpNo = IsStrict ? 1 : 0;
19303 SDValue Src = Op.getOperand(OpNo);
19304 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19305 MVT SrcVT = Src.getSimpleValueType();
19306 MVT VT = Op.getSimpleValueType();
19307 SDLoc dl(Op);
19308
19309 if (isSoftF16(VT, Subtarget))
19310 return promoteXINT_TO_FP(Op, dl, DAG);
19311 else if (isLegalConversion(SrcVT, true, Subtarget))
19312 return Op;
19313
19314 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19315 return LowerWin64_INT128_TO_FP(Op, DAG);
19316
19317 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19318 return Extract;
19319
19320 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
19321 return R;
19322
19323 if (SrcVT.isVector()) {
19324 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19325 // Note: Since v2f64 is a legal type, we don't need to zero extend the
19326 // source for strict FP.
19327 if (IsStrict)
19328 return DAG.getNode(
19329 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19330 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19331 DAG.getUNDEF(SrcVT))});
19332 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19333 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19334 DAG.getUNDEF(SrcVT)));
19335 }
19336 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19337 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19338
19339 return SDValue();
19340 }
19341
19342 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19343 "Unknown SINT_TO_FP to lower!");
19344
19345 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19346
19347 // These are really Legal; return the operand so the caller accepts it as
19348 // Legal.
19349 if (SrcVT == MVT::i32 && UseSSEReg)
19350 return Op;
19351 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19352 return Op;
19353
19354 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19355 return V;
19356 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19357 return V;
19358
19359 // SSE doesn't have an i16 conversion so we need to promote.
19360 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19361 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19362 if (IsStrict)
19363 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19364 {Chain, Ext});
19365
19366 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19367 }
19368
19369 if (VT == MVT::f128 || !Subtarget.hasX87())
19370 return SDValue();
19371
19372 SDValue ValueToStore = Src;
19373 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19374 // Bitcasting to f64 here allows us to do a single 64-bit store from
19375 // an SSE register, avoiding the store forwarding penalty that would come
19376 // with two 32-bit stores.
19377 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19378
19379 unsigned Size = SrcVT.getStoreSize();
19380 Align Alignment(Size);
19381 MachineFunction &MF = DAG.getMachineFunction();
19382 auto PtrVT = getPointerTy(MF.getDataLayout());
19383 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
19384 MachinePointerInfo MPI =
19385 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19386 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19387 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
19388 std::pair<SDValue, SDValue> Tmp =
19389 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
19390
19391 if (IsStrict)
19392 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19393
19394 return Tmp.first;
19395}
19396
19397std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
19398 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
19399 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
19400 // Build the FILD
19401 SDVTList Tys;
19402 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
19403 if (useSSE)
19404 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19405 else
19406 Tys = DAG.getVTList(DstVT, MVT::Other);
19407
19408 SDValue FILDOps[] = {Chain, Pointer};
19409 SDValue Result =
19410 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
19411 Alignment, MachineMemOperand::MOLoad);
19412 Chain = Result.getValue(1);
19413
19414 if (useSSE) {
19415 MachineFunction &MF = DAG.getMachineFunction();
19416 unsigned SSFISize = DstVT.getStoreSize();
19417 int SSFI =
19418 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
19419 auto PtrVT = getPointerTy(MF.getDataLayout());
19420 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19421 Tys = DAG.getVTList(MVT::Other);
19422 SDValue FSTOps[] = {Chain, Result, StackSlot};
19423 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
19424 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
19425 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
19426
19427 Chain =
19428 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
19429 Result = DAG.getLoad(
19430 DstVT, DL, Chain, StackSlot,
19431 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
19432 Chain = Result.getValue(1);
19433 }
19434
19435 return { Result, Chain };
19436}
19437
19438/// Horizontal vector math instructions may be slower than normal math with
19439/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19440/// implementation, and likely shuffle complexity of the alternate sequence.
19441static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19442 const X86Subtarget &Subtarget) {
19443 bool IsOptimizingSize = DAG.shouldOptForSize();
19444 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19445 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19446}
19447
19448/// 64-bit unsigned integer to double expansion.
19449 static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
19450 SelectionDAG &DAG,
19451 const X86Subtarget &Subtarget) {
19452 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
19453 // when converting 0 while rounding toward negative infinity. The caller will
19454 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
19455 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19456 // This algorithm is not obvious. Here it is what we're trying to output:
19457 /*
19458 movq %rax, %xmm0
19459 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19460 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19461 #ifdef __SSE3__
19462 haddpd %xmm0, %xmm0
19463 #else
19464 pshufd $0x4e, %xmm0, %xmm1
19465 addpd %xmm1, %xmm0
19466 #endif
19467 */
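// Why the constants work (illustrative walk-through; lo/hi denote the 32-bit
// halves of the input):
//   punpckldq pairs each half with a magic high word, giving two doubles
//     bits(0x43300000:lo) == 0x1.0p52 + lo
//     bits(0x45300000:hi) == 0x1.0p84 + hi * 0x1.0p32
//   Subtracting c1 = { 0x1.0p52, 0x1.0p84 } leaves { lo, hi * 0x1.0p32 }, and
//   the horizontal add produces hi * 2^32 + lo, i.e. the input rounded to
//   double. E.g. for x = (5 << 32) | 7 the lanes become 7.0 and 5.0 * 2^32.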
19468
19469 LLVMContext *Context = DAG.getContext();
19470
19471 // Build some magic constants.
19472 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19473 Constant *C0 = ConstantDataVector::get(*Context, CV0);
19474 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19475 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
19476
19477 SmallVector<Constant*,2> CV1;
19478 CV1.push_back(
19479 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19480 APInt(64, 0x4330000000000000ULL))));
19481 CV1.push_back(
19482 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19483 APInt(64, 0x4530000000000000ULL))));
19484 Constant *C1 = ConstantVector::get(CV1);
19485 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
19486
19487 // Load the 64-bit value into an XMM register.
19488 SDValue XR1 =
19489 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
19490 SDValue CLod0 = DAG.getLoad(
19491 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
19492 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19493 SDValue Unpck1 =
19494 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19495
19496 SDValue CLod1 = DAG.getLoad(
19497 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
19498 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19499 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19500 // TODO: Are there any fast-math-flags to propagate here?
19501 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19502 SDValue Result;
19503
19504 if (Subtarget.hasSSE3() &&
19505 shouldUseHorizontalOp(true, DAG, Subtarget)) {
19506 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19507 } else {
19508 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19509 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19510 }
19511 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19512 DAG.getIntPtrConstant(0, dl));
19513 return Result;
19514}
19515
19516/// 32-bit unsigned integer to float expansion.
19517 static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
19518 SelectionDAG &DAG,
19519 const X86Subtarget &Subtarget) {
19520 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19521 // FP constant to bias correct the final result.
19522 SDValue Bias = DAG.getConstantFP(
19523 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
19524
19525 // Load the 32-bit value into an XMM register.
19526 SDValue Load =
19527 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
19528
19529 // Zero out the upper parts of the register.
19530 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
19531
19532 // Or the load with the bias.
19533 SDValue Or = DAG.getNode(
19534 ISD::OR, dl, MVT::v2i64,
19535 DAG.getBitcast(MVT::v2i64, Load),
19536 DAG.getBitcast(MVT::v2i64,
19537 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
19538 Or =
19539 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
19540 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
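// At this point (illustrative, e.g. for input 0xFFFFFFFF) Or holds the f64
// whose bit pattern is 0x43300000FFFFFFFF, i.e. 0x1.0p52 + 4294967295.0;
// subtracting the bias below therefore leaves the exact integer value.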
19541
19542 if (Op.getNode()->isStrictFPOpcode()) {
19543 // Subtract the bias.
19544 // TODO: Are there any fast-math-flags to propagate here?
19545 SDValue Chain = Op.getOperand(0);
19546 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
19547 {Chain, Or, Bias});
19548
19549 if (Op.getValueType() == Sub.getValueType())
19550 return Sub;
19551
19552 // Handle final rounding.
19553 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
19554 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
19555
19556 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
19557 }
19558
19559 // Subtract the bias.
19560 // TODO: Are there any fast-math-flags to propagate here?
19561 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
19562
19563 // Handle final rounding.
19564 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
19565}
19566
19567 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
19568 SelectionDAG &DAG,
19569 const X86Subtarget &Subtarget) {
19570 if (Op.getSimpleValueType() != MVT::v2f64)
19571 return SDValue();
19572
19573 bool IsStrict = Op->isStrictFPOpcode();
19574
19575 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
19576 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
19577
19578 if (Subtarget.hasAVX512()) {
19579 if (!Subtarget.hasVLX()) {
19580 // Let generic type legalization widen this.
19581 if (!IsStrict)
19582 return SDValue();
19583 // Otherwise pad the integer input with 0s and widen the operation.
19584 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19585 DAG.getConstant(0, DL, MVT::v2i32));
19586 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
19587 {Op.getOperand(0), N0});
19588 SDValue Chain = Res.getValue(1);
19589 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
19590 DAG.getIntPtrConstant(0, DL));
19591 return DAG.getMergeValues({Res, Chain}, DL);
19592 }
19593
19594 // Legalize to v4i32 type.
19595 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19596 DAG.getUNDEF(MVT::v2i32));
19597 if (IsStrict)
19598 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
19599 {Op.getOperand(0), N0});
19600 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
19601 }
19602
19603 // Zero extend to 2i64, OR with the floating point representation of 2^52.
19604 // This gives us the floating point equivalent of 2^52 + the i32 integer
19605 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
19606 // point leaving just our i32 integers in double format.
19607 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
19608 SDValue VBias = DAG.getConstantFP(
19609 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
19610 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
19611 DAG.getBitcast(MVT::v2i64, VBias));
19612 Or = DAG.getBitcast(MVT::v2f64, Or);
19613
19614 if (IsStrict)
19615 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
19616 {Op.getOperand(0), Or, VBias});
19617 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
19618}
19619
19620 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
19621 SelectionDAG &DAG,
19622 const X86Subtarget &Subtarget) {
19623 bool IsStrict = Op->isStrictFPOpcode();
19624 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
19625 MVT VecIntVT = V.getSimpleValueType();
19626 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
19627 "Unsupported custom type");
19628
19629 if (Subtarget.hasAVX512()) {
19630 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
19631 assert(!Subtarget.hasVLX() && "Unexpected features");
19632 MVT VT = Op->getSimpleValueType(0);
19633
19634 // v8i32->v8f64 is legal with AVX512 so just return it.
19635 if (VT == MVT::v8f64)
19636 return Op;
19637
19638 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
19639 "Unexpected VT!");
19640 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
19641 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
19642 // Need to concat with zero vector for strict fp to avoid spurious
19643 // exceptions.
19644 SDValue Tmp =
19645 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
19646 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
19647 DAG.getIntPtrConstant(0, DL));
19648 SDValue Res, Chain;
19649 if (IsStrict) {
19650 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
19651 {Op->getOperand(0), V});
19652 Chain = Res.getValue(1);
19653 } else {
19654 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
19655 }
19656
19657 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19658 DAG.getIntPtrConstant(0, DL));
19659
19660 if (IsStrict)
19661 return DAG.getMergeValues({Res, Chain}, DL);
19662 return Res;
19663 }
19664
19665 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
19666 Op->getSimpleValueType(0) == MVT::v4f64) {
19667 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
19668 Constant *Bias = ConstantFP::get(
19669 *DAG.getContext(),
19670 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
19671 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19672 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
19673 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
19674 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
19675 SDValue VBias = DAG.getMemIntrinsicNode(
19676 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
19677 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
19678 MachineMemOperand::MOLoad);
19679
19680 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
19681 DAG.getBitcast(MVT::v4i64, VBias));
19682 Or = DAG.getBitcast(MVT::v4f64, Or);
19683
19684 if (IsStrict)
19685 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
19686 {Op.getOperand(0), Or, VBias});
19687 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
19688 }
19689
19690 // The algorithm is the following:
19691 // #ifdef __SSE4_1__
19692 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19693 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19694 // (uint4) 0x53000000, 0xaa);
19695 // #else
19696 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19697 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19698 // #endif
19699 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19700 // return (float4) lo + fhi;
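// Worked example (illustrative): for a lane v = 0x87654321,
//   lo = 0x4321 | 0x4b000000 -> float 0x1.0p23 + 0x4321
//   hi = 0x8765 | 0x53000000 -> float 0x1.0p39 + 0x8765 * 0x1.0p16
//   fhi = hi - (0x1.0p39f + 0x1.0p23f) = 0x8765 * 0x1.0p16 - 0x1.0p23
//   lo + fhi = 0x8765 * 2^16 + 0x4321 = (float)v, up to the final rounding.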
19701
19702 bool Is128 = VecIntVT == MVT::v4i32;
19703 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
19704 // If we convert to something other than the supported type, e.g., to v4f64,
19705 // abort early.
19706 if (VecFloatVT != Op->getSimpleValueType(0))
19707 return SDValue();
19708
19709 // In the #ifdef/#else code, we have in common:
19710 // - The vector of constants:
19711 // -- 0x4b000000
19712 // -- 0x53000000
19713 // - A shift:
19714 // -- v >> 16
19715
19716 // Create the splat vector for 0x4b000000.
19717 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
19718 // Create the splat vector for 0x53000000.
19719 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
19720
19721 // Create the right shift.
19722 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
19723 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
19724
19725 SDValue Low, High;
19726 if (Subtarget.hasSSE41()) {
19727 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
19728 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19729 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
19730 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
19731 // Low will be bitcasted right away, so do not bother bitcasting back to its
19732 // original type.
19733 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
19734 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19735 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19736 // (uint4) 0x53000000, 0xaa);
19737 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
19738 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
19739 // High will be bitcasted right away, so do not bother bitcasting back to
19740 // its original type.
19741 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
19742 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19743 } else {
19744 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
19745 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19746 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
19747 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
19748
19749 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19750 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
19751 }
19752
19753 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
19754 SDValue VecCstFSub = DAG.getConstantFP(
19755 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
19756
19757 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19758 // NOTE: By using fsub of a positive constant instead of fadd of a negative
19759 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
19760 // enabled. See PR24512.
19761 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
19762 // TODO: Are there any fast-math-flags to propagate here?
19763 // (float4) lo;
19764 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
19765 // return (float4) lo + fhi;
19766 if (IsStrict) {
19767 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
19768 {Op.getOperand(0), HighBitcast, VecCstFSub});
19769 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
19770 {FHigh.getValue(1), LowBitcast, FHigh});
19771 }
19772
19773 SDValue FHigh =
19774 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
19775 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
19776}
19777
19778 static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19779 const X86Subtarget &Subtarget) {
19780 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19781 SDValue N0 = Op.getOperand(OpNo);
19782 MVT SrcVT = N0.getSimpleValueType();
19783
19784 switch (SrcVT.SimpleTy) {
19785 default:
19786 llvm_unreachable("Custom UINT_TO_FP is not supported!");
19787 case MVT::v2i32:
19788 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
19789 case MVT::v4i32:
19790 case MVT::v8i32:
19791 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
19792 case MVT::v2i64:
19793 case MVT::v4i64:
19794 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19795 }
19796}
19797
19798SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
19799 SelectionDAG &DAG) const {
19800 bool IsStrict = Op->isStrictFPOpcode();
19801 unsigned OpNo = IsStrict ? 1 : 0;
19802 SDValue Src = Op.getOperand(OpNo);
19803 SDLoc dl(Op);
19804 auto PtrVT = getPointerTy(DAG.getDataLayout());
19805 MVT SrcVT = Src.getSimpleValueType();
19806 MVT DstVT = Op->getSimpleValueType(0);
19807 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
19808
19809 // Bail out when we don't have native conversion instructions.
19810 if (DstVT == MVT::f128)
19811 return SDValue();
19812
19813 if (isSoftF16(DstVT, Subtarget))
19814 return promoteXINT_TO_FP(Op, dl, DAG);
19815 else if (isLegalConversion(SrcVT, false, Subtarget))
19816 return Op;
19817
19818 if (DstVT.isVector())
19819 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
19820
19821 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19822 return LowerWin64_INT128_TO_FP(Op, DAG);
19823
19824 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19825 return Extract;
19826
19827 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
19828 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
19829 // Conversions from unsigned i32 to f32/f64 are legal,
19830 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
19831 return Op;
19832 }
19833
19834 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
19835 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
19836 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
19837 if (IsStrict)
19838 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
19839 {Chain, Src});
19840 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
19841 }
19842
19843 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19844 return V;
19845 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19846 return V;
19847
19848 // The transform for i64->f64 isn't correct for 0 when rounding to negative
19849 // infinity. It produces -0.0, so disable under strictfp.
19850 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
19851 !IsStrict)
19852 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
19853 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
19854 // negative infinity. So disable under strictfp. Using FILD instead.
19855 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
19856 !IsStrict)
19857 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
19858 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
19859 (DstVT == MVT::f32 || DstVT == MVT::f64))
19860 return SDValue();
19861
19862 // Make a 64-bit buffer, and use it to build an FILD.
19863 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
19864 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
19865 Align SlotAlign(8);
19866 MachinePointerInfo MPI =
19867 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19868 if (SrcVT == MVT::i32) {
19869 SDValue OffsetSlot =
19870 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
19871 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
19872 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
19873 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
19874 std::pair<SDValue, SDValue> Tmp =
19875 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
19876 if (IsStrict)
19877 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19878
19879 return Tmp.first;
19880 }
19881
19882 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
19883 SDValue ValueToStore = Src;
19884 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
19885 // Bitcasting to f64 here allows us to do a single 64-bit store from
19886 // an SSE register, avoiding the store forwarding penalty that would come
19887 // with two 32-bit stores.
19888 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19889 }
19890 SDValue Store =
19891 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
19892 // For i64 source, we need to add the appropriate power of 2 if the input
19893 // was negative. We must be careful to do the computation in x87 extended
19894 // precision, not in SSE.
19895 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
19896 SDValue Ops[] = {Store, StackSlot};
19897 SDValue Fild =
19898 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
19899 SlotAlign, MachineMemOperand::MOLoad);
19900 Chain = Fild.getValue(1);
19901
19902 // Check whether the sign bit is set.
19903 SDValue SignSet = DAG.getSetCC(
19904 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
19905 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
19906
19907 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
19908 APInt FF(64, 0x5F80000000000000ULL);
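// Note (illustrative): 0x5F800000 read as an IEEE single is +2^64. When the
// sign bit of the source was set, FILD produced the value minus 2^64, so the
// select below picks the 4-byte half holding 0x5F800000 and the later add
// restores the unsigned value; otherwise it picks the +0.0f half (a no-op).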
19909 SDValue FudgePtr =
19910 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
19911 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
19912
19913 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
19914 SDValue Zero = DAG.getIntPtrConstant(0, dl);
19915 SDValue Four = DAG.getIntPtrConstant(4, dl);
19916 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
19917 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
19918
19919 // Load the value out, extending it from f32 to f80.
19920 SDValue Fudge = DAG.getExtLoad(
19921 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
19922 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
19923 CPAlignment);
19924 Chain = Fudge.getValue(1);
19925 // Extend everything to 80 bits to force it to be done on x87.
19926 // TODO: Are there any fast-math-flags to propagate here?
19927 if (IsStrict) {
19928 unsigned Opc = ISD::STRICT_FADD;
19929 // Windows needs the precision control changed to 80bits around this add.
19930 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
19931 Opc = X86ISD::STRICT_FP80_ADD;
19932
19933 SDValue Add =
19934 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
19935 // STRICT_FP_ROUND can't handle equal types.
19936 if (DstVT == MVT::f80)
19937 return Add;
19938 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
19939 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
19940 }
19941 unsigned Opc = ISD::FADD;
19942 // Windows needs the precision control changed to 80bits around this add.
19943 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
19944 Opc = X86ISD::FP80_ADD;
19945
19946 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
19947 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
19948 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
19949}
19950
19951// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
19952// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
19953// just return an SDValue().
19954// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
19955// to i16, i32 or i64, and we lower it to a legal sequence and return the
19956// result.
19957SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
19958 bool IsSigned,
19959 SDValue &Chain) const {
19960 bool IsStrict = Op->isStrictFPOpcode();
19961 SDLoc DL(Op);
19962
19963 EVT DstTy = Op.getValueType();
19964 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
19965 EVT TheVT = Value.getValueType();
19966 auto PtrVT = getPointerTy(DAG.getDataLayout());
19967
19968 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
19969 // f16 must be promoted before using the lowering in this routine.
19970 // fp128 does not use this lowering.
19971 return SDValue();
19972 }
19973
19974 // If using FIST to compute an unsigned i64, we'll need some fixup
19975 // to handle values above the maximum signed i64. A FIST is always
19976 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
19977 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
19978
19979 // FIXME: This does not generate an invalid exception if the input does not
19980 // fit in i32. PR44019
19981 if (!IsSigned && DstTy != MVT::i64) {
19982 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
19983 // The low 32 bits of the fist result will have the correct uint32 result.
19984 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
19985 DstTy = MVT::i64;
19986 }
19987
19988 assert(DstTy.getSimpleVT() <= MVT::i64 &&
19989 DstTy.getSimpleVT() >= MVT::i16 &&
19990 "Unknown FP_TO_INT to lower!");
19991
19992 // We lower FP->int64 into FISTP64 followed by a load from a temporary
19993 // stack slot.
19994 MachineFunction &MF = DAG.getMachineFunction();
19995 unsigned MemSize = DstTy.getStoreSize();
19996 int SSFI =
19997 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
19998 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19999
20000 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20001
20002 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20003
20004 if (UnsignedFixup) {
20005 //
20006 // Conversion to unsigned i64 is implemented with a select,
20007 // depending on whether the source value fits in the range
20008 // of a signed i64. Let Thresh be the FP equivalent of
20009 // 0x8000000000000000ULL.
20010 //
20011 // Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0;
20012 // FltOfs = (Value >= Thresh) ? 0x8000000000000000 : 0;
20013 // FistSrc = (Value - FltOfs);
20014 // Fist-to-mem64 FistSrc
20015 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20016 // to XOR'ing the high 32 bits with Adjust.
20017 //
20018 // Being a power of 2, Thresh is exactly representable in all FP formats.
20019 // For X87 we'd like to use the smallest FP type for this constant, but
20020 // for DAG type consistency we have to match the FP operand type.
20021
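//
// Worked example (illustrative, e.g. with an f80 source): for
// Value = 2^63 + 3.0 we have Value >= Thresh, so FltOfs = 2^63 and
// Adjust = 0x8000000000000000; the FIST converts Value - FltOfs = 3.0 to
// the integer 3, and XOR'ing the result with Adjust yields
// 0x8000000000000003, the expected unsigned value.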
20022 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20023 APFloat::opStatus Status = APFloat::opOK;
20024 bool LosesInfo = false;
20025 if (TheVT == MVT::f64)
20026 // The rounding mode is irrelevant as the conversion should be exact.
20027 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20028 &LosesInfo);
20029 else if (TheVT == MVT::f80)
20030 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20031 APFloat::rmNearestTiesToEven, &LosesInfo);
20032
20033 assert(Status == APFloat::opOK && !LosesInfo &&
20034 "FP conversion should have been exact");
20035
20036 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20037
20038 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20039 *DAG.getContext(), TheVT);
20040 SDValue Cmp;
20041 if (IsStrict) {
20042 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20043 /*IsSignaling*/ true);
20044 Chain = Cmp.getValue(1);
20045 } else {
20046 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20047 }
20048
20049 // Our preferred lowering of
20050 //
20051 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20052 //
20053 // is
20054 //
20055 // (Value >= Thresh) << 63
20056 //
20057 // but since we can get here after LegalOperations, DAGCombine might do the
20058 // wrong thing if we create a select. So, directly create the preferred
20059 // version.
20060 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20061 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20062 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20063
20064 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20065 DAG.getConstantFP(0.0, DL, TheVT));
20066
20067 if (IsStrict) {
20068 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20069 { Chain, Value, FltOfs });
20070 Chain = Value.getValue(1);
20071 } else
20072 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20073 }
20074
20075 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20076
20077 // FIXME This causes a redundant load/store if the SSE-class value is already
20078 // in memory, such as if it is on the callstack.
20079 if (isScalarFPTypeInSSEReg(TheVT)) {
20080 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20081 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20082 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20083 SDValue Ops[] = { Chain, StackSlot };
20084
20085 unsigned FLDSize = TheVT.getStoreSize();
20086 assert(FLDSize <= MemSize && "Stack slot not big enough");
20087 MachineMemOperand *MMO = MF.getMachineMemOperand(
20088 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20089 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20090 Chain = Value.getValue(1);
20091 }
20092
20093 // Build the FP_TO_INT*_IN_MEM
20094 MachineMemOperand *MMO = MF.getMachineMemOperand(
20095 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20096 SDValue Ops[] = { Chain, Value, StackSlot };
20097 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20098 DAG.getVTList(MVT::Other),
20099 Ops, DstTy, MMO);
20100
20101 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
20102 Chain = Res.getValue(1);
20103
20104 // If we need an unsigned fixup, XOR the result with adjust.
20105 if (UnsignedFixup)
20106 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20107
20108 return Res;
20109}
20110
20111 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
20112 const X86Subtarget &Subtarget) {
20113 MVT VT = Op.getSimpleValueType();
20114 SDValue In = Op.getOperand(0);
20115 MVT InVT = In.getSimpleValueType();
20116 SDLoc dl(Op);
20117 unsigned Opc = Op.getOpcode();
20118
20119 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20120 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20121 "Unexpected extension opcode");
20122 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20123 "Expected same number of elements");
20124 assert((VT.getVectorElementType() == MVT::i16 ||
20125 VT.getVectorElementType() == MVT::i32 ||
20126 VT.getVectorElementType() == MVT::i64) &&
20127 "Unexpected element type");
20128 assert((InVT.getVectorElementType() == MVT::i8 ||
20129 InVT.getVectorElementType() == MVT::i16 ||
20130 InVT.getVectorElementType() == MVT::i32) &&
20131 "Unexpected element type");
20132
20133 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20134
20135 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20136 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20137 return splitVectorIntUnary(Op, DAG, dl);
20138 }
20139
20140 if (Subtarget.hasInt256())
20141 return Op;
20142
20143 // Optimize vectors in AVX mode:
20144 //
20145 // v8i16 -> v8i32
20146 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20147 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20148 // Concat upper and lower parts.
20149 //
20150 // v4i32 -> v4i64
20151 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20152 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20153 // Concat upper and lower parts.
20154 //
20155 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20156 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20157
20158 // Short-circuit if we can determine that each 128-bit half is the same value.
20159 // Otherwise, this is difficult to match and optimize.
20160 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20161 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20162 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20163
20164 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20165 SDValue Undef = DAG.getUNDEF(InVT);
20166 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20167 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20168 OpHi = DAG.getBitcast(HalfVT, OpHi);
20169
20170 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20171}
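// Illustrative sketch of the AVX1 path above for zero_extend v8i16 -> v8i32:
// the lower half comes from ZERO_EXTEND_VECTOR_INREG (vpmovzxwd) and the
// upper half from unpacking the source with a zero vector (vpunpckhwd),
// which interleaves each high word with a zero word; the two v4i32 halves
// are then concatenated into the final v8i32.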
20172
20173// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20174static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20175 const SDLoc &dl, SelectionDAG &DAG) {
20176 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20177 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20178 DAG.getIntPtrConstant(0, dl));
20179 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20180 DAG.getIntPtrConstant(8, dl));
20181 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20182 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20183 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20184 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20185}
20186
20187 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
20188 const X86Subtarget &Subtarget,
20189 SelectionDAG &DAG) {
20190 MVT VT = Op->getSimpleValueType(0);
20191 SDValue In = Op->getOperand(0);
20192 MVT InVT = In.getSimpleValueType();
20193 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20194 SDLoc DL(Op);
20195 unsigned NumElts = VT.getVectorNumElements();
20196
20197 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20198 // avoids a constant pool load.
20199 if (VT.getVectorElementType() != MVT::i8) {
20200 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20201 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20202 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20203 }
20204
20205 // Extend VT if BWI is not supported.
20206 MVT ExtVT = VT;
20207 if (!Subtarget.hasBWI()) {
20208 // If v16i32 is to be avoided, we'll need to split and concatenate.
20209 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20210 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20211
20212 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20213 }
20214
20215 // Widen to 512-bits if VLX is not supported.
20216 MVT WideVT = ExtVT;
20217 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20218 NumElts *= 512 / ExtVT.getSizeInBits();
20219 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20220 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
20221 In, DAG.getIntPtrConstant(0, DL));
20222 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
20223 NumElts);
20224 }
20225
20226 SDValue One = DAG.getConstant(1, DL, WideVT);
20227 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20228
20229 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20230
20231 // Truncate if we had to extend above.
20232 if (VT != ExtVT) {
20233 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20234 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20235 }
20236
20237 // Extract back to 128/256-bit if we widened.
20238 if (WideVT != VT)
20239 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20240 DAG.getIntPtrConstant(0, DL));
20241
20242 return SelectedVal;
20243}
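// Illustrative example of the non-vXi8 path above: zero-extending a v8i1
// mask to v8i16 first sign-extends it (a set mask bit becomes 0xFFFF, a
// clear bit 0x0000) and then shifts each lane logically right by 15,
// leaving 0x0001/0x0000 per lane without loading a splat-of-ones constant.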
20244
20245 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20246 SelectionDAG &DAG) {
20247 SDValue In = Op.getOperand(0);
20248 MVT SVT = In.getSimpleValueType();
20249
20250 if (SVT.getVectorElementType() == MVT::i1)
20251 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
20252
20253 assert(Subtarget.hasAVX() && "Expected AVX support");
20254 return LowerAVXExtend(Op, DAG, Subtarget);
20255}
20256
20257/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20258/// It makes use of the fact that vectors with enough leading sign/zero bits
20259/// prevent the PACKSS/PACKUS from saturating the results.
20260/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20261/// within each 128-bit lane.
20262static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20263 const SDLoc &DL, SelectionDAG &DAG,
20264 const X86Subtarget &Subtarget) {
20265 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20266 "Unexpected PACK opcode");
20267 assert(DstVT.isVector() && "VT not a vector?");
20268
20269 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20270 if (!Subtarget.hasSSE2())
20271 return SDValue();
20272
20273 EVT SrcVT = In.getValueType();
20274
20275 // No truncation required, we might get here due to recursive calls.
20276 if (SrcVT == DstVT)
20277 return In;
20278
20279 unsigned NumElems = SrcVT.getVectorNumElements();
20280 if (NumElems < 2 || !isPowerOf2_32(NumElems))
20281 return SDValue();
20282
20283 unsigned DstSizeInBits = DstVT.getSizeInBits();
20284 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20285 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20286 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20287
20288 LLVMContext &Ctx = *DAG.getContext();
20289 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20290 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20291
20292 // Pack to the largest type possible:
20293 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20294 EVT InVT = MVT::i16, OutVT = MVT::i8;
20295 if (SrcVT.getScalarSizeInBits() > 16 &&
20296 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20297 InVT = MVT::i32;
20298 OutVT = MVT::i16;
20299 }
20300
20301 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20302 // On pre-AVX512, pack the src in both halves to help value tracking.
20303 if (SrcSizeInBits <= 128) {
20304 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20305 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20306 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
20307 SDValue LHS = DAG.getBitcast(InVT, In);
20308 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
20309 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
20310 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
20311 Res = DAG.getBitcast(PackedVT, Res);
20312 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20313 }
20314
20315 // Split lower/upper subvectors.
20316 SDValue Lo, Hi;
20317 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
20318
20319 // If Hi is undef, then don't bother packing it and widen the result instead.
20320 if (Hi.isUndef()) {
20321 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
20322 if (SDValue Res =
20323 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
20324 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
20325 }
20326
20327 unsigned SubSizeInBits = SrcSizeInBits / 2;
20328 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20329 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20330
20331 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20332 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20333 Lo = DAG.getBitcast(InVT, Lo);
20334 Hi = DAG.getBitcast(InVT, Hi);
20335 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20336 return DAG.getBitcast(DstVT, Res);
20337 }
20338
20339 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20340 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20341 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20342 Lo = DAG.getBitcast(InVT, Lo);
20343 Hi = DAG.getBitcast(InVT, Hi);
20344 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20345
20346 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20347 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20348 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
20349 SmallVector<int, 64> Mask;
20350 int Scale = 64 / OutVT.getScalarSizeInBits();
20351 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
20352 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20353
20354 if (DstVT.is256BitVector())
20355 return DAG.getBitcast(DstVT, Res);
20356
20357 // If 512bit -> 128bit truncate another stage.
20358 Res = DAG.getBitcast(PackedVT, Res);
20359 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20360 }
20361
20362 // Recursively pack lower/upper subvectors, concat result and pack again.
20363 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20364
20365 if (PackedVT.is128BitVector()) {
20366 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
20367 // type legalization.
20368 SDValue Res =
20369 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
20370 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20371 }
20372
20373 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
20374 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
20375 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
20376 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20377 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20378}
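// Illustrative example (assuming SSE2 and inputs whose upper bits are
// already zero): truncating v8i16 -> v8i8 is a single PACKUSWB of the
// source with itself, whose low 8 bytes are the truncated elements, while
// v4i32 -> v4i8 needs two stages (PACK*DW to i16, then PACK*WB to i8),
// which is why this routine recurses until SrcVT == DstVT.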
20379
20380/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
20381/// e.g. trunc <8 x i32> X to <8 x i16> -->
20382/// MaskX = X & 0xffff (clear high bits to prevent saturation)
20383/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
20384 static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
20385 const X86Subtarget &Subtarget,
20386 SelectionDAG &DAG) {
20387 In = DAG.getZeroExtendInReg(In, DL, DstVT);
20388 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
20389}
20390
20391/// Truncate using inreg sign extension and X86ISD::PACKSS.
20392 static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
20393 const X86Subtarget &Subtarget,
20394 SelectionDAG &DAG) {
20395 EVT SrcVT = In.getValueType();
20396 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
20397 DAG.getValueType(DstVT));
20398 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
20399}
20400
20401/// Helper to determine if \p In truncated to \p DstVT has the necessary
20402/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
20403/// possibly by converting a SRL node to SRA for sign extension.
20404static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
20405 SDValue In, const SDLoc &DL,
20406 SelectionDAG &DAG,
20407 const X86Subtarget &Subtarget) {
20408 // Requires SSE2.
20409 if (!Subtarget.hasSSE2())
20410 return SDValue();
20411
20412 EVT SrcVT = In.getValueType();
20413 EVT DstSVT = DstVT.getVectorElementType();
20414 EVT SrcSVT = SrcVT.getVectorElementType();
20415 unsigned NumDstEltBits = DstSVT.getSizeInBits();
20416 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
20417
20418 // Check we have a truncation suited for PACKSS/PACKUS.
20419 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20420 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20421 return SDValue();
20422
20423 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
20424 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
20425
20426 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
20427 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
20428 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
20429 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
20430 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
20431 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
20432 return SDValue();
20433
20434 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
20435 // split this for packing.
20436 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
20437 !isFreeToSplitVector(In.getNode(), DAG) &&
20438 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
20439 return SDValue();
20440
20441 // Don't truncate for AVX512 targets using multiple stages of PACK nodes.
20442 if (Subtarget.hasAVX512() && NumStages > 1)
20443 return SDValue();
20444
20445 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
20446 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20447
20448 // Truncate with PACKUS if we are truncating a vector with leading zero
20449 // bits that extend all the way to the packed/truncated value.
20450 // e.g. Masks, zext_in_reg, etc.
20451 // Pre-SSE41 we can only use PACKUSWB.
20452 KnownBits Known = DAG.computeKnownBits(In);
20453 if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20454 PackOpcode = X86ISD::PACKUS;
20455 return In;
20456 }
20457
20458 // Truncate with PACKSS if we are truncating a vector with sign-bits
20459 // that extend all the way to the packed/truncated value.
20460 // e.g. Comparison result, sext_in_reg, etc.
20461 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
20462
20463 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
20464 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
20465 // see through BITCASTs later on and combines/simplifications can't then use
20466 // it.
20467 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
20468 !Subtarget.hasAVX512())
20469 return SDValue();
20470
20471 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20472 if (MinSignBits < NumSignBits) {
20473 PackOpcode = X86ISD::PACKSS;
20474 return In;
20475 }
20476
20477 // If we have a srl that only generates signbits that we will discard in
20478 // the truncation then we can use PACKSS by converting the srl to a sra.
20479 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
20480 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20481 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(In)) {
20482 if (*ShAmt == MinSignBits) {
20483 PackOpcode = X86ISD::PACKSS;
20484 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
20485 }
20486 }
20487
20488 return SDValue();
20489}
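// Illustrative example of the SRL -> SRA rewrite above: truncating
// (srl v8i32 X, 16) to v8i16 discards exactly the 16 bits the shift
// brought in, so the shift can be rewritten as (sra X, 16); the result
// then has at least 17 sign bits, which is enough for PACKSSDW to pack
// without saturating.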
20490
20491/// This function lowers a vector truncation of 'extended sign-bits' or
20492/// 'extended zero-bits' values.
20493/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
20494 static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
20495 const SDLoc &DL,
20496 const X86Subtarget &Subtarget,
20497 SelectionDAG &DAG) {
20498 MVT SrcVT = In.getSimpleValueType();
20499 MVT DstSVT = DstVT.getVectorElementType();
20500 MVT SrcSVT = SrcVT.getVectorElementType();
20501 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20502 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20503 return SDValue();
20504
20505 // If the upper half of the source is undef, then attempt to split and
20506 // only truncate the lower half.
20507 if (DstVT.getSizeInBits() >= 128) {
20508 SmallVector<SDValue> LowerOps;
20509 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20510 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20511 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
20512 Subtarget, DAG))
20513 return widenSubVector(Res, false, Subtarget, DAG, DL,
20514 DstVT.getSizeInBits());
20515 }
20516 }
20517
20518 unsigned PackOpcode;
20519 if (SDValue Src =
20520 matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, Subtarget))
20521 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
20522
20523 return SDValue();
20524}
20525
20526/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
20527/// X86ISD::PACKUS/X86ISD::PACKSS operations.
20528 static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
20529 const X86Subtarget &Subtarget,
20530 SelectionDAG &DAG) {
20531 MVT SrcVT = In.getSimpleValueType();
20532 MVT DstSVT = DstVT.getVectorElementType();
20533 MVT SrcSVT = SrcVT.getVectorElementType();
20534 unsigned NumElems = DstVT.getVectorNumElements();
20535 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20536 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
20537 NumElems >= 8))
20538 return SDValue();
20539
20540 // SSSE3's pshufb results in fewer instructions in the cases below.
20541 if (Subtarget.hasSSSE3() && NumElems == 8) {
20542 if (SrcSVT == MVT::i16)
20543 return SDValue();
20544 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
20545 return SDValue();
20546 }
20547
20548 // If the upper half of the source is undef, then attempt to split and
20549 // only truncate the lower half.
20550 if (DstVT.getSizeInBits() >= 128) {
20551 SmallVector<SDValue> LowerOps;
20552 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20553 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20554 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
20555 return widenSubVector(Res, false, Subtarget, DAG, DL,
20556 DstVT.getSizeInBits());
20557 }
20558 }
20559
20560 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
20561 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
20562 // truncate 2 x v4i32 to v8i16.
20563 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
20564 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
20565
20566 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
20567 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
20568
20569 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
20570 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
20571 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
20572 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
20573 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
20574 }
20575
20576 return SDValue();
20577}
20578
20579 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
20580 const X86Subtarget &Subtarget) {
20581
20582 SDLoc DL(Op);
20583 MVT VT = Op.getSimpleValueType();
20584 SDValue In = Op.getOperand(0);
20585 MVT InVT = In.getSimpleValueType();
20586
20587 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
20588
20589 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
20590 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
20591 if (InVT.getScalarSizeInBits() <= 16) {
20592 if (Subtarget.hasBWI()) {
20593 // legal, will go to VPMOVB2M, VPMOVW2M
20594 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20595 // We need to shift to get the lsb into sign position.
20596 // Shift packed bytes not supported natively, bitcast to word
20597 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
20598 In = DAG.getNode(ISD::SHL, DL, ExtVT,
20599 DAG.getBitcast(ExtVT, In),
20600 DAG.getConstant(ShiftInx, DL, ExtVT));
20601 In = DAG.getBitcast(InVT, In);
20602 }
20603 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
20604 In, ISD::SETGT);
20605 }
20606 // Use TESTD/Q, extended vector to packed dword/qword.
20607 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
20608 "Unexpected vector type.");
20609 unsigned NumElts = InVT.getVectorNumElements();
20610 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
20611 // We need to change to a wider element type that we have support for.
20612 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
20613 // For 16 element vectors we extend to v16i32 unless we are explicitly
20614 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
20615 // we need to split into two 8 element vectors which we can extend to v8i32,
20616 // truncate and concat the results. There's an additional complication if
20617 // the original type is v16i8. In that case we can't split the v16i8
20618 // directly, so we need to shuffle high elements to low and use
20619 // sign_extend_vector_inreg.
20620 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
20621 SDValue Lo, Hi;
20622 if (InVT == MVT::v16i8) {
20623 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
20624 Hi = DAG.getVectorShuffle(
20625 InVT, DL, In, In,
20626 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
20627 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
20628 } else {
20629 assert(InVT == MVT::v16i16 && "Unexpected VT!");
20630 Lo = extract128BitVector(In, 0, DAG, DL);
20631 Hi = extract128BitVector(In, 8, DAG, DL);
20632 }
20633 // We're split now, just emit two truncates and a concat. The two
20634 // truncates will trigger legalization to come back to this function.
20635 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
20636 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
20637 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20638 }
20639 // We either have 8 elements or we're allowed to use 512-bit vectors.
20640 // If we have VLX, we want to use the narrowest vector that can get the
20641 // job done so we use vXi32.
20642 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
20643 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
20644 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
20645 InVT = ExtVT;
20646 ShiftInx = InVT.getScalarSizeInBits() - 1;
20647 }
20648
20649 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20650 // We need to shift to get the lsb into sign position.
20651 In = DAG.getNode(ISD::SHL, DL, InVT, In,
20652 DAG.getConstant(ShiftInx, DL, InVT));
20653 }
20654 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
20655 if (Subtarget.hasDQI())
20656 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
20657 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
20658}
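// Worked example of the LSB-to-sign-bit trick above, assuming AVX512BW:
// truncating v16i16 -> v16i1 shifts each word left by 15 (unless bit 0 is
// already sign-replicated) so that bit 0 lands in the sign bit, then emits
// setcc(0, x, SETGT); the signed "0 > x" compare is true exactly when the
// sign bit is set, and it is selected as VPMOVW2M when the result is a
// k-register mask.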
20659
20660SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
20661 SDLoc DL(Op);
20662 MVT VT = Op.getSimpleValueType();
20663 SDValue In = Op.getOperand(0);
20664 MVT InVT = In.getSimpleValueType();
20665 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20666 "Invalid TRUNCATE operation");
20667
20668 // If we're called by the type legalizer, handle a few cases.
20669 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20670 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
20671 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
20672 VT.is128BitVector() && Subtarget.hasAVX512()) {
20673 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
20674 "Unexpected subtarget!");
20675 // The default behavior is to truncate one step, concatenate, and then
20676 // truncate the remainder. We'd rather produce two 64-bit results and
20677 // concatenate those.
20678 SDValue Lo, Hi;
20679 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
20680
20681 EVT LoVT, HiVT;
20682 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
20683
20684 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
20685 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
20686 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20687 }
20688
20689 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
20690 if (!Subtarget.hasAVX512() ||
20691 (InVT.is512BitVector() && VT.is256BitVector()))
20692 if (SDValue SignPack =
20693 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20694 return SignPack;
20695
20696 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
20697 if (!Subtarget.hasAVX512())
20698 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
20699
20700 // Otherwise let default legalization handle it.
20701 return SDValue();
20702 }
20703
20704 if (VT.getVectorElementType() == MVT::i1)
20705 return LowerTruncateVecI1(Op, DAG, Subtarget);
20706
20707 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
20708 // concat from subvectors to use VPTRUNC etc.
20709 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
20710 if (SDValue SignPack =
20711 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20712 return SignPack;
20713
20714 // vpmovqb/w/d, vpmovdb/w, vpmovwb
20715 if (Subtarget.hasAVX512()) {
20716 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
20717 assert(VT == MVT::v32i8 && "Unexpected VT!");
20718 return splitVectorIntUnary(Op, DAG, DL);
20719 }
20720
20721 // Word to byte only under BWI. Otherwise we have to promote to v16i32
20722 // and then truncate that. But we should only do that if we haven't been
20723 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
20724 // handled by isel patterns.
20725 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
20726 Subtarget.canExtendTo512DQ())
20727 return Op;
20728 }
20729
20730 // Handle truncation of V256 to V128 using shuffles.
20731 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
20732
20733 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
20734 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
20735 if (Subtarget.hasInt256()) {
20736 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
20737 In = DAG.getBitcast(MVT::v8i32, In);
20738 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
20739 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
20740 DAG.getIntPtrConstant(0, DL));
20741 }
20742
20743 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20744 DAG.getIntPtrConstant(0, DL));
20745 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20746 DAG.getIntPtrConstant(2, DL));
20747 static const int ShufMask[] = {0, 2, 4, 6};
20748 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
20749 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
20750 }
20751
20752 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
20753 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
20754 if (Subtarget.hasInt256()) {
20755 // The PSHUFB mask:
20756 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
20757 -1, -1, -1, -1, -1, -1, -1, -1,
20758 16, 17, 20, 21, 24, 25, 28, 29,
20759 -1, -1, -1, -1, -1, -1, -1, -1 };
20760 In = DAG.getBitcast(MVT::v32i8, In);
20761 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
20762 In = DAG.getBitcast(MVT::v4i64, In);
20763
20764 static const int ShufMask2[] = {0, 2, -1, -1};
20765 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
20766 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20767 DAG.getIntPtrConstant(0, DL));
20768 return DAG.getBitcast(MVT::v8i16, In);
20769 }
20770
20771 return Subtarget.hasSSE41()
20772 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
20773 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
20774 }
20775
20776 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
20777 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
20778
20779 llvm_unreachable("All 256->128 cases should have been handled above!");
20780}
20781
20782// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
20783// behaves on out of range inputs to generate optimized conversions.
20784 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
20785 SelectionDAG &DAG,
20786 const X86Subtarget &Subtarget) {
20787 MVT SrcVT = Src.getSimpleValueType();
20788 unsigned DstBits = VT.getScalarSizeInBits();
20789 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
20790
20791 // Calculate the converted result for values in the range 0 to
20792 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
20793 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
20794 SDValue Big =
20795 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
20796 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
20797 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
20798
20799 // The "CVTTP2SI" instruction conveniently sets the sign bit if
20800 // and only if the value was out of range. So we can use that
20801 // as our indicator that we rather use "Big" instead of "Small".
20802 //
20803 // Use "Small" if "IsOverflown" has all bits cleared
20804 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
20805
20806 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
20807 // use the slightly slower blendv select instead.
20808 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
20809 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
20810 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
20811 }
20812
20813 SDValue IsOverflown =
20814 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
20815 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
20816 return DAG.getNode(ISD::OR, dl, VT, Small,
20817 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
20818}
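// Worked example for the signsplat path above, converting the f32 value
// X = 2^31 + 256 (exactly representable) to u32:
//   Small = cvttp2si(X)         = 0x80000000 (out of range, sign bit set)
//   Big   = cvttp2si(X - 2^31)  = 0x00000100
//   IsOverflown = Small >> 31 (arithmetic) = 0xFFFFFFFF
//   Small | (Big & IsOverflown) = 0x80000100 = 2^31 + 256.
// For X < 2^31, Small is non-negative, IsOverflown is 0 and Small wins.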
20819
20820SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
20821 bool IsStrict = Op->isStrictFPOpcode();
20822 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
20823 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
20824 MVT VT = Op->getSimpleValueType(0);
20825 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20826 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
20827 MVT SrcVT = Src.getSimpleValueType();
20828 SDLoc dl(Op);
20829
20830 SDValue Res;
20831 if (isSoftF16(SrcVT, Subtarget)) {
20832 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20833 if (IsStrict)
20834 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
20835 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
20836 {NVT, MVT::Other}, {Chain, Src})});
20837 return DAG.getNode(Op.getOpcode(), dl, VT,
20838 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
20839 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
20840 return Op;
20841 }
20842
20843 if (VT.isVector()) {
20844 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
20845 MVT ResVT = MVT::v4i32;
20846 MVT TruncVT = MVT::v4i1;
20847 unsigned Opc;
20848 if (IsStrict)
20849 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
20850 else
20851 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
20852
20853 if (!IsSigned && !Subtarget.hasVLX()) {
20854 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
20855 // Widen to 512-bits.
20856 ResVT = MVT::v8i32;
20857 TruncVT = MVT::v8i1;
20858 Opc = Op.getOpcode();
20859 // Need to concat with zero vector for strict fp to avoid spurious
20860 // exceptions.
20861 // TODO: Should we just do this for non-strict as well?
20862 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
20863 : DAG.getUNDEF(MVT::v8f64);
20864 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
20865 DAG.getIntPtrConstant(0, dl));
20866 }
20867 if (IsStrict) {
20868 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
20869 Chain = Res.getValue(1);
20870 } else {
20871 Res = DAG.getNode(Opc, dl, ResVT, Src);
20872 }
20873
20874 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
20875 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
20876 DAG.getIntPtrConstant(0, dl));
20877 if (IsStrict)
20878 return DAG.getMergeValues({Res, Chain}, dl);
20879 return Res;
20880 }
20881
20882 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
20883 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
20884 return Op;
20885
20886 MVT ResVT = VT;
20887 MVT EleVT = VT.getVectorElementType();
20888 if (EleVT != MVT::i64)
20889 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
20890
20891 if (SrcVT != MVT::v8f16) {
20892 SDValue Tmp =
20893 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
20894 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
20895 Ops[0] = Src;
20896 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
20897 }
20898
20899 if (IsStrict) {
20900 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
20901 : X86ISD::STRICT_CVTTP2UI,
20902 dl, {ResVT, MVT::Other}, {Chain, Src});
20903 Chain = Res.getValue(1);
20904 } else {
20905 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
20906 ResVT, Src);
20907 }
20908
20909 // TODO: Need to add exception check code for strict FP.
20910 if (EleVT.getSizeInBits() < 16) {
20911 ResVT = MVT::getVectorVT(EleVT, 8);
20912 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
20913 }
20914
20915 if (ResVT != VT)
20916 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20917 DAG.getIntPtrConstant(0, dl));
20918
20919 if (IsStrict)
20920 return DAG.getMergeValues({Res, Chain}, dl);
20921 return Res;
20922 }
20923
20924 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
20925 if (VT.getVectorElementType() == MVT::i16) {
20926 assert((SrcVT.getVectorElementType() == MVT::f32 ||
20927 SrcVT.getVectorElementType() == MVT::f64) &&
20928 "Expected f32/f64 vector!");
20929 MVT NVT = VT.changeVectorElementType(MVT::i32);
20930 if (IsStrict) {
20931 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
20932 : ISD::STRICT_FP_TO_UINT,
20933 dl, {NVT, MVT::Other}, {Chain, Src});
20934 Chain = Res.getValue(1);
20935 } else {
20936 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
20937 NVT, Src);
20938 }
20939
20940 // TODO: Need to add exception check code for strict FP.
20941 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20942
20943 if (IsStrict)
20944 return DAG.getMergeValues({Res, Chain}, dl);
20945 return Res;
20946 }
20947
20948 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
20949 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
20950 assert(!IsSigned && "Expected unsigned conversion!");
20951 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
20952 return Op;
20953 }
20954
20955 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
20956 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
20957 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
20958 Subtarget.useAVX512Regs()) {
20959 assert(!IsSigned && "Expected unsigned conversion!");
20960 assert(!Subtarget.hasVLX() && "Unexpected features!");
20961 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20962 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20963 // Need to concat with zero vector for strict fp to avoid spurious
20964 // exceptions.
20965 // TODO: Should we just do this for non-strict as well?
20966 SDValue Tmp =
20967 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20968 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20969 DAG.getIntPtrConstant(0, dl));
20970
20971 if (IsStrict) {
20972 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
20973 {Chain, Src});
20974 Chain = Res.getValue(1);
20975 } else {
20976 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
20977 }
20978
20979 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20980 DAG.getIntPtrConstant(0, dl));
20981
20982 if (IsStrict)
20983 return DAG.getMergeValues({Res, Chain}, dl);
20984 return Res;
20985 }
20986
20987 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
20988 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
20989 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
20990 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
20991 assert(!Subtarget.hasVLX() && "Unexpected features!");
20992 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20993 // Need to concat with zero vector for strict fp to avoid spurious
20994 // exceptions.
20995 // TODO: Should we just do this for non-strict as well?
20996 SDValue Tmp =
20997 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20998 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20999 DAG.getIntPtrConstant(0, dl));
21000
21001 if (IsStrict) {
21002 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21003 {Chain, Src});
21004 Chain = Res.getValue(1);
21005 } else {
21006 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21007 }
21008
21009 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21010 DAG.getIntPtrConstant(0, dl));
21011
21012 if (IsStrict)
21013 return DAG.getMergeValues({Res, Chain}, dl);
21014 return Res;
21015 }
21016
21017 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21018 if (!Subtarget.hasVLX()) {
21019 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21020 // legalizer and then widened again by vector op legalization.
21021 if (!IsStrict)
21022 return SDValue();
21023
21024 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21025 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21026 {Src, Zero, Zero, Zero});
21027 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21028 {Chain, Tmp});
21029 SDValue Chain = Tmp.getValue(1);
21030 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21031 DAG.getIntPtrConstant(0, dl));
21032 return DAG.getMergeValues({Tmp, Chain}, dl);
21033 }
21034
21035 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21036 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21037 DAG.getUNDEF(MVT::v2f32));
21038 if (IsStrict) {
21039 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21040 : X86ISD::STRICT_CVTTP2UI;
21041 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21042 }
21043 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21044 return DAG.getNode(Opc, dl, VT, Tmp);
21045 }
21046
21047 // Generate optimized instructions for pre AVX512 unsigned conversions from
21048 // vXf32 to vXi32.
21049 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21050 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21051 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21052 assert(!IsSigned && "Expected unsigned conversion!");
21053 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21054 }
21055
21056 return SDValue();
21057 }
21058
21059 assert(!VT.isVector());
21060
21061 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21062
21063 if (!IsSigned && UseSSEReg) {
21064 // Conversions from f32/f64 with AVX512 should be legal.
21065 if (Subtarget.hasAVX512())
21066 return Op;
21067
21068 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21069 // behaves on out of range inputs to generate optimized conversions.
21070 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21071 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21072 unsigned DstBits = VT.getScalarSizeInBits();
21073 APInt UIntLimit = APInt::getSignMask(DstBits);
21074 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21075 DAG.getConstant(UIntLimit, dl, VT));
21076 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21077
21078 // Calculate the converted result for values in the range:
21079 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21080 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21081 SDValue Small =
21082 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21083 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21084 SDValue Big = DAG.getNode(
21085 X86ISD::CVTTS2SI, dl, VT,
21086 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21087 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21088
21089 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21090 // and only if the value was out of range. So we can use that
21091 // as our indicator that we rather use "Big" instead of "Small".
21092 //
21093 // Use "Small" if "IsOverflown" has all bits cleared
21094 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21095 SDValue IsOverflown = DAG.getNode(
21096 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21097 return DAG.getNode(ISD::OR, dl, VT, Small,
21098 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21099 }
21100
21101 // Use default expansion for i64.
21102 if (VT == MVT::i64)
21103 return SDValue();
21104
21105 assert(VT == MVT::i32 && "Unexpected VT!");
21106
21107 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21108 // FIXME: This does not generate an invalid exception if the input does not
21109 // fit in i32. PR44019
21110 if (Subtarget.is64Bit()) {
21111 if (IsStrict) {
21112 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21113 {Chain, Src});
21114 Chain = Res.getValue(1);
21115 } else
21116 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21117
21118 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21119 if (IsStrict)
21120 return DAG.getMergeValues({Res, Chain}, dl);
21121 return Res;
21122 }
21123
21124 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21125 // use fisttp which will be handled later.
21126 if (!Subtarget.hasSSE3())
21127 return SDValue();
21128 }
21129
21130 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21131 // FIXME: This does not generate an invalid exception if the input does not
21132 // fit in i16. PR44019
21133 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21134 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21135 if (IsStrict) {
21136 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21137 {Chain, Src});
21138 Chain = Res.getValue(1);
21139 } else
21140 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21141
21142 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21143 if (IsStrict)
21144 return DAG.getMergeValues({Res, Chain}, dl);
21145 return Res;
21146 }
21147
21148 // If this is a FP_TO_SINT using SSEReg we're done.
21149 if (UseSSEReg && IsSigned)
21150 return Op;
21151
21152 // fp128 needs to use a libcall.
21153 if (SrcVT == MVT::f128) {
21154 RTLIB::Libcall LC;
21155 if (IsSigned)
21156 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21157 else
21158 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21159
21160 MakeLibCallOptions CallOptions;
21161 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
21162 SDLoc(Op), Chain);
21163
21164 if (IsStrict)
21165 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21166
21167 return Tmp.first;
21168 }
21169
21170 // Fall back to X87.
21171 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21172 if (IsStrict)
21173 return DAG.getMergeValues({V, Chain}, dl);
21174 return V;
21175 }
21176
21177 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21178}
21179
21180SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21181 SelectionDAG &DAG) const {
21182 SDValue Src = Op.getOperand(0);
21183 EVT DstVT = Op.getSimpleValueType();
21184 MVT SrcVT = Src.getSimpleValueType();
21185
21186 if (SrcVT.isVector())
21187 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21188
21189 if (SrcVT == MVT::f16)
21190 return SDValue();
21191
21192 // If the source is in an SSE register, the node is Legal.
21193 if (isScalarFPTypeInSSEReg(SrcVT))
21194 return Op;
21195
21196 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21197}
21198
21199SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21200 SelectionDAG &DAG) const {
21201 EVT DstVT = N->getValueType(0);
21202 SDValue Src = N->getOperand(0);
21203 EVT SrcVT = Src.getValueType();
21204
21205 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21206 // f16 must be promoted before using the lowering in this routine.
21207 // fp128 does not use this lowering.
21208 return SDValue();
21209 }
21210
21211 SDLoc DL(N);
21212 SDValue Chain = DAG.getEntryNode();
21213
21214 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21215
21216 // If we're converting from SSE, the stack slot needs to hold both types.
21217 // Otherwise it only needs to hold the DstVT.
21218 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21219 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21220 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21221 MachinePointerInfo MPI =
21222 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21223
21224 if (UseSSE) {
21225 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21226 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21227 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21228 SDValue Ops[] = { Chain, StackPtr };
21229
21230 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21231 /*Align*/ std::nullopt,
21232 MachineMemOperand::MOLoad);
21233 Chain = Src.getValue(1);
21234 }
21235
21236 SDValue StoreOps[] = { Chain, Src, StackPtr };
21237 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21238 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
21239 MachineMemOperand::MOStore);
21240
21241 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21242}
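// Minimal sketch of the sequence built above for an SSE f64 source: the
// value is spilled to the stack slot, reloaded onto the x87 stack with an
// 80-bit FLD, converted with FIST/FISTP (which rounds using the current
// rounding mode, matching lrint/llrint semantics), and the integer result
// is loaded back from the same slot.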
21243
21244SDValue
21245X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21246 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21247 // but making use of X86 specifics to produce better instruction sequences.
21248 SDNode *Node = Op.getNode();
21249 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21250 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
21251 SDLoc dl(SDValue(Node, 0));
21252 SDValue Src = Node->getOperand(0);
21253
21254 // There are three types involved here: SrcVT is the source floating point
21255 // type, DstVT is the type of the result, and TmpVT is the result of the
21256 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
21257 // DstVT).
21258 EVT SrcVT = Src.getValueType();
21259 EVT DstVT = Node->getValueType(0);
21260 EVT TmpVT = DstVT;
21261
21262 // This code is only for floats and doubles. Fall back to generic code for
21263 // anything else.
21264 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
21265 return SDValue();
21266
21267 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21268 unsigned SatWidth = SatVT.getScalarSizeInBits();
21269 unsigned DstWidth = DstVT.getScalarSizeInBits();
21270 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
21271 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21272 "Expected saturation width smaller than result width");
21273
21274 // Promote result of FP_TO_*INT to at least 32 bits.
21275 if (TmpWidth < 32) {
21276 TmpVT = MVT::i32;
21277 TmpWidth = 32;
21278 }
21279
21280 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21281 // us to use a native signed conversion instead.
21282 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21283 TmpVT = MVT::i64;
21284 TmpWidth = 64;
21285 }
21286
21287 // If the saturation width is smaller than the size of the temporary result,
21288 // we can always use signed conversion, which is native.
21289 if (SatWidth < TmpWidth)
21290 FpToIntOpcode = ISD::FP_TO_SINT;
21291
21292 // Determine minimum and maximum integer values and their corresponding
21293 // floating-point values.
21294 APInt MinInt, MaxInt;
21295 if (IsSigned) {
21296 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
21297 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
21298 } else {
21299 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
21300 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
21301 }
21302
21303 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21304 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21305
21306 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21307 MinInt, IsSigned, APFloat::rmTowardZero);
21308 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21309 MaxInt, IsSigned, APFloat::rmTowardZero);
21310 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21311 && !(MaxStatus & APFloat::opStatus::opInexact);
21312
21313 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21314 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21315
21316 // If the integer bounds are exactly representable as floats, emit a
21317 // min+max+fptoi sequence. Otherwise use comparisons and selects.
21318 if (AreExactFloatBounds) {
21319 if (DstVT != TmpVT) {
21320 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21321 SDValue MinClamped = DAG.getNode(
21322 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21323 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21324 SDValue BothClamped = DAG.getNode(
21325 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21326 // Convert clamped value to integer.
21327 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21328
21329 // NaN will become INDVAL, with the top bit set and the rest zero.
21330 // Truncation will discard the top bit, resulting in zero.
21331 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21332 }
21333
21334 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21335 SDValue MinClamped = DAG.getNode(
21336 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21337 // Clamp by MaxFloat from above. NaN cannot occur.
21338 SDValue BothClamped = DAG.getNode(
21339 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21340 // Convert clamped value to integer.
21341 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21342
21343 if (!IsSigned) {
21344 // In the unsigned case we're done, because we mapped NaN to MinFloat,
21345 // which is zero.
21346 return FpToInt;
21347 }
21348
21349 // Otherwise, select zero if Src is NaN.
21350 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21351 return DAG.getSelectCC(
21352 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21353 }
21354
21355 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21356 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21357
21358 // Result of direct conversion, which may be selected away.
21359 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21360
21361 if (DstVT != TmpVT) {
21362 // NaN will become INDVAL, with the top bit set and the rest zero.
21363 // Truncation will discard the top bit, resulting in zero.
21364 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21365 }
21366
21367 SDValue Select = FpToInt;
21368 // For signed conversions where we saturate to the same size as the
21369 // result type of the fptoi instructions, INDVAL coincides with integer
21370 // minimum, so we don't need to explicitly check it.
21371 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21372 // If Src ULT MinFloat, select MinInt. In particular, this also selects
21373 // MinInt if Src is NaN.
21374 Select = DAG.getSelectCC(
21375 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21376 }
21377
21378 // If Src OGT MaxFloat, select MaxInt.
21379 Select = DAG.getSelectCC(
21380 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21381
21382 // In the unsigned case we are done, because we mapped NaN to MinInt, which
21383 // is already zero. The promoted case was already handled above.
21384 if (!IsSigned || DstVT != TmpVT) {
21385 return Select;
21386 }
21387
21388 // Otherwise, select 0 if Src is NaN.
21389 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21390 return DAG.getSelectCC(
21391 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21392}
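// Illustrative example of the exact-bounds path above: for a scalar
// fptosi.sat.i8 from f32 the bounds [-128, 127] are exactly representable,
// DstVT (i8) is promoted to the i32 TmpVT, and the lowering is a clamp via
// FMAX(-128.0, x) and FMIN(127.0, ...) followed by a signed convert and an
// i8 truncate; a NaN input survives the clamps, converts to INDVAL
// (0x80000000), and the truncation then yields the required 0.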
21393
21394SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21395 bool IsStrict = Op->isStrictFPOpcode();
21396
21397 SDLoc DL(Op);
21398 MVT VT = Op.getSimpleValueType();
21399 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21400 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21401 MVT SVT = In.getSimpleValueType();
21402
21403 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
21404 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
21405 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
21406 !Subtarget.getTargetTriple().isOSDarwin()))
21407 return SDValue();
21408
21409 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
21410 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
21411 return Op;
21412
21413 if (SVT == MVT::f16) {
21414 if (Subtarget.hasFP16())
21415 return Op;
21416
21417 if (VT != MVT::f32) {
21418 if (IsStrict)
21419 return DAG.getNode(
21420 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
21421 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
21422 {MVT::f32, MVT::Other}, {Chain, In})});
21423
21424 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
21425 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
21426 }
21427
21428 if (!Subtarget.hasF16C()) {
21429 if (!Subtarget.getTargetTriple().isOSDarwin())
21430 return SDValue();
21431
21432 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
21433
21434 // Need a libcall, but ABI for f16 is soft-float on MacOS.
21435 TargetLowering::CallLoweringInfo CLI(DAG);
21436 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21437
21438 In = DAG.getBitcast(MVT::i16, In);
21439 TargetLowering::ArgListTy Args;
21440 TargetLowering::ArgListEntry Entry;
21441 Entry.Node = In;
21442 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
21443 Entry.IsSExt = false;
21444 Entry.IsZExt = true;
21445 Args.push_back(Entry);
21446
21447 SDValue Callee = DAG.getExternalSymbol(
21448 getLibcallName(RTLIB::FPEXT_F16_F32),
21449 getPointerTy(DAG.getDataLayout()));
21450 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21451 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
21452 std::move(Args));
21453
21454 SDValue Res;
21455 std::tie(Res,Chain) = LowerCallTo(CLI);
21456 if (IsStrict)
21457 Res = DAG.getMergeValues({Res, Chain}, DL);
21458
21459 return Res;
21460 }
21461
21462 In = DAG.getBitcast(MVT::i16, In);
21463 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
21464 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
21465 DAG.getIntPtrConstant(0, DL));
21466 SDValue Res;
21467 if (IsStrict) {
21468 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
21469 {Chain, In});
21470 Chain = Res.getValue(1);
21471 } else {
21472 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
21473 DAG.getTargetConstant(4, DL, MVT::i32));
21474 }
21475 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
21476 DAG.getIntPtrConstant(0, DL));
21477 if (IsStrict)
21478 return DAG.getMergeValues({Res, Chain}, DL);
21479 return Res;
21480 }
21481
21482 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
21483 return Op;
21484
21485 if (SVT.getVectorElementType() == MVT::f16) {
21486 if (Subtarget.hasFP16() && isTypeLegal(SVT))
21487 return Op;
21488 assert(Subtarget.hasF16C() && "Unexpected features!");
21489 if (SVT == MVT::v2f16)
21490 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
21491 DAG.getUNDEF(MVT::v2f16));
21492 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
21493 DAG.getUNDEF(MVT::v4f16));
21494 if (IsStrict)
21495 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21496 {Op->getOperand(0), Res});
21497 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21498 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
21499 return Op;
21500 }
21501
21502 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21503
21504 SDValue Res =
21505 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21506 if (IsStrict)
21507 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21508 {Op->getOperand(0), Res});
21509 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21510}
21511
21512SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21513 bool IsStrict = Op->isStrictFPOpcode();
21514
21515 SDLoc DL(Op);
21516 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21517 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21518 MVT VT = Op.getSimpleValueType();
21519 MVT SVT = In.getSimpleValueType();
21520
21521 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
21522 return SDValue();
21523
21524 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
21525 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
21526 if (!Subtarget.getTargetTriple().isOSDarwin())
21527 return SDValue();
21528
21529 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
21530 TargetLowering::CallLoweringInfo CLI(DAG);
21531 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21532
21533 TargetLowering::ArgListTy Args;
21534 TargetLowering::ArgListEntry Entry;
21535 Entry.Node = In;
21536 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
21537 Entry.IsSExt = false;
21538 Entry.IsZExt = true;
21539 Args.push_back(Entry);
21540
21541 SDValue Callee = DAG.getExternalSymbol(
21542 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
21543 : RTLIB::FPROUND_F32_F16),
21545 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21546 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
21547 std::move(Args));
21548
21549 SDValue Res;
21550 std::tie(Res, Chain) = LowerCallTo(CLI);
21551
21552 Res = DAG.getBitcast(MVT::f16, Res);
21553
21554 if (IsStrict)
21555 Res = DAG.getMergeValues({Res, Chain}, DL);
21556
21557 return Res;
21558 }
21559
21560 if (VT.getScalarType() == MVT::bf16) {
21561 if (SVT.getScalarType() == MVT::f32 &&
21562 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
21563 Subtarget.hasAVXNECONVERT()))
21564 return Op;
21565 return SDValue();
21566 }
21567
21568 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
21569 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
21570 return SDValue();
21571
21572 if (VT.isVector())
21573 return Op;
21574
21575 SDValue Res;
    SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
21577 MVT::i32);
21578 if (IsStrict) {
21579 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
21580 DAG.getConstantFP(0, DL, MVT::v4f32), In,
21581 DAG.getIntPtrConstant(0, DL));
21582 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
21583 {Chain, Res, Rnd});
21584 Chain = Res.getValue(1);
21585 } else {
21586 // FIXME: Should we use zeros for upper elements for non-strict?
21587 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
21588 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
21589 }
21590
21591 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
21592 DAG.getIntPtrConstant(0, DL));
21593 Res = DAG.getBitcast(MVT::f16, Res);
21594
21595 if (IsStrict)
21596 return DAG.getMergeValues({Res, Chain}, DL);
21597
21598 return Res;
21599 }
21600
21601 return Op;
21602}
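// Illustrative sketch (not part of the original lowering code): with F16C but
// without AVX512-FP16, the scalar f32 -> f16 truncation above is expected to
// become approximately:
//   t0: v4f32 = scalar_to_vector %x
//   t1: v8i16 = X86ISD::CVTPS2PH t0, 4         ; vcvtps2ph, imm 4 = use MXCSR
//   t2: i16   = extract_vector_elt t1, 0
//   res: f16  = bitcast t2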
21603
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
21605 bool IsStrict = Op->isStrictFPOpcode();
21606 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21607 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
21608 "Unexpected VT!");
21609
21610 SDLoc dl(Op);
21611 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21612 DAG.getConstant(0, dl, MVT::v8i16), Src,
21613 DAG.getIntPtrConstant(0, dl));
21614
21615 SDValue Chain;
21616 if (IsStrict) {
21617 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21618 {Op.getOperand(0), Res});
21619 Chain = Res.getValue(1);
21620 } else {
21621 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21622 }
21623
21624 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21625 DAG.getIntPtrConstant(0, dl));
21626
21627 if (IsStrict)
21628 return DAG.getMergeValues({Res, Chain}, dl);
21629
21630 return Res;
21631}
21632
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
21634 bool IsStrict = Op->isStrictFPOpcode();
21635 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21636 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
21637 "Unexpected VT!");
21638
21639 SDLoc dl(Op);
21640 SDValue Res, Chain;
21641 if (IsStrict) {
21642 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21643 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21644 DAG.getIntPtrConstant(0, dl));
21645 Res = DAG.getNode(
21646 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21647 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21648 Chain = Res.getValue(1);
21649 } else {
21650 // FIXME: Should we use zeros for upper elements for non-strict?
21651 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21652 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21653 DAG.getTargetConstant(4, dl, MVT::i32));
21654 }
21655
21656 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21657 DAG.getIntPtrConstant(0, dl));
21658
21659 if (IsStrict)
21660 return DAG.getMergeValues({Res, Chain}, dl);
21661
21662 return Res;
21663}
21664
21665SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
21666 SelectionDAG &DAG) const {
21667 SDLoc DL(Op);
21668
21669 MVT SVT = Op.getOperand(0).getSimpleValueType();
21670 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
21671 Subtarget.hasAVXNECONVERT())) {
21672 SDValue Res;
21673 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
21674 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
21675 Res = DAG.getBitcast(MVT::v8i16, Res);
21676 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
21677 DAG.getIntPtrConstant(0, DL));
21678 }
21679
21680 MakeLibCallOptions CallOptions;
21681 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
21682 SDValue Res =
21683 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
21684 return DAG.getBitcast(MVT::i16, Res);
21685}
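// Illustrative sketch (not part of the original lowering code): with
// AVX-NE-CONVERT (or AVX512-BF16 + VLX) the FP_TO_BF16 fast path above is
// expected to look roughly like:
//   t0: v4f32  = scalar_to_vector %x
//   t1: v8bf16 = X86ISD::CVTNEPS2BF16 t0       ; vcvtneps2bf16
//   res: i16   = extract_vector_elt (bitcast t1 to v8i16), 0
// Otherwise the conversion falls back to the RTLIB FPROUND libcall emitted by
// makeLibCall.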
21686
21687/// Depending on uarch and/or optimizing for size, we might prefer to use a
21688/// vector operation in place of the typical scalar operation.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
21690 SelectionDAG &DAG,
21691 const X86Subtarget &Subtarget) {
21692 // If both operands have other uses, this is probably not profitable.
21693 SDValue LHS = Op.getOperand(0);
21694 SDValue RHS = Op.getOperand(1);
21695 if (!LHS.hasOneUse() && !RHS.hasOneUse())
21696 return Op;
21697
21698 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
21699 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
21700 if (IsFP && !Subtarget.hasSSE3())
21701 return Op;
21702 if (!IsFP && !Subtarget.hasSSSE3())
21703 return Op;
21704
21705 // Extract from a common vector.
21706 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21707 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21708 LHS.getOperand(0) != RHS.getOperand(0) ||
21709 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
21710 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
21711 !shouldUseHorizontalOp(true, DAG, Subtarget))
21712 return Op;
21713
21714 // Allow commuted 'hadd' ops.
21715 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
21716 unsigned HOpcode;
21717 switch (Op.getOpcode()) {
21718 // clang-format off
21719 case ISD::ADD: HOpcode = X86ISD::HADD; break;
21720 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
21721 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
21722 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
21723 default:
21724 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
21725 // clang-format on
21726 }
21727 unsigned LExtIndex = LHS.getConstantOperandVal(1);
21728 unsigned RExtIndex = RHS.getConstantOperandVal(1);
21729 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
21730 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
21731 std::swap(LExtIndex, RExtIndex);
21732
21733 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
21734 return Op;
21735
21736 SDValue X = LHS.getOperand(0);
21737 EVT VecVT = X.getValueType();
21738 unsigned BitWidth = VecVT.getSizeInBits();
21739 unsigned NumLanes = BitWidth / 128;
21740 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
21741 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
21742 "Not expecting illegal vector widths here");
21743
21744 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21745 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21746 if (BitWidth == 256 || BitWidth == 512) {
21747 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
21748 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
21749 LExtIndex %= NumEltsPerLane;
21750 }
21751
21752 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21753 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21754 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21755 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
21756 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
21757 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
21758 DAG.getIntPtrConstant(LExtIndex / 2, DL));
21759}
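// Illustrative sketch (not part of the original lowering code): for a v4f32
// source on an SSE3 target, the transform above turns
//   (fadd (extract_vector_elt X, 0), (extract_vector_elt X, 1))
// into
//   (extract_vector_elt (X86ISD::FHADD X, X), 0)
// which is expected to select to a single 'haddps' of the source with itself
// instead of a shuffle plus a scalar addss.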
21760
21761/// Depending on uarch and/or optimizing for size, we might prefer to use a
21762/// vector operation in place of the typical scalar operation.
21763SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21764 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
21765 "Only expecting float/double");
21766 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
21767}
21768
21769/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21770/// This mode isn't supported in hardware on X86. But as long as we aren't
21771/// compiling with trapping math, we can emulate this with
21772/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
21774 SDValue N0 = Op.getOperand(0);
21775 SDLoc dl(Op);
21776 MVT VT = Op.getSimpleValueType();
21777
21778 // N0 += copysign(nextafter(0.5, 0.0), N0)
  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21780 bool Ignored;
21781 APFloat Point5Pred = APFloat(0.5f);
21782 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
21783 Point5Pred.next(/*nextDown*/true);
21784
21785 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
21786 DAG.getConstantFP(Point5Pred, dl, VT), N0);
21787 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
21788
21789 // Truncate the result to remove fraction.
21790 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
21791}
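// Worked example (illustrative, not part of the original lowering code), for
// f32: nextafter(0.5f, 0.0f) is 0x1.fffffep-2 (~0.49999997f). For X = 2.5f,
// 2.5f + 0.49999997f rounds (to nearest) to 3.0f and trunc(3.0f) == 3.0f,
// matching FROUND's round-half-away-from-zero behaviour, while a value just
// below the halfway point such as 2.4999998f still truncates down to 2.0f.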
21792
21793/// The only differences between FABS and FNEG are the mask and the logic op.
21794/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
21796 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
21797 "Wrong opcode for lowering FABS or FNEG.");
21798
21799 bool IsFABS = (Op.getOpcode() == ISD::FABS);
21800
21801 // If this is a FABS and it has an FNEG user, bail out to fold the combination
21802 // into an FNABS. We'll lower the FABS after that if it is still in use.
21803 if (IsFABS)
21804 for (SDNode *User : Op->uses())
21805 if (User->getOpcode() == ISD::FNEG)
21806 return Op;
21807
21808 SDLoc dl(Op);
21809 MVT VT = Op.getSimpleValueType();
21810
21811 bool IsF128 = (VT == MVT::f128);
21812 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21814 "Unexpected type in LowerFABSorFNEG");
21815
21816 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
21817 // decide if we should generate a 16-byte constant mask when we only need 4 or
21818 // 8 bytes for the scalar case.
21819
21820 // There are no scalar bitwise logical SSE/AVX instructions, so we
21821 // generate a 16-byte vector constant and logic op even for the scalar case.
21822 // Using a 16-byte mask allows folding the load of the mask with
21823 // the logic op, which can save roughly 4 bytes of code size.
21824 bool IsFakeVector = !VT.isVector() && !IsF128;
21825 MVT LogicVT = VT;
21826 if (IsFakeVector)
21827 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21828 : (VT == MVT::f32) ? MVT::v4f32
21829 : MVT::v8f16;
21830
21831 unsigned EltBits = VT.getScalarSizeInBits();
21832 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
21833 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
21834 APInt::getSignMask(EltBits);
  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21836 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
21837
21838 SDValue Op0 = Op.getOperand(0);
21839 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
21840 unsigned LogicOp = IsFABS ? X86ISD::FAND :
21841 IsFNABS ? X86ISD::FOR :
                            X86ISD::FXOR;
21843 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
21844
21845 if (VT.isVector() || IsF128)
21846 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21847
21848 // For the scalar case extend to a 128-bit vector, perform the logic op,
21849 // and extract the scalar result back out.
21850 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
21851 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21852 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
21853 DAG.getIntPtrConstant(0, dl));
21854}
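// Illustrative sketch (not part of the original lowering code): for scalar
// f32 the lowering above is expected to select to roughly
//   fabs(x):        andps xmm0, <0x7fffffff x4>   ; clear the sign bit
//   fneg(x):        xorps xmm0, <0x80000000 x4>   ; flip the sign bit
//   fneg(fabs(x)):  orps  xmm0, <0x80000000 x4>   ; force the sign bit (FNABS)
// with the 16-byte mask materialized as a constant-pool load.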
21855
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
21857 SDValue Mag = Op.getOperand(0);
21858 SDValue Sign = Op.getOperand(1);
21859 SDLoc dl(Op);
21860
21861 // If the sign operand is smaller, extend it first.
21862 MVT VT = Op.getSimpleValueType();
21863 if (Sign.getSimpleValueType().bitsLT(VT))
21864 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
21865
21866 // And if it is bigger, shrink it first.
21867 if (Sign.getSimpleValueType().bitsGT(VT))
21868 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
21869 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
21870
21871 // At this point the operands and the result should have the same
21872 // type, and that won't be f80 since that is not custom lowered.
21873 bool IsF128 = (VT == MVT::f128);
21874 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21876 "Unexpected type in LowerFCOPYSIGN");
21877
  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21879
21880 // Perform all scalar logic operations as 16-byte vectors because there are no
21881 // scalar FP logic instructions in SSE.
21882 // TODO: This isn't necessary. If we used scalar types, we might avoid some
21883 // unnecessary splats, but we might miss load folding opportunities. Should
21884 // this decision be based on OptimizeForSize?
21885 bool IsFakeVector = !VT.isVector() && !IsF128;
21886 MVT LogicVT = VT;
21887 if (IsFakeVector)
21888 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21889 : (VT == MVT::f32) ? MVT::v4f32
21890 : MVT::v8f16;
21891
21892 // The mask constants are automatically splatted for vector types.
21893 unsigned EltSizeInBits = VT.getScalarSizeInBits();
21894 SDValue SignMask = DAG.getConstantFP(
21895 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
21896 SDValue MagMask = DAG.getConstantFP(
21897 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
21898
21899 // First, clear all bits but the sign bit from the second operand (sign).
21900 if (IsFakeVector)
21901 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
21902 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
21903
21904 // Next, clear the sign bit from the first operand (magnitude).
21905 // TODO: If we had general constant folding for FP logic ops, this check
21906 // wouldn't be necessary.
21907 SDValue MagBits;
21908 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
21909 APFloat APF = Op0CN->getValueAPF();
21910 APF.clearSign();
21911 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
21912 } else {
21913 // If the magnitude operand wasn't a constant, we need to AND out the sign.
21914 if (IsFakeVector)
21915 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
21916 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
21917 }
21918
21919 // OR the magnitude value with the sign bit.
21920 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
21921 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
21922 DAG.getIntPtrConstant(0, dl));
21923}
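// Illustrative sketch (not part of the original lowering code): for f32
// operands the sequence above computes the usual bit recipe
//   copysign(mag, sgn) = (bits(mag) & 0x7fffffff) | (bits(sgn) & 0x80000000)
// using FAND/FOR on values widened to v4f32, then extracts element 0.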
21924
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
21926 SDValue N0 = Op.getOperand(0);
21927 SDLoc dl(Op);
21928 MVT VT = Op.getSimpleValueType();
21929
21930 MVT OpVT = N0.getSimpleValueType();
21931 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
21932 "Unexpected type for FGETSIGN");
21933
21934 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
21935 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
21936 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
21937 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
21938 Res = DAG.getZExtOrTrunc(Res, dl, VT);
21939 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
21940 return Res;
21941}
21942
21943/// Helper for attempting to create a X86ISD::BT node.
21944static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
21945 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
21946 // instruction. Since the shift amount is in-range-or-undefined, we know
21947 // that doing a bittest on the i32 value is ok. We extend to i32 because
21948 // the encoding for the i16 version is larger than the i32 version.
21949 // Also promote i16 to i32 for performance / code size reasons.
21950 if (Src.getValueType().getScalarSizeInBits() < 32)
21951 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
21952
21953 // No legal type found, give up.
21954 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
21955 return SDValue();
21956
21957 // See if we can use the 32-bit instruction instead of the 64-bit one for a
21958 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
21959 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
21960 // known to be zero.
21961 if (Src.getValueType() == MVT::i64 &&
21962 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
21963 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
21964
21965 // If the operand types disagree, extend the shift amount to match. Since
21966 // BT ignores high bits (like shifts) we can use anyextend.
21967 if (Src.getValueType() != BitNo.getValueType()) {
21968 // Peek through a mask/modulo operation.
21969 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
21970 // we probably need a better IsDesirableToPromoteOp to handle this as well.
21971 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
21972 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
21973 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
21974 BitNo.getOperand(0)),
21975 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
21976 BitNo.getOperand(1)));
21977 else
21978 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
21979 }
21980
21981 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
21982}
21983
21984/// Helper for creating an X86ISD::SETCC node.
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
21986 SelectionDAG &DAG) {
21987 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
21988 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
21989}
21990
21991/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
21992/// recognizable memcmp expansion.
21993static bool isOrXorXorTree(SDValue X, bool Root = true) {
21994 if (X.getOpcode() == ISD::OR)
21995 return isOrXorXorTree(X.getOperand(0), false) &&
21996 isOrXorXorTree(X.getOperand(1), false);
21997 if (Root)
21998 return false;
21999 return X.getOpcode() == ISD::XOR;
22000}
22001
22002/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22003/// expansion.
22004template <typename F>
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22006 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22007 SDValue Op0 = X.getOperand(0);
22008 SDValue Op1 = X.getOperand(1);
22009 if (X.getOpcode() == ISD::OR) {
22010 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22011 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22012 if (VecVT != CmpVT)
22013 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22014 if (HasPT)
22015 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22016 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22017 }
22018 if (X.getOpcode() == ISD::XOR) {
22019 SDValue A = SToV(Op0);
22020 SDValue B = SToV(Op1);
22021 if (VecVT != CmpVT)
22022 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22023 if (HasPT)
22024 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22025 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22026 }
22027 llvm_unreachable("Impossible");
22028}
22029
22030/// Try to map a 128-bit or larger integer comparison to vector instructions
22031/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
                                               ISD::CondCode CC,
22034 const SDLoc &DL,
22035 SelectionDAG &DAG,
22036 const X86Subtarget &Subtarget) {
22037 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22038
22039 // We're looking for an oversized integer equality comparison.
22040 EVT OpVT = X.getValueType();
22041 unsigned OpSize = OpVT.getSizeInBits();
22042 if (!OpVT.isScalarInteger() || OpSize < 128)
22043 return SDValue();
22044
22045 // Ignore a comparison with zero because that gets special treatment in
22046 // EmitTest(). But make an exception for the special case of a pair of
22047 // logically-combined vector-sized operands compared to zero. This pattern may
22048 // be generated by the memcmp expansion pass with oversized integer compares
22049 // (see PR33325).
22050 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22051 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22052 return SDValue();
22053
22054 // Don't perform this combine if constructing the vector will be expensive.
22055 auto IsVectorBitCastCheap = [](SDValue X) {
    X = peekThroughBitcasts(X);
22057 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22058 X.getOpcode() == ISD::LOAD;
22059 };
22060 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22061 !IsOrXorXorTreeCCZero)
22062 return SDValue();
22063
22064 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22065 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22066 // Otherwise use PCMPEQ (plus AND) and mask testing.
22067 bool NoImplicitFloatOps =
      DAG.getMachineFunction().getFunction().hasFnAttribute(
22069 Attribute::NoImplicitFloat);
22070 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22071 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22072 (OpSize == 256 && Subtarget.hasAVX()) ||
22073 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22074 bool HasPT = Subtarget.hasSSE41();
22075
22076 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and widened
22077 // vector registers are essentially free. (Technically, widening registers
22078 // prevents load folding, but the tradeoff is worth it.)
22079 bool PreferKOT = Subtarget.preferMaskRegisters();
22080 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22081
22082 EVT VecVT = MVT::v16i8;
22083 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22084 if (OpSize == 256) {
22085 VecVT = MVT::v32i8;
22086 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22087 }
22088 EVT CastVT = VecVT;
22089 bool NeedsAVX512FCast = false;
22090 if (OpSize == 512 || NeedZExt) {
22091 if (Subtarget.hasBWI()) {
22092 VecVT = MVT::v64i8;
22093 CmpVT = MVT::v64i1;
22094 if (OpSize == 512)
22095 CastVT = VecVT;
22096 } else {
22097 VecVT = MVT::v16i32;
22098 CmpVT = MVT::v16i1;
22099 CastVT = OpSize == 512 ? VecVT
22100 : OpSize == 256 ? MVT::v8i32
22101 : MVT::v4i32;
22102 NeedsAVX512FCast = true;
22103 }
22104 }
22105
22106 auto ScalarToVector = [&](SDValue X) -> SDValue {
22107 bool TmpZext = false;
22108 EVT TmpCastVT = CastVT;
22109 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22110 SDValue OrigX = X.getOperand(0);
22111 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22112 if (OrigSize < OpSize) {
22113 if (OrigSize == 128) {
22114 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22115 X = OrigX;
22116 TmpZext = true;
22117 } else if (OrigSize == 256) {
22118 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22119 X = OrigX;
22120 TmpZext = true;
22121 }
22122 }
22123 }
22124 X = DAG.getBitcast(TmpCastVT, X);
22125 if (!NeedZExt && !TmpZext)
22126 return X;
22127 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22128 DAG.getConstant(0, DL, VecVT), X,
22129 DAG.getVectorIdxConstant(0, DL));
22130 };
22131
22132 SDValue Cmp;
22133 if (IsOrXorXorTreeCCZero) {
22134 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22135 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22136 // Use 2 vector equality compares and 'and' the results before doing a
22137 // MOVMSK.
22138 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22139 } else {
22140 SDValue VecX = ScalarToVector(X);
22141 SDValue VecY = ScalarToVector(Y);
22142 if (VecVT != CmpVT) {
22143 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22144 } else if (HasPT) {
22145 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22146 } else {
22147 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22148 }
22149 }
22150 // AVX512 should emit a setcc that will lower to kortest.
22151 if (VecVT != CmpVT) {
22152 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22153 : CmpVT == MVT::v32i1 ? MVT::i32
22154 : MVT::i16;
22155 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22156 DAG.getConstant(0, DL, KRegVT), CC);
22157 }
22158 if (HasPT) {
22159 SDValue BCCmp =
22160 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22161 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
      X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22163 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22164 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22165 }
22166 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22167 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22168 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22169 assert(Cmp.getValueType() == MVT::v16i8 &&
22170 "Non 128-bit vector on pre-SSE41 target");
22171 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22172 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22173 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22174 }
22175
22176 return SDValue();
22177}
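// Illustrative sketch (not part of the original combine): for a 16-byte
// memcmp-style equality such as (setcc i128 (load %p), (load %q), eq), the
// combine above is expected to produce, depending on the subtarget:
//   SSE4.1+: (v)pxor of the two loads, then ptest + sete
//   SSE2:    pcmpeqb, pmovmskb, cmp $0xffff, then sete
// instead of splitting the i128 compare into two 64-bit scalar compares.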
22178
22179/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22180/// style scalarized (associative) reduction patterns. Partial reductions
22181/// are supported when the pointer SrcMask is non-null.
22182/// TODO - move this to SelectionDAG?
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
                                 SmallVectorImpl<SDValue> &SrcOps,
22185 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22187 DenseMap<SDValue, APInt> SrcOpMap;
22188 EVT VT = MVT::Other;
22189
22190 // Recognize a special case where a vector is cast into a wide integer to
22191 // test all 0s.
22192 assert(Op.getOpcode() == unsigned(BinOp) &&
22193 "Unexpected bit reduction opcode");
22194 Opnds.push_back(Op.getOperand(0));
22195 Opnds.push_back(Op.getOperand(1));
22196
22197 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SDValue I = Opnds[Slot];
22199 // BFS traverse all BinOp operands.
22200 if (I->getOpcode() == unsigned(BinOp)) {
22201 Opnds.push_back(I->getOperand(0));
22202 Opnds.push_back(I->getOperand(1));
22203 // Re-evaluate the number of nodes to be traversed.
22204 e += 2; // 2 more nodes (LHS and RHS) are pushed.
22205 continue;
22206 }
22207
22208 // Quit if a non-EXTRACT_VECTOR_ELT
22209 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22210 return false;
22211
22212 // Quit if without a constant index.
22213 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22214 if (!Idx)
22215 return false;
22216
22217 SDValue Src = I->getOperand(0);
22218 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22219 if (M == SrcOpMap.end()) {
22220 VT = Src.getValueType();
22221 // Quit if not the same type.
22222 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22223 return false;
22224 unsigned NumElts = VT.getVectorNumElements();
22225 APInt EltCount = APInt::getZero(NumElts);
22226 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22227 SrcOps.push_back(Src);
22228 }
22229
22230 // Quit if element already used.
22231 unsigned CIdx = Idx->getZExtValue();
22232 if (M->second[CIdx])
22233 return false;
22234 M->second.setBit(CIdx);
22235 }
22236
22237 if (SrcMask) {
22238 // Collect the source partial masks.
22239 for (SDValue &SrcOp : SrcOps)
22240 SrcMask->push_back(SrcOpMap[SrcOp]);
22241 } else {
22242 // Quit if not all elements are used.
22243 for (const auto &I : SrcOpMap)
22244 if (!I.second.isAllOnes())
22245 return false;
22246 }
22247
22248 return true;
22249}
22250
22251// Helper function for comparing all bits of two vectors.
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
22253 ISD::CondCode CC, const APInt &OriginalMask,
22254 const X86Subtarget &Subtarget,
22255 SelectionDAG &DAG, X86::CondCode &X86CC) {
22256 EVT VT = LHS.getValueType();
22257 unsigned ScalarSize = VT.getScalarSizeInBits();
22258 if (OriginalMask.getBitWidth() != ScalarSize) {
22259 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22260 return SDValue();
22261 }
22262
22263 // Quit if not convertible to a legal scalar or 128/256-bit vector.
22264 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22265 return SDValue();
22266
22267 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
22268 if (VT.isFloatingPoint())
22269 return SDValue();
22270
22271 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22272 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22273
22274 APInt Mask = OriginalMask;
22275
22276 auto MaskBits = [&](SDValue Src) {
22277 if (Mask.isAllOnes())
22278 return Src;
22279 EVT SrcVT = Src.getValueType();
22280 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22281 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22282 };
22283
22284 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22285 if (VT.getSizeInBits() < 128) {
22286 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22287 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
22288 if (IntVT != MVT::i64)
22289 return SDValue();
22290 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
22291 MVT::i32, MVT::i32);
22292 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
22293 MVT::i32, MVT::i32);
22294 SDValue Lo =
22295 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
22296 SDValue Hi =
22297 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
22298 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22299 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
22300 DAG.getConstant(0, DL, MVT::i32));
22301 }
22302 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22303 DAG.getBitcast(IntVT, MaskBits(LHS)),
22304 DAG.getBitcast(IntVT, MaskBits(RHS)));
22305 }
22306
22307 // Without PTEST, a masked v2i64 or-reduction is not faster than
22308 // scalarization.
22309 bool UseKORTEST = Subtarget.useAVX512Regs();
22310 bool UsePTEST = Subtarget.hasSSE41();
22311 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
22312 return SDValue();
22313
22314 // Split down to 128/256/512-bit vector.
22315 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
22316
22317 // If the input vector has vector elements wider than the target test size,
22318 // then cast to <X x i64> so it will safely split.
22319 if (ScalarSize > TestSize) {
22320 if (!Mask.isAllOnes())
22321 return SDValue();
22322 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
22323 LHS = DAG.getBitcast(VT, LHS);
22324 RHS = DAG.getBitcast(VT, RHS);
22325 Mask = APInt::getAllOnes(64);
22326 }
22327
22328 if (VT.getSizeInBits() > TestSize) {
22329 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
22330 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
22331 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
22332 while (VT.getSizeInBits() > TestSize) {
22333 auto Split = DAG.SplitVector(LHS, DL);
22334 VT = Split.first.getValueType();
22335 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22336 }
22337 RHS = DAG.getAllOnesConstant(DL, VT);
22338 } else if (!UsePTEST && !KnownRHS.isZero()) {
22339 // MOVMSK Special Case:
22340 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
22341 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
22342 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
22343 LHS = DAG.getBitcast(VT, MaskBits(LHS));
22344 RHS = DAG.getBitcast(VT, MaskBits(RHS));
22345 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
22346 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
22347 V = DAG.getSExtOrTrunc(V, DL, VT);
22348 while (VT.getSizeInBits() > TestSize) {
22349 auto Split = DAG.SplitVector(V, DL);
22350 VT = Split.first.getValueType();
22351 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22352 }
22353 V = DAG.getNOT(DL, V, VT);
22354 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22355 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22356 DAG.getConstant(0, DL, MVT::i32));
22357 } else {
22358 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
22359 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
22360 while (VT.getSizeInBits() > TestSize) {
22361 auto Split = DAG.SplitVector(V, DL);
22362 VT = Split.first.getValueType();
22363 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22364 }
22365 LHS = V;
22366 RHS = DAG.getConstant(0, DL, VT);
22367 }
22368 }
22369
22370 if (UseKORTEST && VT.is512BitVector()) {
22371 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
22372 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
22373 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22374 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22375 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
22376 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
22377 }
22378
22379 if (UsePTEST) {
22380 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
22381 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22382 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22383 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
22384 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22385 }
22386
22387 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
22388 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
22389 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
22390 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
22391 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
22392 V = DAG.getNOT(DL, V, MaskVT);
22393 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22394 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22395 DAG.getConstant(0, DL, MVT::i32));
22396}
22397
22398// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall
22399// back to CMP(MOVMSK(PCMPEQB(X,Y))).
static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
22401 ISD::CondCode CC, const SDLoc &DL,
22402 const X86Subtarget &Subtarget,
22403 SelectionDAG &DAG,
22404 X86::CondCode &X86CC) {
22405 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22406
22407 bool CmpNull = isNullConstant(RHS);
22408 bool CmpAllOnes = isAllOnesConstant(RHS);
22409 if (!CmpNull && !CmpAllOnes)
22410 return SDValue();
22411
22412 SDValue Op = LHS;
22413 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22414 return SDValue();
22415
22416 // Check whether we're masking/truncating an OR-reduction result, in which
22417 // case track the masked bits.
22418 // TODO: Add CmpAllOnes support.
22419 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
22420 if (CmpNull) {
22421 switch (Op.getOpcode()) {
22422 case ISD::TRUNCATE: {
22423 SDValue Src = Op.getOperand(0);
22424 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22425 Op.getScalarValueSizeInBits());
22426 Op = Src;
22427 break;
22428 }
22429 case ISD::AND: {
22430 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22431 Mask = Cst->getAPIntValue();
22432 Op = Op.getOperand(0);
22433 }
22434 break;
22435 }
22436 }
22437 }
22438
22439 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
22440
22441 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
22442 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
  SmallVector<SDValue, 8> VecIns;
22444 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
22445 EVT VT = VecIns[0].getValueType();
22446 assert(llvm::all_of(VecIns,
22447 [VT](SDValue V) { return VT == V.getValueType(); }) &&
22448 "Reduction source vector mismatch");
22449
22450 // Quit if not splittable to scalar/128/256/512-bit vector.
22451 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22452 return SDValue();
22453
22454 // If more than one full vector is evaluated, AND/OR them first before
22455 // PTEST.
22456 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22457 Slot += 2, e += 1) {
22458 // Each iteration will AND/OR 2 nodes and append the result until there is
22459 // only 1 node left, i.e. the final value of all vectors.
22460 SDValue LHS = VecIns[Slot];
22461 SDValue RHS = VecIns[Slot + 1];
22462 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
22463 }
22464
22465 return LowerVectorAllEqual(DL, VecIns.back(),
22466 CmpNull ? DAG.getConstant(0, DL, VT)
22467 : DAG.getAllOnesConstant(DL, VT),
22468 CC, Mask, Subtarget, DAG, X86CC);
22469 }
22470
22471 // Match icmp(reduce_or(X),0) anyof reduction patterns.
22472 // Match icmp(reduce_and(X),-1) allof reduction patterns.
22473 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22474 ISD::NodeType BinOp;
22475 if (SDValue Match =
22476 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
22477 EVT MatchVT = Match.getValueType();
      return LowerVectorAllEqual(DL, Match,
22479 CmpNull ? DAG.getConstant(0, DL, MatchVT)
22480 : DAG.getAllOnesConstant(DL, MatchVT),
22481 CC, Mask, Subtarget, DAG, X86CC);
22482 }
22483 }
22484
22485 if (Mask.isAllOnes()) {
22486 assert(!Op.getValueType().isVector() &&
22487 "Illegal vector type for reduction pattern");
    SDValue Src = peekThroughBitcasts(Op);
22489 if (Src.getValueType().isFixedLengthVector() &&
22490 Src.getValueType().getScalarType() == MVT::i1) {
22491 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
22492 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
22493 if (Src.getOpcode() == ISD::SETCC) {
22494 SDValue LHS = Src.getOperand(0);
22495 SDValue RHS = Src.getOperand(1);
22496 EVT LHSVT = LHS.getValueType();
22497 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
22498 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
22499 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
22500 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
22501 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
22502 X86CC);
22503 }
22504 }
22505 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
22506 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
22507 // Peek through truncation, mask the LSB and compare against zero/LSB.
22508 if (Src.getOpcode() == ISD::TRUNCATE) {
22509 SDValue Inner = Src.getOperand(0);
22510 EVT InnerVT = Inner.getValueType();
22511 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
22512 unsigned BW = InnerVT.getScalarSizeInBits();
22513 APInt SrcMask = APInt(BW, 1);
22514 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
22515 return LowerVectorAllEqual(DL, Inner,
22516 DAG.getConstant(Cmp, DL, InnerVT), CC,
22517 SrcMask, Subtarget, DAG, X86CC);
22518 }
22519 }
22520 }
22521 }
22522
22523 return SDValue();
22524}
22525
22526/// return true if \c Op has a use that doesn't just read flags.
static bool hasNonFlagsUse(SDValue Op) {
22528 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22529 ++UI) {
22530 SDNode *User = *UI;
22531 unsigned UOpNo = UI.getOperandNo();
22532 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22533 // Look past the truncate.
22534 UOpNo = User->use_begin().getOperandNo();
22535 User = *User->use_begin();
22536 }
22537
22538 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22539 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22540 return true;
22541 }
22542 return false;
22543}
22544
22545// Transform to an x86-specific ALU node with flags if there is a chance of
22546// using an RMW op or only the flags are used. Otherwise, leave
22547// the node alone and emit a 'cmp' or 'test' instruction.
static bool isProfitableToUseFlagOp(SDValue Op) {
22549 for (SDNode *U : Op->uses())
22550 if (U->getOpcode() != ISD::CopyToReg &&
22551 U->getOpcode() != ISD::SETCC &&
22552 U->getOpcode() != ISD::STORE)
22553 return false;
22554
22555 return true;
22556}
22557
22558/// Emit nodes that will be selected as "test Op0,Op0", or something
22559/// equivalent.
22560static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22561 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22562 // CF and OF aren't always set the way we want. Determine which
22563 // of these we need.
22564 bool NeedCF = false;
22565 bool NeedOF = false;
22566 switch (X86CC) {
22567 default: break;
22568 case X86::COND_A: case X86::COND_AE:
22569 case X86::COND_B: case X86::COND_BE:
22570 NeedCF = true;
22571 break;
22572 case X86::COND_G: case X86::COND_GE:
22573 case X86::COND_L: case X86::COND_LE:
22574 case X86::COND_O: case X86::COND_NO: {
22575 // Check if we really need to set the
22576 // Overflow flag. If NoSignedWrap is present
22577 // that is not actually needed.
22578 switch (Op->getOpcode()) {
22579 case ISD::ADD:
22580 case ISD::SUB:
22581 case ISD::MUL:
22582 case ISD::SHL:
22583 if (Op.getNode()->getFlags().hasNoSignedWrap())
22584 break;
22585 [[fallthrough]];
22586 default:
22587 NeedOF = true;
22588 break;
22589 }
22590 break;
22591 }
22592 }
22593 // See if we can use the EFLAGS value from the operand instead of
22594 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22595 // we prove that the arithmetic won't overflow, we can't use OF or CF.
22596 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22597 // Emit a CMP with 0, which is the TEST pattern.
22598 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22599 DAG.getConstant(0, dl, Op.getValueType()));
22600 }
22601 unsigned Opcode = 0;
22602 unsigned NumOperands = 0;
22603
22604 SDValue ArithOp = Op;
22605
22606 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
22607 // which may be the result of a CAST. We use the variable 'Op', which is the
22608 // non-casted variable when we check for possible users.
22609 switch (ArithOp.getOpcode()) {
22610 case ISD::AND:
22611 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22612 // because a TEST instruction will be better.
22613 if (!hasNonFlagsUse(Op))
22614 break;
22615
22616 [[fallthrough]];
22617 case ISD::ADD:
22618 case ISD::SUB:
22619 case ISD::OR:
22620 case ISD::XOR:
    if (!isProfitableToUseFlagOp(Op))
22622 break;
22623
22624 // Otherwise use a regular EFLAGS-setting instruction.
22625 switch (ArithOp.getOpcode()) {
22626 // clang-format off
22627 default: llvm_unreachable("unexpected operator!");
22628 case ISD::ADD: Opcode = X86ISD::ADD; break;
22629 case ISD::SUB: Opcode = X86ISD::SUB; break;
22630 case ISD::XOR: Opcode = X86ISD::XOR; break;
22631 case ISD::AND: Opcode = X86ISD::AND; break;
22632 case ISD::OR: Opcode = X86ISD::OR; break;
22633 // clang-format on
22634 }
22635
22636 NumOperands = 2;
22637 break;
22638 case X86ISD::ADD:
22639 case X86ISD::SUB:
22640 case X86ISD::OR:
22641 case X86ISD::XOR:
22642 case X86ISD::AND:
22643 return SDValue(Op.getNode(), 1);
22644 case ISD::SSUBO:
22645 case ISD::USUBO: {
22646 // SSUBO/USUBO will become an X86ISD::SUB and we can use its Z flag.
22647 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22648 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22649 Op->getOperand(1)).getValue(1);
22650 }
22651 default:
22652 break;
22653 }
22654
22655 if (Opcode == 0) {
22656 // Emit a CMP with 0, which is the TEST pattern.
22657 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22658 DAG.getConstant(0, dl, Op.getValueType()));
22659 }
22660 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22661 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22662
22663 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22664 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22665 return SDValue(New.getNode(), 1);
22666}
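// Illustrative sketch (not part of the original code): for
//   (setcc (and %x, %y), 0, eq)
// where the AND result has no non-flag uses, EmitTest emits
// X86ISD::CMP(and, 0), which instruction selection is expected to fold into a
// single 'test' of %x against %y with no separate AND result materialized.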
22667
22668/// Emit nodes that will be selected as "cmp Op0,Op1", or something
22669/// equivalent.
22670static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22671 const SDLoc &dl, SelectionDAG &DAG,
22672 const X86Subtarget &Subtarget) {
22673 if (isNullConstant(Op1))
22674 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22675
22676 EVT CmpVT = Op0.getValueType();
22677
22678 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
22679 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
22680
22681 // Only promote the compare up to i32 if it is a 16-bit operation
22682 // with an immediate, since 16-bit immediates are to be avoided.
22683 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
      !DAG.getMachineFunction().getFunction().hasMinSize()) {
22685 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
22686 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
22687 // Don't do this if the immediate can fit in 8-bits.
22688 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22689 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22690 unsigned ExtendOp =
          isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22692 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
22693 // For equality comparisons try to use SIGN_EXTEND if the input was
22694 // truncate from something with enough sign bits.
22695 if (Op0.getOpcode() == ISD::TRUNCATE) {
22696 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
22697 ExtendOp = ISD::SIGN_EXTEND;
22698 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
22699 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
22700 ExtendOp = ISD::SIGN_EXTEND;
22701 }
22702 }
22703
22704 CmpVT = MVT::i32;
22705 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
22706 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
22707 }
22708 }
22709
22710 // Try to shrink i64 compares if the input has enough zero bits.
22711 // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)?
22712 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
22713 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
22714 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
22715 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
22716 CmpVT = MVT::i32;
22717 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
22718 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
22719 }
22720
22721 // 0-x == y --> x+y == 0
22722 // 0-x != y --> x+y != 0
22723 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
22724 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22725 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22726 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
22727 return Add.getValue(1);
22728 }
22729
22730 // x == 0-y --> x+y == 0
22731 // x != 0-y --> x+y != 0
22732 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
22733 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22734 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22735 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
22736 return Add.getValue(1);
22737 }
22738
22739 // Use SUB instead of CMP to enable CSE between SUB and CMP.
22740 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22741 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
22742 return Sub.getValue(1);
22743}
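// Illustrative sketch (not part of the original code): the negation folds
// above mean that an equality test like
//   (seteq (sub 0, %x), %y)
// reuses the flags of (X86ISD::ADD %x, %y), so a 'neg + cmp' pair is expected
// to become a single flag-producing 'add' (0 - x == y holds iff x + y == 0).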
22744
bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
22746 EVT VT) const {
22747 return !VT.isVector() || Cond != ISD::CondCode::SETEQ;
22748}
22749
22750bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
22751 SDNode *N, SDValue, SDValue IntPow2) const {
22752 if (N->getOpcode() == ISD::FDIV)
22753 return true;
22754
22755 EVT FPVT = N->getValueType(0);
22756 EVT IntVT = IntPow2.getValueType();
22757
22758 // This indicates a non-free bitcast.
22759 // TODO: This is probably overly conservative as we will need to scale the
22760 // integer vector anyways for the int->fp cast.
22761 if (FPVT.isVector() &&
22762 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
22763 return false;
22764
22765 return true;
22766}
22767
22768/// Check if replacement of SQRT with RSQRT should be disabled.
22769bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22770 EVT VT = Op.getValueType();
22771
22772 // We don't need to replace SQRT with RSQRT for half type.
22773 if (VT.getScalarType() == MVT::f16)
22774 return true;
22775
22776 // We never want to use both SQRT and RSQRT instructions for the same input.
22777 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22778 return false;
22779
22780 if (VT.isVector())
22781 return Subtarget.hasFastVectorFSQRT();
22782 return Subtarget.hasFastScalarFSQRT();
22783}
22784
22785/// The minimum architected relative accuracy is 2^-12. We need one
22786/// Newton-Raphson step to have a good float result (24 bits of precision).
22787SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22788 SelectionDAG &DAG, int Enabled,
22789 int &RefinementSteps,
22790 bool &UseOneConstNR,
22791 bool Reciprocal) const {
22792 SDLoc DL(Op);
22793 EVT VT = Op.getValueType();
22794
22795 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22796 // It is likely not profitable to do this for f64 because a double-precision
22797 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22798 // instructions: convert to single, rsqrtss, convert back to double, refine
22799 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22800 // along with FMA, this could be a throughput win.
22801 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22802 // after legalize types.
22803 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22804 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22805 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22806 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22807 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22808 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22809 RefinementSteps = 1;
22810
22811 UseOneConstNR = false;
22812 // There is no FSQRT for 512-bits, but there is RSQRT14.
22813 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22814 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
22815 if (RefinementSteps == 0 && !Reciprocal)
22816 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
22817 return Estimate;
22818 }
22819
22820 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22821 Subtarget.hasFP16()) {
22822 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
22823 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22824 RefinementSteps = 0;
22825
22826 if (VT == MVT::f16) {
22827 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22828 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22829 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22830 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
22831 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22832 }
22833
22834 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
22835 }
22836 return SDValue();
22837}
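// For reference (illustrative, not part of the original code): the generic
// estimate machinery refines the hardware RSQRT result with the standard
// Newton-Raphson step for 1/sqrt(a),
//   E1 = E0 * (1.5 - 0.5 * a * E0 * E0)
// and, as noted above, one such step is enough to go from the ~12-bit
// architected estimate to roughly full float precision.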
22838
22839/// The minimum architected relative accuracy is 2^-12. We need one
22840/// Newton-Raphson step to have a good float result (24 bits of precision).
22841SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22842 int Enabled,
22843 int &RefinementSteps) const {
22844 SDLoc DL(Op);
22845 EVT VT = Op.getValueType();
22846
22847 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22848 // It is likely not profitable to do this for f64 because a double-precision
22849 // reciprocal estimate with refinement on x86 prior to FMA requires
22850 // 15 instructions: convert to single, rcpss, convert back to double, refine
22851 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22852 // along with FMA, this could be a throughput win.
22853
22854 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22855 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22856 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22857 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22858 // Enable estimate codegen with 1 refinement step for vector division.
22859 // Scalar division estimates are disabled because they break too much
22860 // real-world code. These defaults are intended to match GCC behavior.
22861 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22862 return SDValue();
22863
22864 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22865 RefinementSteps = 1;
22866
22867 // There is no FSQRT for 512-bits, but there is RCP14.
22868 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22869 return DAG.getNode(Opcode, DL, VT, Op);
22870 }
22871
22872 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22873 Subtarget.hasFP16()) {
22874 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22875 RefinementSteps = 0;
22876
22877 if (VT == MVT::f16) {
22878 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22879 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22880 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22881 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
22882 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22883 }
22884
22885 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
22886 }
22887 return SDValue();
22888}
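// For reference (illustrative, not part of the original code): the RCP
// estimate is refined with the Newton-Raphson step for 1/a,
//   E1 = E0 * (2.0 - a * E0)
// which likewise roughly doubles the number of correct bits per step.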
22889
22890/// If we have at least two divisions that use the same divisor, convert to
22891/// multiplication by a reciprocal. This may need to be adjusted for a given
22892/// CPU if a division's cost is not at least twice the cost of a multiplication.
22893/// This is because we still need one division to calculate the reciprocal and
22894/// then we need two multiplies by that reciprocal as replacements for the
22895/// original divisions.
22896unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22897 return 2;
22898}
22899
22900SDValue
22901X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
22902 SelectionDAG &DAG,
22903 SmallVectorImpl<SDNode *> &Created) const {
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
22905 if (isIntDivCheap(N->getValueType(0), Attr))
22906 return SDValue(N, 0); // Lower SDIV as SDIV
22907
22908 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
22909 "Unexpected divisor!");
22910
22911 // Only perform this transform if CMOV is supported otherwise the select
22912 // below will become a branch.
22913 if (!Subtarget.canUseCMOV())
22914 return SDValue();
22915
22916 // fold (sdiv X, pow2)
22917 EVT VT = N->getValueType(0);
22918 // FIXME: Support i8.
22919 if (VT != MVT::i16 && VT != MVT::i32 &&
22920 !(Subtarget.is64Bit() && VT == MVT::i64))
22921 return SDValue();
22922
22923 // If the divisor is 2 or -2, the default expansion is better.
22924 if (Divisor == 2 ||
22925 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
22926 return SDValue();
22927
22928 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
22929}
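// Illustrative sketch (not part of the original code): for 'sdiv i32 %x, 8'
// the CMOV-based expansion built by buildSDIVPow2WithCMov is expected to look
// roughly like
//   lea    7(%x), %t        ; x + (8 - 1), the bias needed when x is negative
//   test   %x, %x
//   cmovns %x, %t           ; keep x itself when it is non-negative
//   sar    $3, %t           ; arithmetic shift yields the round-to-zero quotient
// with an extra negation when the divisor is a negated power of two.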
22930
22931/// Result of 'and' is compared against zero. Change to a BT node if possible.
22932/// Returns the BT node and the condition code needed to use it.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
22934 SelectionDAG &DAG, X86::CondCode &X86CC) {
22935 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
22936 SDValue Op0 = And.getOperand(0);
22937 SDValue Op1 = And.getOperand(1);
22938 if (Op0.getOpcode() == ISD::TRUNCATE)
22939 Op0 = Op0.getOperand(0);
22940 if (Op1.getOpcode() == ISD::TRUNCATE)
22941 Op1 = Op1.getOperand(0);
22942
22943 SDValue Src, BitNo;
22944 if (Op1.getOpcode() == ISD::SHL)
22945 std::swap(Op0, Op1);
22946 if (Op0.getOpcode() == ISD::SHL) {
22947 if (isOneConstant(Op0.getOperand(0))) {
22948 // If we looked past a truncate, check that it's only truncating away
22949 // known zeros.
22950 unsigned BitWidth = Op0.getValueSizeInBits();
22951 unsigned AndBitWidth = And.getValueSizeInBits();
22952 if (BitWidth > AndBitWidth) {
22953 KnownBits Known = DAG.computeKnownBits(Op0);
22954 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
22955 return SDValue();
22956 }
22957 Src = Op1;
22958 BitNo = Op0.getOperand(1);
22959 }
22960 } else if (Op1.getOpcode() == ISD::Constant) {
22961 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
22962 uint64_t AndRHSVal = AndRHS->getZExtValue();
22963 SDValue AndLHS = Op0;
22964
22965 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
22966 Src = AndLHS.getOperand(0);
22967 BitNo = AndLHS.getOperand(1);
22968 } else {
22969 // Use BT if the immediate can't be encoded in a TEST instruction or we
22970 // are optimizing for size and the immediate won't fit in a byte.
22971 bool OptForSize = DAG.shouldOptForSize();
22972 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
22973 isPowerOf2_64(AndRHSVal)) {
22974 Src = AndLHS;
22975 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
22976 Src.getValueType());
22977 }
22978 }
22979 }
22980
22981 // No patterns found, give up.
22982 if (!Src.getNode())
22983 return SDValue();
22984
22985 // Remove any bit flip.
22986 if (isBitwiseNot(Src)) {
22987 Src = Src.getOperand(0);
    CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
22989 }
22990
22991 // Attempt to create the X86ISD::BT node.
22992 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
22993 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
22994 return BT;
22995 }
22996
22997 return SDValue();
22998}
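// Illustrative sketch (not part of the original code): for a pattern like
//   (setcc (and (srl %x, %n), 1), 0, ne)
// LowerAndToBT produces (X86ISD::BT %x, %n) and reports X86::COND_B, so the
// final code is expected to be roughly 'bt %ecx, %eax; setb %al' rather than
// a shift, mask and compare sequence.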
22999
23000// Check if pre-AVX condcode can be performed by a single FCMP op.
23001static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23002 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23003}
23004
23005/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23006/// CMPs.
23007static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23008 SDValue &Op1, bool &IsAlwaysSignaling) {
23009 unsigned SSECC;
23010 bool Swap = false;
23011
23012 // SSE Condition code mapping:
23013 // 0 - EQ
23014 // 1 - LT
23015 // 2 - LE
23016 // 3 - UNORD
23017 // 4 - NEQ
23018 // 5 - NLT
23019 // 6 - NLE
23020 // 7 - ORD
23021 switch (SetCCOpcode) {
23022 // clang-format off
23023 default: llvm_unreachable("Unexpected SETCC condition");
23024 case ISD::SETOEQ:
23025 case ISD::SETEQ: SSECC = 0; break;
23026 case ISD::SETOGT:
23027 case ISD::SETGT: Swap = true; [[fallthrough]];
23028 case ISD::SETLT:
23029 case ISD::SETOLT: SSECC = 1; break;
23030 case ISD::SETOGE:
23031 case ISD::SETGE: Swap = true; [[fallthrough]];
23032 case ISD::SETLE:
23033 case ISD::SETOLE: SSECC = 2; break;
23034 case ISD::SETUO: SSECC = 3; break;
23035 case ISD::SETUNE:
23036 case ISD::SETNE: SSECC = 4; break;
23037 case ISD::SETULE: Swap = true; [[fallthrough]];
23038 case ISD::SETUGE: SSECC = 5; break;
23039 case ISD::SETULT: Swap = true; [[fallthrough]];
23040 case ISD::SETUGT: SSECC = 6; break;
23041 case ISD::SETO: SSECC = 7; break;
23042 case ISD::SETUEQ: SSECC = 8; break;
23043 case ISD::SETONE: SSECC = 12; break;
23044 // clang-format on
23045 }
23046 if (Swap)
23047 std::swap(Op0, Op1);
23048
23049 switch (SetCCOpcode) {
23050 default:
23051 IsAlwaysSignaling = true;
23052 break;
23053 case ISD::SETEQ:
23054 case ISD::SETOEQ:
23055 case ISD::SETUEQ:
23056 case ISD::SETNE:
23057 case ISD::SETONE:
23058 case ISD::SETUNE:
23059 case ISD::SETO:
23060 case ISD::SETUO:
23061 IsAlwaysSignaling = false;
23062 break;
23063 }
23064
23065 return SSECC;
23066}
23067
23068/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
23069/// concatenate the result back.
23070 static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
23071 ISD::CondCode Cond, SelectionDAG &DAG,
23072 const SDLoc &dl) {
23073 assert(VT.isInteger() && VT == LHS.getValueType() &&
23074 VT == RHS.getValueType() && "Unsupported VTs!");
23075
23076 SDValue CC = DAG.getCondCode(Cond);
23077
23078 // Extract the LHS Lo/Hi vectors
23079 SDValue LHS1, LHS2;
23080 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23081
23082 // Extract the RHS Lo/Hi vectors
23083 SDValue RHS1, RHS2;
23084 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23085
23086 // Issue the operation on the smaller types and concatenate the result back
23087 EVT LoVT, HiVT;
23088 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23089 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23090 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23091 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23092}
23093
23094 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
23095
23096 SDValue Op0 = Op.getOperand(0);
23097 SDValue Op1 = Op.getOperand(1);
23098 SDValue CC = Op.getOperand(2);
23099 MVT VT = Op.getSimpleValueType();
23100 SDLoc dl(Op);
23101
23102 assert(VT.getVectorElementType() == MVT::i1 &&
23103 "Cannot set masked compare for this operation");
23104
23105 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23106
23107 // Prefer SETGT over SETLT.
23108 if (SetCCOpcode == ISD::SETLT) {
23109 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23110 std::swap(Op0, Op1);
23111 }
23112
23113 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23114}
23115
23116/// Given a buildvector constant, return a new vector constant with each element
23117/// incremented or decremented. If incrementing or decrementing would result in
23118/// unsigned overflow or underflow or this is not a simple vector constant,
23119/// return an empty value.
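// For example, <i32 1, 2, 3, 4> with IsInc=true becomes <i32 2, 3, 4, 5>;
// a lane holding UINT_MAX (for inc) or 0 (for dec), or INT_MAX/INT_MIN when
// NSW is set, makes the whole call return an empty SDValue.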
23120 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23121 bool NSW) {
23122 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23123 if (!BV || !V.getValueType().isSimple())
23124 return SDValue();
23125
23126 MVT VT = V.getSimpleValueType();
23127 MVT EltVT = VT.getVectorElementType();
23128 unsigned NumElts = VT.getVectorNumElements();
23129 SmallVector<SDValue, 8> NewVecC;
23130 SDLoc DL(V);
23131 for (unsigned i = 0; i < NumElts; ++i) {
23132 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23133 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23134 return SDValue();
23135
23136 // Avoid overflow/underflow.
23137 const APInt &EltC = Elt->getAPIntValue();
23138 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23139 return SDValue();
23140 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23141 (!IsInc && EltC.isMinSignedValue())))
23142 return SDValue();
23143
23144 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23145 }
23146
23147 return DAG.getBuildVector(VT, DL, NewVecC);
23148}
23149
23150/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23151/// Op0 u<= Op1:
23152/// t = psubus Op0, Op1
23153/// pcmpeq t, <0..0>
23154 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23155 ISD::CondCode Cond, const SDLoc &dl,
23156 const X86Subtarget &Subtarget,
23157 SelectionDAG &DAG) {
23158 if (!Subtarget.hasSSE2())
23159 return SDValue();
23160
23161 MVT VET = VT.getVectorElementType();
23162 if (VET != MVT::i8 && VET != MVT::i16)
23163 return SDValue();
23164
23165 switch (Cond) {
23166 default:
23167 return SDValue();
23168 case ISD::SETULT: {
23169 // If the comparison is against a constant we can turn this into a
23170 // setule. With psubus, setule does not require a swap. This is
23171 // beneficial because the constant in the register is no longer
23172 // clobbered as the destination, so it can be hoisted out of a loop.
23173 // Only do this pre-AVX since vpcmp* is no longer destructive.
23174 if (Subtarget.hasAVX())
23175 return SDValue();
23176 SDValue ULEOp1 =
23177 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
23178 if (!ULEOp1)
23179 return SDValue();
23180 Op1 = ULEOp1;
23181 break;
23182 }
23183 case ISD::SETUGT: {
23184 // If the comparison is against a constant, we can turn this into a setuge.
23185 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23186 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23187 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23188 SDValue UGEOp1 =
23189 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
23190 if (!UGEOp1)
23191 return SDValue();
23192 Op1 = Op0;
23193 Op0 = UGEOp1;
23194 break;
23195 }
23196 // Psubus is better than flip-sign because it requires no inversion.
23197 case ISD::SETUGE:
23198 std::swap(Op0, Op1);
23199 break;
23200 case ISD::SETULE:
23201 break;
23202 }
23203
23204 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23205 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23206 DAG.getConstant(0, dl, VT));
23207}
23208
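// Overview of the vector SETCC lowering below: FP compares use CMPP (or CMPM
// for AVX-512 mask results), vXi1 integer results go through
// LowerIntVSETCC_AVX512, 128-bit XOP targets use the PCOM comparisons, and
// the remaining integer cases are built from PCMPEQ/PCMPGT plus sign-bit
// flips, unsigned min/max, or the PSUBUS pattern above.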
23209static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23210 SelectionDAG &DAG) {
23211 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23212 Op.getOpcode() == ISD::STRICT_FSETCCS;
23213 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23214 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23215 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23216 MVT VT = Op->getSimpleValueType(0);
23217 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23218 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23219 SDLoc dl(Op);
23220
23221 if (isFP) {
23222 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23223 assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
23224 if (isSoftF16(EltVT, Subtarget))
23225 return SDValue();
23226
23227 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23228 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23229
23230 // If we have a strict compare with a vXi1 result and the input is 128/256
23231 // bits we can't use a masked compare unless we have VLX. If we use a wider
23232 // compare like we do for non-strict, we might trigger spurious exceptions
23233 // from the upper elements. Instead emit an AVX compare and convert to a mask.
23234 unsigned Opc;
23235 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23236 (!IsStrict || Subtarget.hasVLX() ||
23237 Op0.getSimpleValueType().is512BitVector())) {
23238#ifndef NDEBUG
23239 unsigned Num = VT.getVectorNumElements();
23240 assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
23241#endif
23242 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23243 } else {
23244 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23245 // The SSE/AVX packed FP comparison nodes are defined with a
23246 // floating-point vector result that matches the operand type. This allows
23247 // them to work with an SSE1 target (integer vector types are not legal).
23248 VT = Op0.getSimpleValueType();
23249 }
23250
23251 SDValue Cmp;
23252 bool IsAlwaysSignaling;
23253 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23254 if (!Subtarget.hasAVX()) {
23255 // TODO: We could use the following steps to handle a quiet compare with
23256 // signaling encodings.
23257 // 1. Get ordered masks from a quiet ISD::SETO
23258 // 2. Use the masks to mask potential unordered elements in operand A, B
23259 // 3. Get the compare results of masked A, B
23260 // 4. Calculate the final result using the mask and the result from step 3
23261 // But currently, we just fall back to scalar operations.
23262 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23263 return SDValue();
23264
23265 // Insert an extra signaling instruction to raise exception.
23266 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23267 SDValue SignalCmp = DAG.getNode(
23268 Opc, dl, {VT, MVT::Other},
23269 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23270 // FIXME: It seems we need to update the flags of all new strict nodes.
23271 // Otherwise, mayRaiseFPException in MI will return false due to
23272 // NoFPExcept = false by default. However, I didn't find it in other
23273 // patches.
23274 SignalCmp->setFlags(Op->getFlags());
23275 Chain = SignalCmp.getValue(1);
23276 }
23277
23278 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23279 // emit two comparisons and a logic op to tie them together.
23280 if (!cheapX86FSETCC_SSE(Cond)) {
23281 // LLVM predicate is SETUEQ or SETONE.
23282 unsigned CC0, CC1;
23283 unsigned CombineOpc;
23284 if (Cond == ISD::SETUEQ) {
23285 CC0 = 3; // UNORD
23286 CC1 = 0; // EQ
23287 CombineOpc = X86ISD::FOR;
23288 } else {
23290 CC0 = 7; // ORD
23291 CC1 = 4; // NEQ
23292 CombineOpc = X86ISD::FAND;
23293 }
23294
23295 SDValue Cmp0, Cmp1;
23296 if (IsStrict) {
23297 Cmp0 = DAG.getNode(
23298 Opc, dl, {VT, MVT::Other},
23299 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23300 Cmp1 = DAG.getNode(
23301 Opc, dl, {VT, MVT::Other},
23302 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23303 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23304 Cmp1.getValue(1));
23305 } else {
23306 Cmp0 = DAG.getNode(
23307 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23308 Cmp1 = DAG.getNode(
23309 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23310 }
23311 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23312 } else {
23313 if (IsStrict) {
23314 Cmp = DAG.getNode(
23315 Opc, dl, {VT, MVT::Other},
23316 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23317 Chain = Cmp.getValue(1);
23318 } else
23319 Cmp = DAG.getNode(
23320 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23321 }
23322 } else {
23323 // Handle all other FP comparisons here.
23324 if (IsStrict) {
23325 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
23326 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23327 Cmp = DAG.getNode(
23328 Opc, dl, {VT, MVT::Other},
23329 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23330 Chain = Cmp.getValue(1);
23331 } else
23332 Cmp = DAG.getNode(
23333 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23334 }
23335
23336 if (VT.getFixedSizeInBits() >
23337 Op.getSimpleValueType().getFixedSizeInBits()) {
23338 // We emitted a compare with an XMM/YMM result. Finish converting to a
23339 // mask register using a vptestm.
23341 Cmp = DAG.getBitcast(CastVT, Cmp);
23342 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23343 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23344 } else {
23345 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23346 // the result type of SETCC. The bitcast is expected to be optimized
23347 // away during combining/isel.
23348 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23349 }
23350
23351 if (IsStrict)
23352 return DAG.getMergeValues({Cmp, Chain}, dl);
23353
23354 return Cmp;
23355 }
23356
23357 assert(!IsStrict && "Strict SETCC only handles FP operands.");
23358
23359 MVT VTOp0 = Op0.getSimpleValueType();
23360 (void)VTOp0;
23361 assert(VTOp0 == Op1.getSimpleValueType() &&
23362 "Expected operands with same type!");
23363 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23364 "Invalid number of packed elements for source and destination!");
23365
23366 // The non-AVX512 code below works under the assumption that source and
23367 // destination types are the same.
23368 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23369 "Value types for source and destination must be the same!");
23370
23371 // The result is boolean, but operands are int/float
23372 if (VT.getVectorElementType() == MVT::i1) {
23373 // In the AVX-512 architecture setcc returns a mask with i1 elements,
23374 // but there is no compare instruction for i8 and i16 elements on KNL.
23375 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23376 "Unexpected operand type");
23377 return LowerIntVSETCC_AVX512(Op, DAG);
23378 }
23379
23380 // Lower using XOP integer comparisons.
23381 if (VT.is128BitVector() && Subtarget.hasXOP()) {
23382 // Translate compare code to XOP PCOM compare mode.
23383 unsigned CmpMode = 0;
23384 switch (Cond) {
23385 // clang-format off
23386 default: llvm_unreachable("Unexpected SETCC condition");
23387 case ISD::SETULT:
23388 case ISD::SETLT: CmpMode = 0x00; break;
23389 case ISD::SETULE:
23390 case ISD::SETLE: CmpMode = 0x01; break;
23391 case ISD::SETUGT:
23392 case ISD::SETGT: CmpMode = 0x02; break;
23393 case ISD::SETUGE:
23394 case ISD::SETGE: CmpMode = 0x03; break;
23395 case ISD::SETEQ: CmpMode = 0x04; break;
23396 case ISD::SETNE: CmpMode = 0x05; break;
23397 // clang-format on
23398 }
23399
23400 // Are we comparing unsigned or signed integers?
23401 unsigned Opc =
23402 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23403
23404 return DAG.getNode(Opc, dl, VT, Op0, Op1,
23405 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23406 }
23407
23408 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23409 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23410 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23411 SDValue BC0 = peekThroughBitcasts(Op0);
23412 if (BC0.getOpcode() == ISD::AND) {
23413 APInt UndefElts;
23414 SmallVector<APInt, 64> EltBits;
23415 if (getTargetConstantBitsFromNode(
23416 BC0.getOperand(1), VT.getScalarSizeInBits(), UndefElts, EltBits,
23417 /*AllowWholeUndefs*/ false, /*AllowPartialUndefs*/ false)) {
23418 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23419 Cond = ISD::SETEQ;
23420 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23421 }
23422 }
23423 }
23424 }
23425
23426 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23427 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23428 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23429 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23430 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23431 unsigned BitWidth = VT.getScalarSizeInBits();
23432 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23433
23434 SDValue Result = Op0.getOperand(0);
23435 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23436 DAG.getConstant(ShiftAmt, dl, VT));
23437 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23438 DAG.getConstant(BitWidth - 1, dl, VT));
23439 return Result;
23440 }
23441 }
23442
23443 // Break 256-bit integer vector compare into smaller ones.
23444 if (VT.is256BitVector() && !Subtarget.hasInt256())
23445 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23446
23447 // Break 512-bit integer vector compare into smaller ones.
23448 // TODO: Try harder to use VPCMPx + VPMOV2x?
23449 if (VT.is512BitVector())
23450 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23451
23452 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23453 // not-of-PCMPEQ:
23454 // X != INT_MIN --> X >s INT_MIN
23455 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23456 // +X != 0 --> +X >s 0
23457 APInt ConstValue;
23458 if (Cond == ISD::SETNE &&
23459 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23460 if (ConstValue.isMinSignedValue())
23461 Cond = ISD::SETGT;
23462 else if (ConstValue.isMaxSignedValue())
23463 Cond = ISD::SETLT;
23464 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
23465 Cond = ISD::SETGT;
23466 }
23467
23468 // If both operands are known non-negative, then an unsigned compare is the
23469 // same as a signed compare and there's no need to flip signbits.
23470 // TODO: We could check for more general simplifications here since we're
23471 // computing known bits.
23472 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23473 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23474
23475 // Special case: Use min/max operations for unsigned compares.
23476 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23477 if (ISD::isUnsignedIntSetCC(Cond) &&
23478 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23479 TLI.isOperationLegal(ISD::UMIN, VT)) {
23480 // If we have a constant operand, increment/decrement it and change the
23481 // condition to avoid an invert.
23482 if (Cond == ISD::SETUGT) {
23483 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23484 if (SDValue UGTOp1 =
23485 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
23486 Op1 = UGTOp1;
23487 Cond = ISD::SETUGE;
23488 }
23489 }
23490 if (Cond == ISD::SETULT) {
23491 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23492 if (SDValue ULTOp1 =
23493 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
23494 Op1 = ULTOp1;
23495 Cond = ISD::SETULE;
23496 }
23497 }
23498 bool Invert = false;
23499 unsigned Opc;
23500 switch (Cond) {
23501 // clang-format off
23502 default: llvm_unreachable("Unexpected condition code");
23503 case ISD::SETUGT: Invert = true; [[fallthrough]];
23504 case ISD::SETULE: Opc = ISD::UMIN; break;
23505 case ISD::SETULT: Invert = true; [[fallthrough]];
23506 case ISD::SETUGE: Opc = ISD::UMAX; break;
23507 // clang-format on
23508 }
23509
23510 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23511 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23512
23513 // If the logical-not of the result is required, perform that now.
23514 if (Invert)
23515 Result = DAG.getNOT(dl, Result, VT);
23516
23517 return Result;
23518 }
23519
23520 // Try to use SUBUS and PCMPEQ.
23521 if (FlipSigns)
23522 if (SDValue V =
23523 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23524 return V;
23525
23526 // We are handling one of the integer comparisons here. Since SSE only has
23527 // GT and EQ comparisons for integers, swapping operands and multiple
23528 // operations may be required for some comparisons.
23529 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23530 : X86ISD::PCMPGT;
23531 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23532 Cond == ISD::SETGE || Cond == ISD::SETUGE;
23533 bool Invert = Cond == ISD::SETNE ||
23534 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23535
23536 if (Swap)
23537 std::swap(Op0, Op1);
23538
23539 // Check that the operation in question is available (most are plain SSE2,
23540 // but PCMPGTQ and PCMPEQQ have different requirements).
23541 if (VT == MVT::v2i64) {
23542 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23543 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23544
23545 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23546 // the odd elements over the even elements.
23547 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23548 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23549 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23550
23551 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23552 static const int MaskHi[] = { 1, 1, 3, 3 };
23553 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23554
23555 return DAG.getBitcast(VT, Result);
23556 }
23557
23558 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23559 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23560 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23561
23562 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23563 static const int MaskHi[] = { 1, 1, 3, 3 };
23564 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23565
23566 return DAG.getBitcast(VT, Result);
23567 }
23568
23569 // If the i64 elements are sign-extended enough to be representable as i32
23570 // then we can compare the lower i32 bits and splat.
23571 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
23572 DAG.ComputeNumSignBits(Op1) > 32) {
23573 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23574 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23575
23576 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23577 static const int MaskLo[] = {0, 0, 2, 2};
23578 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23579
23580 return DAG.getBitcast(VT, Result);
23581 }
23582
23583 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23584 // bits of the inputs before performing those operations. The lower
23585 // compare is always unsigned.
23586 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
23587 : 0x0000000080000000ULL,
23588 dl, MVT::v2i64);
23589
23590 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23591 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23592
23593 // Cast everything to the right type.
23594 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23595 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23596
23597 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23598 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23599 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23600
23601 // Create masks for only the low parts/high parts of the 64 bit integers.
23602 static const int MaskHi[] = { 1, 1, 3, 3 };
23603 static const int MaskLo[] = { 0, 0, 2, 2 };
23604 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23605 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23606 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23607
23608 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23609 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23610
23611 if (Invert)
23612 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23613
23614 return DAG.getBitcast(VT, Result);
23615 }
23616
23617 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
23618 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
23619 // pcmpeqd + pshufd + pand.
23620 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23621
23622 // First cast everything to the right type.
23623 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23624 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23625
23626 // Do the compare.
23627 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23628
23629 // Make sure the lower and upper halves are both all-ones.
23630 static const int Mask[] = { 1, 0, 3, 2 };
23631 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23632 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23633
23634 if (Invert)
23635 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23636
23637 return DAG.getBitcast(VT, Result);
23638 }
23639 }
23640
23641 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23642 // bits of the inputs before performing those operations.
23643 if (FlipSigns) {
23644 MVT EltVT = VT.getVectorElementType();
23645 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23646 VT);
23647 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23648 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23649 }
23650
23651 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23652
23653 // If the logical-not of the result is required, perform that now.
23654 if (Invert)
23655 Result = DAG.getNOT(dl, Result, VT);
23656
23657 return Result;
23658}
23659
23660// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
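// For example, (seteq (bitcast (v16i1 X)), 0) becomes KORTEST X, X with
// COND_E, while an all-ones RHS tests the carry flag (COND_B/COND_AE)
// instead.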
23661 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23662 const SDLoc &dl, SelectionDAG &DAG,
23663 const X86Subtarget &Subtarget,
23664 SDValue &X86CC) {
23665 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23666
23667 // Must be a bitcast from vXi1.
23668 if (Op0.getOpcode() != ISD::BITCAST)
23669 return SDValue();
23670
23671 Op0 = Op0.getOperand(0);
23672 MVT VT = Op0.getSimpleValueType();
23673 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23674 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23675 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23676 return SDValue();
23677
23678 X86::CondCode X86Cond;
23679 if (isNullConstant(Op1)) {
23680 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23681 } else if (isAllOnesConstant(Op1)) {
23682 // C flag is set for all ones.
23683 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23684 } else
23685 return SDValue();
23686
23687 // If the input is an AND, we can combine its operands into the KTEST.
23688 bool KTestable = false;
23689 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23690 KTestable = true;
23691 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23692 KTestable = true;
23693 if (!isNullConstant(Op1))
23694 KTestable = false;
23695 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23696 SDValue LHS = Op0.getOperand(0);
23697 SDValue RHS = Op0.getOperand(1);
23698 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23699 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23700 }
23701
23702 // If the input is an OR, we can combine its operands into the KORTEST.
23703 SDValue LHS = Op0;
23704 SDValue RHS = Op0;
23705 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23706 LHS = Op0.getOperand(0);
23707 RHS = Op0.getOperand(1);
23708 }
23709
23710 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23711 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23712}
23713
23714/// Emit flags for the given setcc condition and operands. Also returns the
23715/// corresponding X86 condition code constant in X86CC.
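// For equality compares this tries, in order: BT formation, vector
// all-equal tests (MatchVectorAllEqualTest), KORTEST/KTEST, reuse of an
// existing X86ISD::SETCC, and reuse of the carry from (add X, -1), before
// falling back to a plain CMP.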
23716SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23717 ISD::CondCode CC, const SDLoc &dl,
23718 SelectionDAG &DAG,
23719 SDValue &X86CC) const {
23720 // Equality Combines.
23721 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23722 X86::CondCode X86CondCode;
23723
23724 // Optimize to BT if possible.
23725 // Lower (X & (1 << N)) == 0 to BT(X, N).
23726 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23727 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
23728 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
23729 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
23730 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23731 return BT;
23732 }
23733 }
23734
23735 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
23736 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
23737 X86CondCode)) {
23738 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23739 return CmpZ;
23740 }
23741
23742 // Try to lower using KORTEST or KTEST.
23743 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
23744 return Test;
23745
23746 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
23747 // of these.
23748 if (isOneConstant(Op1) || isNullConstant(Op1)) {
23749 // If the input is a setcc, then reuse the input setcc or use a new one
23750 // with the inverted condition.
23751 if (Op0.getOpcode() == X86ISD::SETCC) {
23752 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
23753
23754 X86CC = Op0.getOperand(0);
23755 if (Invert) {
23756 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
23757 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
23758 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23759 }
23760
23761 return Op0.getOperand(1);
23762 }
23763 }
23764
23765 // Try to use the carry flag from the add in place of a separate CMP for:
23766 // (seteq (add X, -1), -1). Similar for setne.
23767 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
23768 Op0.getOperand(1) == Op1) {
23769 if (isProfitableToUseFlagOp(Op0)) {
23770 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
23771
23772 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
23773 Op0.getOperand(1));
23774 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
23775 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23776 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23777 return SDValue(New.getNode(), 1);
23778 }
23779 }
23780 }
23781
23782 X86::CondCode CondCode =
23783 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
23784 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
23785
23786 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
23787 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23788 return EFLAGS;
23789}
23790
23791SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
23792
23793 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23794 Op.getOpcode() == ISD::STRICT_FSETCCS;
23795 MVT VT = Op->getSimpleValueType(0);
23796
23797 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
23798
23799 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23800 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23801 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23802 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23803 SDLoc dl(Op);
23804 ISD::CondCode CC =
23805 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23806
23807 if (isSoftF16(Op0.getValueType(), Subtarget))
23808 return SDValue();
23809
23810 // Handle f128 first, since one possible outcome is a normal integer
23811 // comparison which gets handled by emitFlagsForSetcc.
23812 if (Op0.getValueType() == MVT::f128) {
23813 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
23814 Op.getOpcode() == ISD::STRICT_FSETCCS);
23815
23816 // If softenSetCCOperands returned a scalar, use it.
23817 if (!Op1.getNode()) {
23818 assert(Op0.getValueType() == Op.getValueType() &&
23819 "Unexpected setcc expansion!");
23820 if (IsStrict)
23821 return DAG.getMergeValues({Op0, Chain}, dl);
23822 return Op0;
23823 }
23824 }
23825
23826 if (Op0.getSimpleValueType().isInteger()) {
23827 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
23828 // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
23829 // this may translate to fewer uops depending on the uarch implementation. The
23830 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
23831 // canonicalize to that CondCode.
23832 // NOTE: Only do this if incrementing the constant doesn't increase the bit
23833 // encoding size - so it must either already be an i8 or i32 immediate, or it
23834 // shrinks down to that. We don't do this for any i64's to avoid additional
23835 // constant materializations.
23836 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
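// For example, (setgt X, 9) is rewritten as (setge X, 10); COND_GE reads
// only SF/OF, whereas COND_G also reads ZF.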
23837 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
23838 const APInt &Op1Val = Op1C->getAPIntValue();
23839 if (!Op1Val.isZero()) {
23840 // Ensure the constant+1 doesn't overflow.
23841 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
23842 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
23843 APInt Op1ValPlusOne = Op1Val + 1;
23844 if (Op1ValPlusOne.isSignedIntN(32) &&
23845 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
23846 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
23847 CC = (CC == ISD::CondCode::SETGT) ? ISD::CondCode::SETGE
23848 : ISD::CondCode::SETUGE;
23849 }
23850 }
23851 }
23852 }
23853
23854 SDValue X86CC;
23855 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
23856 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23857 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23858 }
23859
23860 // Handle floating point.
23861 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
23862 if (CondCode == X86::COND_INVALID)
23863 return SDValue();
23864
23865 SDValue EFLAGS;
23866 if (IsStrict) {
23867 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23868 EFLAGS =
23869 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
23870 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
23871 Chain = EFLAGS.getValue(1);
23872 } else {
23873 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
23874 }
23875
23876 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23877 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23878 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23879}
23880
23881SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23882 SDValue LHS = Op.getOperand(0);
23883 SDValue RHS = Op.getOperand(1);
23884 SDValue Carry = Op.getOperand(2);
23885 SDValue Cond = Op.getOperand(3);
23886 SDLoc DL(Op);
23887
23888 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
23889 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23890
23891 // Recreate the carry if needed.
23892 EVT CarryVT = Carry.getValueType();
23893 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23894 Carry, DAG.getAllOnesConstant(DL, CarryVT));
23895
23896 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23897 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23898 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
23899}
23900
23901// This function returns three things: the arithmetic computation itself
23902// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
23903// flag and the condition code define the case in which the arithmetic
23904// computation overflows.
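// For example, ISD::SADDO maps to X86ISD::ADD with COND_O, and ISD::USUBO
// maps to X86ISD::SUB with COND_B (borrow via the carry flag).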
23905static std::pair<SDValue, SDValue>
23906getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23907 assert(Op.getResNo() == 0 && "Unexpected result number!");
23908 SDValue Value, Overflow;
23909 SDValue LHS = Op.getOperand(0);
23910 SDValue RHS = Op.getOperand(1);
23911 unsigned BaseOp = 0;
23912 SDLoc DL(Op);
23913 switch (Op.getOpcode()) {
23914 default: llvm_unreachable("Unknown ovf instruction!");
23915 case ISD::SADDO:
23916 BaseOp = X86ISD::ADD;
23917 Cond = X86::COND_O;
23918 break;
23919 case ISD::UADDO:
23920 BaseOp = X86ISD::ADD;
23921 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
23922 break;
23923 case ISD::SSUBO:
23924 BaseOp = X86ISD::SUB;
23925 Cond = X86::COND_O;
23926 break;
23927 case ISD::USUBO:
23928 BaseOp = X86ISD::SUB;
23929 Cond = X86::COND_B;
23930 break;
23931 case ISD::SMULO:
23932 BaseOp = X86ISD::SMUL;
23933 Cond = X86::COND_O;
23934 break;
23935 case ISD::UMULO:
23936 BaseOp = X86ISD::UMUL;
23937 Cond = X86::COND_O;
23938 break;
23939 }
23940
23941 if (BaseOp) {
23942 // Also sets EFLAGS.
23943 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23944 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23945 Overflow = Value.getValue(1);
23946 }
23947
23948 return std::make_pair(Value, Overflow);
23949}
23950
23951 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23952 // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
23953 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23954 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23955 // has only one use.
23956 SDLoc DL(Op);
23957 X86::CondCode Cond;
23958 SDValue Value, Overflow;
23959 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
23960
23961 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
23962 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
23963 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
23964}
23965
23966/// Return true if the opcode is an X86 logical comparison.
23967 static bool isX86LogicalCmp(SDValue Op) {
23968 unsigned Opc = Op.getOpcode();
23969 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
23970 Opc == X86ISD::FCMP)
23971 return true;
23972 if (Op.getResNo() == 1 &&
23973 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
23974 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
23975 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
23976 return true;
23977
23978 return false;
23979}
23980
23981 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
23982 if (V.getOpcode() != ISD::TRUNCATE)
23983 return false;
23984
23985 SDValue VOp0 = V.getOperand(0);
23986 unsigned InBits = VOp0.getValueSizeInBits();
23987 unsigned Bits = V.getValueSizeInBits();
23988 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
23989}
23990
23991SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
23992 bool AddTest = true;
23993 SDValue Cond = Op.getOperand(0);
23994 SDValue Op1 = Op.getOperand(1);
23995 SDValue Op2 = Op.getOperand(2);
23996 SDLoc DL(Op);
23997 MVT VT = Op1.getSimpleValueType();
23998 SDValue CC;
23999
24000 if (isSoftF16(VT, Subtarget)) {
24001 MVT NVT = VT.changeTypeToInteger();
24002 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
24003 DAG.getBitcast(NVT, Op1),
24004 DAG.getBitcast(NVT, Op2)));
24005 }
24006
24007 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24008 // are available or VBLENDV if AVX is available.
24009 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
24010 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24011 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24012 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
24013 bool IsAlwaysSignaling;
24014 unsigned SSECC =
24015 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24016 CondOp0, CondOp1, IsAlwaysSignaling);
24017
24018 if (Subtarget.hasAVX512()) {
24019 SDValue Cmp =
24020 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
24021 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24022 assert(!VT.isVector() && "Not a scalar type?");
24023 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24024 }
24025
24026 if (SSECC < 8 || Subtarget.hasAVX()) {
24027 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
24028 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24029
24030 // If we have AVX, we can use a variable vector select (VBLENDV) instead
24031 // of 3 logic instructions for size savings and potentially speed.
24032 // Unfortunately, there is no scalar form of VBLENDV.
24033
24034 // If either operand is a +0.0 constant, don't try this. We can expect to
24035 // optimize away at least one of the logic instructions later in that
24036 // case, so that sequence would be faster than a variable blend.
24037
24038 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
24039 // uses XMM0 as the selection register. That may need just as many
24040 // instructions as the AND/ANDN/OR sequence due to register moves, so
24041 // don't bother.
24042 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
24043 !isNullFPConstant(Op2)) {
24044 // Convert to vectors, do a VSELECT, and convert back to scalar.
24045 // All of the conversions should be optimized away.
24046 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
24047 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
24048 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
24049 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
24050
24051 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
24052 VCmp = DAG.getBitcast(VCmpVT, VCmp);
24053
24054 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
24055
24056 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
24057 VSel, DAG.getIntPtrConstant(0, DL));
24058 }
24059 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
24060 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
24061 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
24062 }
24063 }
24064
24065 // AVX512 fallback is to lower selects of scalar floats to masked moves.
24066 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
24067 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
24068 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24069 }
24070
24071 if (Cond.getOpcode() == ISD::SETCC &&
24072 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
24073 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
24074 Cond = NewCond;
24075 // If the condition was updated, it's possible that the operands of the
24076 // select were also updated (for example, EmitTest has a RAUW). Refresh
24077 // the local references to the select operands in case they got stale.
24078 Op1 = Op.getOperand(1);
24079 Op2 = Op.getOperand(2);
24080 }
24081 }
24082
24083 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24084 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24085 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24086 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24087 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24088 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24089 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24090 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24091 if (Cond.getOpcode() == X86ISD::SETCC &&
24092 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
24093 isNullConstant(Cond.getOperand(1).getOperand(1))) {
24094 SDValue Cmp = Cond.getOperand(1);
24095 SDValue CmpOp0 = Cmp.getOperand(0);
24096 unsigned CondCode = Cond.getConstantOperandVal(0);
24097
24098 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24099 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24100 // handling to keep the CMP with 0. This should be removed by
24101 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
24102 // cttz_zero_undef.
24103 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
24104 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
24105 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
24106 };
24107 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
24108 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
24109 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
24110 // Keep Cmp.
24111 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24112 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
24113 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
24114 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
24115
24116 // 'X - 1' sets the carry flag if X == 0.
24117 // '0 - X' sets the carry flag if X != 0.
24118 // Convert the carry flag to a -1/0 mask with sbb:
24119 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24120 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24121 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24122 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24123 SDValue Sub;
24124 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
24125 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
24126 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
24127 } else {
24128 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
24129 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
24130 }
24131 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24132 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24133 Sub.getValue(1));
24134 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24135 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
24136 CmpOp0.getOpcode() == ISD::AND &&
24137 isOneConstant(CmpOp0.getOperand(1))) {
24138 SDValue Src1, Src2;
24139 // True if Op2 is an XOR or OR operator and one of its operands
24140 // is equal to Op1, i.e. the pattern is
24141 // (a, a op b) || (b, a op b).
24142 auto isOrXorPattern = [&]() {
24143 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
24144 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
24145 Src1 =
24146 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
24147 Src2 = Op1;
24148 return true;
24149 }
24150 return false;
24151 };
24152
24153 if (isOrXorPattern()) {
24154 SDValue Neg;
24155 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
24156 // We need a mask of all zeros or all ones with the same size as the
24157 // other operands.
24158 if (CmpSz > VT.getSizeInBits())
24159 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
24160 else if (CmpSz < VT.getSizeInBits())
24161 Neg = DAG.getNode(ISD::AND, DL, VT,
24162 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
24163 DAG.getConstant(1, DL, VT));
24164 else
24165 Neg = CmpOp0;
24166 SDValue Mask = DAG.getNegative(Neg, DL, VT); // -(and (x, 0x1))
24167 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
24168 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
24169 }
24170 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
24171 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
24172 ((CondCode == X86::COND_S) || // smin(x, 0)
24173 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
24174 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24175 //
24176 // If the comparison is testing for a positive value, we have to invert
24177 // the sign bit mask, so only do that transform if the target has a
24178 // bitwise 'and not' instruction (the invert is free).
24179 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24180 unsigned ShCt = VT.getSizeInBits() - 1;
24181 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
24182 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
24183 if (CondCode == X86::COND_G)
24184 Shift = DAG.getNOT(DL, Shift, VT);
24185 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
24186 }
24187 }
24188
24189 // Look past (and (setcc_carry (cmp ...)), 1).
24190 if (Cond.getOpcode() == ISD::AND &&
24191 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
24192 isOneConstant(Cond.getOperand(1)))
24193 Cond = Cond.getOperand(0);
24194
24195 // If condition flag is set by a X86ISD::CMP, then use it as the condition
24196 // setting operand in place of the X86ISD::SETCC.
24197 unsigned CondOpcode = Cond.getOpcode();
24198 if (CondOpcode == X86ISD::SETCC ||
24199 CondOpcode == X86ISD::SETCC_CARRY) {
24200 CC = Cond.getOperand(0);
24201
24202 SDValue Cmp = Cond.getOperand(1);
24203 bool IllegalFPCMov = false;
24204 if (VT.isFloatingPoint() && !VT.isVector() &&
24205 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
24206 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24207
24208 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
24209 Cmp.getOpcode() == X86ISD::BT) { // FIXME
24210 Cond = Cmp;
24211 AddTest = false;
24212 }
24213 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
24214 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
24215 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
24216 SDValue Value;
24217 X86::CondCode X86Cond;
24218 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24219
24220 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24221 AddTest = false;
24222 }
24223
24224 if (AddTest) {
24225 // Look past the truncate if the high bits are known zero.
24226 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24227 Cond = Cond.getOperand(0);
24228
24229 // We know the result of AND is compared against zero. Try to match
24230 // it to BT.
24231 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24232 X86::CondCode X86CondCode;
24233 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
24234 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
24235 Cond = BT;
24236 AddTest = false;
24237 }
24238 }
24239 }
24240
24241 if (AddTest) {
24242 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24243 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24244 }
24245
24246 // a < b ? -1 : 0 -> RES = ~setcc_carry
24247 // a < b ? 0 : -1 -> RES = setcc_carry
24248 // a >= b ? -1 : 0 -> RES = setcc_carry
24249 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24250 if (Cond.getOpcode() == X86ISD::SUB) {
24251 unsigned CondCode = CC->getAsZExtVal();
24252
24253 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24254 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24255 (isNullConstant(Op1) || isNullConstant(Op2))) {
24256 SDValue Res =
24257 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24258 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24259 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24260 return DAG.getNOT(DL, Res, Res.getValueType());
24261 return Res;
24262 }
24263 }
24264
24265 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
24266 // widen the cmov and push the truncate through. This avoids introducing a new
24267 // branch during isel and doesn't add any extensions.
24268 if (Op.getValueType() == MVT::i8 &&
24269 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24270 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24271 if (T1.getValueType() == T2.getValueType() &&
24272 // Exclude CopyFromReg to avoid partial register stalls.
24273 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24274 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24275 CC, Cond);
24276 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24277 }
24278 }
24279
24280 // Or finally, promote i8 cmovs if we have CMOV,
24281 // or i16 cmovs if it won't prevent folding a load.
24282 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
24283 // legal, but EmitLoweredSelect() cannot deal with these extensions
24284 // being inserted between two CMOVs. (This applies to the i16 case too.)
24285 // https://bugs.llvm.org/show_bug.cgi?id=40974
24286 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
24287 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
24288 !X86::mayFoldLoad(Op2, Subtarget))) {
24289 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24290 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24291 SDValue Ops[] = { Op2, Op1, CC, Cond };
24292 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24293 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24294 }
24295
24296 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24297 // condition is true.
24298 SDValue Ops[] = { Op2, Op1, CC, Cond };
24299 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
24300}
24301
24302 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
24303 const X86Subtarget &Subtarget,
24304 SelectionDAG &DAG) {
24305 MVT VT = Op->getSimpleValueType(0);
24306 SDValue In = Op->getOperand(0);
24307 MVT InVT = In.getSimpleValueType();
24308 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24309 MVT VTElt = VT.getVectorElementType();
24310 SDLoc dl(Op);
24311
24312 unsigned NumElts = VT.getVectorNumElements();
24313
24314 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24315 MVT ExtVT = VT;
24316 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24317 // If v16i32 is to be avoided, we'll need to split and concatenate.
24318 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24319 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24320
24321 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24322 }
24323
24324 // Widen to 512-bits if VLX is not supported.
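// For example, without VLX a v8i1 -> v8i32 extend is widened here to
// v16i1 -> v16i32 and the low v8i32 half is extracted again at the end.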
24325 MVT WideVT = ExtVT;
24326 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24327 NumElts *= 512 / ExtVT.getSizeInBits();
24328 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24329 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24330 In, DAG.getIntPtrConstant(0, dl));
24331 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24332 }
24333
24334 SDValue V;
24335 MVT WideEltVT = WideVT.getVectorElementType();
24336 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24337 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24338 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24339 } else {
24340 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24341 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24342 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24343 }
24344
24345 // Truncate if we had to extend i16/i8 above.
24346 if (VT != ExtVT) {
24347 WideVT = MVT::getVectorVT(VTElt, NumElts);
24348 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24349 }
24350
24351 // Extract back to 128/256-bit if we widened.
24352 if (WideVT != VT)
24353 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24354 DAG.getIntPtrConstant(0, dl));
24355
24356 return V;
24357}
24358
24359 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24360 SelectionDAG &DAG) {
24361 SDValue In = Op->getOperand(0);
24362 MVT InVT = In.getSimpleValueType();
24363
24364 if (InVT.getVectorElementType() == MVT::i1)
24365 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24366
24367 assert(Subtarget.hasAVX() && "Expected AVX support");
24368 return LowerAVXExtend(Op, DAG, Subtarget);
24369}
24370
24371// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24372// For sign extend this needs to handle all vector sizes and SSE4.1 and
24373// non-SSE4.1 targets. For zero extend this should only handle inputs of
24374// MVT::v64i8 when BWI is not supported, but AVX512 is.
24375 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24376 const X86Subtarget &Subtarget,
24377 SelectionDAG &DAG) {
24378 SDValue In = Op->getOperand(0);
24379 MVT VT = Op->getSimpleValueType(0);
24380 MVT InVT = In.getSimpleValueType();
24381
24382 MVT SVT = VT.getVectorElementType();
24383 MVT InSVT = InVT.getVectorElementType();
24385
24386 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24387 return SDValue();
24388 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24389 return SDValue();
24390 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24391 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24392 !(VT.is512BitVector() && Subtarget.hasAVX512()))
24393 return SDValue();
24394
24395 SDLoc dl(Op);
24396 unsigned Opc = Op.getOpcode();
24397 unsigned NumElts = VT.getVectorNumElements();
24398
24399 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24400 // For 512-bit vectors, we need 128-bits or 256-bits.
24401 if (InVT.getSizeInBits() > 128) {
24402 // Input needs to be at least the same number of elements as output, and
24403 // at least 128-bits.
24404 int InSize = InSVT.getSizeInBits() * NumElts;
24405 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24406 InVT = In.getSimpleValueType();
24407 }
24408
24409 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24410 // so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
24411 // need to be handled here for 256/512-bit results.
24412 if (Subtarget.hasInt256()) {
24413 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24414
24415 if (InVT.getVectorNumElements() != NumElts)
24416 return DAG.getNode(Op.getOpcode(), dl, VT, In);
24417
24418 // FIXME: Apparently we create inreg operations that could be regular
24419 // extends.
24420 unsigned ExtOpc =
24421 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24422 : ISD::ZERO_EXTEND;
24423 return DAG.getNode(ExtOpc, dl, VT, In);
24424 }
24425
24426 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24427 if (Subtarget.hasAVX()) {
24428 assert(VT.is256BitVector() && "256-bit vector expected");
24429 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24430 int HalfNumElts = HalfVT.getVectorNumElements();
24431
24432 unsigned NumSrcElts = InVT.getVectorNumElements();
24433 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24434 for (int i = 0; i != HalfNumElts; ++i)
24435 HiMask[i] = HalfNumElts + i;
24436
24437 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24438 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24439 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24440 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24441 }
24442
24443 // We should only get here for sign extend.
24444 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24445 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24446 unsigned InNumElts = InVT.getVectorNumElements();
24447
24448 // If the source elements are already all-signbits, we don't need to extend,
24449 // just splat the elements.
24450 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
24451 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
24452 unsigned Scale = InNumElts / NumElts;
24453 SmallVector<int, 16> ShuffleMask;
24454 for (unsigned I = 0; I != NumElts; ++I)
24455 ShuffleMask.append(Scale, I);
24456 return DAG.getBitcast(VT,
24457 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
24458 }
24459
24460 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24461 SDValue Curr = In;
24462 SDValue SignExt = Curr;
24463
24464 // As SRAI is only available on i16/i32 types, we expand only up to i32
24465 // and handle i64 separately.
24466 if (InVT != MVT::v4i32) {
24467 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24468
24469 unsigned DestWidth = DestVT.getScalarSizeInBits();
24470 unsigned Scale = DestWidth / InSVT.getSizeInBits();
24471 unsigned DestElts = DestVT.getVectorNumElements();
24472
24473 // Build a shuffle mask that takes each input element and places it in the
24474 // MSBs of the new element size.
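// For example, sign-extending v8i16 -> v4i32 builds the mask
// <-1,0,-1,1,-1,2,-1,3>, so each i16 lands in the upper half of an i32 lane
// and the VSRAI below shifts it back down with sign bits.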
24475 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24476 for (unsigned i = 0; i != DestElts; ++i)
24477 Mask[i * Scale + (Scale - 1)] = i;
24478
24479 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24480 Curr = DAG.getBitcast(DestVT, Curr);
24481
24482 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24483 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24484 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24485 }
24486
24487 if (VT == MVT::v2i64) {
24488 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24489 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24490 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24491 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24492 SignExt = DAG.getBitcast(VT, SignExt);
24493 }
24494
24495 return SignExt;
24496}
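// Editor's note: a minimal scalar sketch (not part of this file) of the
// pre-SSE41 trick used above: shuffle the narrow element into the
// most-significant bits of the wider lane, then arithmetic-shift right so the
// sign bit is replicated. The helper name is illustrative only and the sketch
// assumes the usual x86 arithmetic right shift on signed integers.
//
//   #include <cstdint>
//   int32_t signExtend8To32ViaShift(uint8_t Lane) {
//     // Place the 8-bit lane in bits [31:24], as the shuffle does per lane.
//     int32_t Widened = static_cast<int32_t>(static_cast<uint32_t>(Lane) << 24);
//     // A VSRAI-style arithmetic shift brings it back down, sign-extended.
//     return Widened >> 24;
//   }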
24497
24498 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24499                                 SelectionDAG &DAG) {
24500 MVT VT = Op->getSimpleValueType(0);
24501 SDValue In = Op->getOperand(0);
24502 MVT InVT = In.getSimpleValueType();
24503 SDLoc dl(Op);
24504
24505 if (InVT.getVectorElementType() == MVT::i1)
24506 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24507
24508 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24509  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
24510         "Expected same number of elements");
24511 assert((VT.getVectorElementType() == MVT::i16 ||
24512 VT.getVectorElementType() == MVT::i32 ||
24513 VT.getVectorElementType() == MVT::i64) &&
24514 "Unexpected element type");
24515 assert((InVT.getVectorElementType() == MVT::i8 ||
24516 InVT.getVectorElementType() == MVT::i16 ||
24517 InVT.getVectorElementType() == MVT::i32) &&
24518 "Unexpected element type");
24519
24520 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24521 assert(InVT == MVT::v32i8 && "Unexpected VT!");
24522 return splitVectorIntUnary(Op, DAG, dl);
24523 }
24524
24525 if (Subtarget.hasInt256())
24526 return Op;
24527
24528  // Optimize vectors in AVX mode:
24529  // sign extend v8i16 to v8i32 and
24530  // v4i32 to v4i64.
24531  //
24532  // Divide the input vector into two parts;
24533  // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
24534  // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
24535  // then concat the halves back to the original VT.
24536 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24537 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24538
24539 unsigned NumElems = InVT.getVectorNumElements();
24540 SmallVector<int,8> ShufMask(NumElems, -1);
24541 for (unsigned i = 0; i != NumElems/2; ++i)
24542 ShufMask[i] = i + NumElems/2;
24543
24544 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24545 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24546
24547 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24548}
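// Editor's note: an illustrative sketch (not part of this file) of the
// high-half shuffle mask built above. For a v8i16 input being sign-extended to
// v8i32, the low half is extended in place and the high half is first moved
// down with the mask {4, 5, 6, 7, -1, -1, -1, -1} before its own in-register
// extension. The helper name is hypothetical.
//
//   #include <vector>
//   std::vector<int> makeHighHalfMask(unsigned NumElems) {
//     std::vector<int> Mask(NumElems, -1);   // -1 == undef lane
//     for (unsigned i = 0; i != NumElems / 2; ++i)
//       Mask[i] = int(i + NumElems / 2);     // pull high lanes down
//     return Mask;                           // e.g. {4,5,6,7,-1,-1,-1,-1}
//   }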
24549
24550/// Change a vector store into a pair of half-size vector stores.
24551 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24552   SDValue StoredVal = Store->getValue();
24553 assert((StoredVal.getValueType().is256BitVector() ||
24554 StoredVal.getValueType().is512BitVector()) &&
24555 "Expecting 256/512-bit op");
24556
24557 // Splitting volatile memory ops is not allowed unless the operation was not
24558 // legal to begin with. Assume the input store is legal (this transform is
24559 // only used for targets with AVX). Note: It is possible that we have an
24560 // illegal type like v2i128, and so we could allow splitting a volatile store
24561 // in that case if that is important.
24562 if (!Store->isSimple())
24563 return SDValue();
24564
24565 SDLoc DL(Store);
24566 SDValue Value0, Value1;
24567 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24568 unsigned HalfOffset = Value0.getValueType().getStoreSize();
24569 SDValue Ptr0 = Store->getBasePtr();
24570 SDValue Ptr1 =
24571 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
24572 SDValue Ch0 =
24573 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24574 Store->getOriginalAlign(),
24575 Store->getMemOperand()->getFlags());
24576 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24577 Store->getPointerInfo().getWithOffset(HalfOffset),
24578 Store->getOriginalAlign(),
24579 Store->getMemOperand()->getFlags());
24580 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24581}
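// Editor's note: a hedged, standalone illustration (not part of this file) of
// what splitVectorStore emits for a 256-bit value: two independent 128-bit
// stores at byte offsets 0 and 16 of the original address. The function name
// is illustrative only.
//
//   #include <cstdint>
//   #include <cstring>
//   void storeSplit256(uint8_t *Dst, const uint8_t (&Src)[32]) {
//     std::memcpy(Dst, Src, 16);           // low 128 bits at offset 0
//     std::memcpy(Dst + 16, Src + 16, 16); // high 128 bits at offset 16
//   }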
24582
24583/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
24584/// type.
24585 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24586                                     SelectionDAG &DAG) {
24587 SDValue StoredVal = Store->getValue();
24588 assert(StoreVT.is128BitVector() &&
24589 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24590 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24591
24592 // Splitting volatile memory ops is not allowed unless the operation was not
24593 // legal to begin with. We are assuming the input op is legal (this transform
24594 // is only used for targets with AVX).
24595 if (!Store->isSimple())
24596 return SDValue();
24597
24598 MVT StoreSVT = StoreVT.getScalarType();
24599 unsigned NumElems = StoreVT.getVectorNumElements();
24600 unsigned ScalarSize = StoreSVT.getStoreSize();
24601
24602 SDLoc DL(Store);
24603  SmallVector<SDValue, 4> Stores;
24604  for (unsigned i = 0; i != NumElems; ++i) {
24605 unsigned Offset = i * ScalarSize;
24606    SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24607                                           TypeSize::getFixed(Offset), DL);
24608 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24609 DAG.getIntPtrConstant(i, DL));
24610 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24611 Store->getPointerInfo().getWithOffset(Offset),
24612 Store->getOriginalAlign(),
24613 Store->getMemOperand()->getFlags());
24614 Stores.push_back(Ch);
24615 }
24616 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24617}
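// Editor's note: an illustrative standalone sketch (not part of this file) of
// the scalarization above: a 128-bit store becomes NumElems scalar stores at a
// stride of the scalar store size, e.g. four 4-byte stores for a v4i32. The
// helper below is hypothetical and fixed to that one case.
//
//   #include <cstdint>
//   #include <cstring>
//   void storeScalarized4x32(uint8_t *Dst, const uint32_t (&Elts)[4]) {
//     for (unsigned i = 0; i != 4; ++i)
//       std::memcpy(Dst + i * sizeof(uint32_t), &Elts[i], sizeof(uint32_t));
//   }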
24618
24619static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
24620 SelectionDAG &DAG) {
24621 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24622 SDLoc dl(St);
24623 SDValue StoredVal = St->getValue();
24624
24625 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24626 if (StoredVal.getValueType().isVector() &&
24627 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24628 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24629 assert(NumElts <= 8 && "Unexpected VT");
24630 assert(!St->isTruncatingStore() && "Expected non-truncating store");
24631 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24632 "Expected AVX512F without AVX512DQI");
24633
24634 // We must pad with zeros to ensure we store zeroes to any unused bits.
24635 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24636 DAG.getUNDEF(MVT::v16i1), StoredVal,
24637 DAG.getIntPtrConstant(0, dl));
24638 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24639 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24640 // Make sure we store zeros in the extra bits.
24641 if (NumElts < 8)
24642 StoredVal = DAG.getZeroExtendInReg(
24643 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24644
24645 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24646 St->getPointerInfo(), St->getOriginalAlign(),
24647 St->getMemOperand()->getFlags());
24648 }
24649
24650 if (St->isTruncatingStore())
24651 return SDValue();
24652
24653 // If this is a 256-bit store of concatenated ops, we are better off splitting
24654 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24655 // and each half can execute independently. Some cores would split the op into
24656 // halves anyway, so the concat (vinsertf128) is purely an extra op.
24657 MVT StoreVT = StoredVal.getSimpleValueType();
24658 if (StoreVT.is256BitVector() ||
24659 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24660 !Subtarget.hasBWI())) {
24661 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal.getNode(), DAG))
24662 return splitVectorStore(St, DAG);
24663 return SDValue();
24664 }
24665
24666 if (StoreVT.is32BitVector())
24667 return SDValue();
24668
24669 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24670 assert(StoreVT.is64BitVector() && "Unexpected VT");
24671 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24672             TargetLowering::TypeWidenVector &&
24673         "Unexpected type action!");
24674
24675 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24676 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24677 DAG.getUNDEF(StoreVT));
24678
24679 if (Subtarget.hasSSE2()) {
24680 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24681 // and store it.
24682 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24683 MVT CastVT = MVT::getVectorVT(StVT, 2);
24684 StoredVal = DAG.getBitcast(CastVT, StoredVal);
24685 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24686 DAG.getIntPtrConstant(0, dl));
24687
24688 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24689 St->getPointerInfo(), St->getOriginalAlign(),
24690 St->getMemOperand()->getFlags());
24691 }
24692 assert(Subtarget.hasSSE1() && "Expected SSE");
24693 SDVTList Tys = DAG.getVTList(MVT::Other);
24694 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24695 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24696 St->getMemOperand());
24697}
24698
24699// Lower vector extended loads using a shuffle. If SSSE3 is not available we
24700// may emit an illegal shuffle but the expansion is still better than scalar
24701// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
24702// we'll emit a shuffle and an arithmetic shift.
24703// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24704// TODO: It is possible to support ZExt by zeroing the undef values during
24705// the shuffle phase or after the shuffle.
24706static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24707 SelectionDAG &DAG) {
24708 MVT RegVT = Op.getSimpleValueType();
24709 assert(RegVT.isVector() && "We only custom lower vector loads.");
24710 assert(RegVT.isInteger() &&
24711 "We only custom lower integer vector loads.");
24712
24713 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24714 SDLoc dl(Ld);
24715
24716 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24717 if (RegVT.getVectorElementType() == MVT::i1) {
24718 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24719 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24720 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24721 "Expected AVX512F without AVX512DQI");
24722
24723 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24724 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24725 Ld->getMemOperand()->getFlags());
24726
24727 // Replace chain users with the new chain.
24728 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24729
24730 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24731 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24732 DAG.getBitcast(MVT::v16i1, Val),
24733 DAG.getIntPtrConstant(0, dl));
24734 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24735 }
24736
24737 return SDValue();
24738}
24739
24740/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24741/// each of which has no other use apart from the AND / OR.
24742static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24743 Opc = Op.getOpcode();
24744 if (Opc != ISD::OR && Opc != ISD::AND)
24745 return false;
24746 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24747 Op.getOperand(0).hasOneUse() &&
24748 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24749 Op.getOperand(1).hasOneUse());
24750}
24751
24752SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24753 SDValue Chain = Op.getOperand(0);
24754 SDValue Cond = Op.getOperand(1);
24755 SDValue Dest = Op.getOperand(2);
24756 SDLoc dl(Op);
24757
24758 // Bail out when we don't have native compare instructions.
24759 if (Cond.getOpcode() == ISD::SETCC &&
24760 Cond.getOperand(0).getValueType() != MVT::f128 &&
24761 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
24762 SDValue LHS = Cond.getOperand(0);
24763 SDValue RHS = Cond.getOperand(1);
24764 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24765
24766 // Special case for
24767 // setcc([su]{add,sub,mul}o == 0)
24768 // setcc([su]{add,sub,mul}o != 1)
24769 if (ISD::isOverflowIntrOpRes(LHS) &&
24770 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24771 (isNullConstant(RHS) || isOneConstant(RHS))) {
24772 SDValue Value, Overflow;
24773 X86::CondCode X86Cond;
24774 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24775
24776 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24777 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24778
24779 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24780 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24781 Overflow);
24782 }
24783
24784 if (LHS.getSimpleValueType().isInteger()) {
24785 SDValue CCVal;
24786 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24787 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24788 EFLAGS);
24789 }
24790
24791 if (CC == ISD::SETOEQ) {
24792 // For FCMP_OEQ, we can emit
24793 // two branches instead of an explicit AND instruction with a
24794 // separate test. However, we only do this if this block doesn't
24795 // have a fall-through edge, because this requires an explicit
24796 // jmp when the condition is false.
24797 if (Op.getNode()->hasOneUse()) {
24798 SDNode *User = *Op.getNode()->use_begin();
24799 // Look for an unconditional branch following this conditional branch.
24800 // We need this because we need to reverse the successors in order
24801 // to implement FCMP_OEQ.
24802 if (User->getOpcode() == ISD::BR) {
24803 SDValue FalseBB = User->getOperand(1);
24804 SDNode *NewBR =
24805 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24806 assert(NewBR == User);
24807 (void)NewBR;
24808 Dest = FalseBB;
24809
24810 SDValue Cmp =
24811 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24812 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24813 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24814 CCVal, Cmp);
24815 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24816 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24817 Cmp);
24818 }
24819 }
24820 } else if (CC == ISD::SETUNE) {
24821 // For FCMP_UNE, we can emit
24822 // two branches instead of an explicit OR instruction with a
24823 // separate test.
24824 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24825 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24826 Chain =
24827 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24828 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24829 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24830 Cmp);
24831 } else {
24832 X86::CondCode X86Cond =
24833 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24834 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24835 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24836 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24837 Cmp);
24838 }
24839 }
24840
24841  if (ISD::isOverflowIntrOpRes(Cond)) {
24842    SDValue Value, Overflow;
24843 X86::CondCode X86Cond;
24844 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24845
24846 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24847 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24848 Overflow);
24849 }
24850
24851 // Look past the truncate if the high bits are known zero.
24852  if (isTruncWithZeroHighBitsInput(Cond, DAG))
24853    Cond = Cond.getOperand(0);
24854
24855 EVT CondVT = Cond.getValueType();
24856
24857 // Add an AND with 1 if we don't already have one.
24858 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24859 Cond =
24860 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24861
24862 SDValue LHS = Cond;
24863 SDValue RHS = DAG.getConstant(0, dl, CondVT);
24864
24865 SDValue CCVal;
24866 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24867 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24868 EFLAGS);
24869}
24870
24871// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
24872// Calls to _alloca are needed to probe the stack when allocating more than 4k
24873// bytes in one go. Touching the stack at 4K increments is necessary to ensure
24874// that the guard pages used by the OS virtual memory manager are allocated in
24875// correct sequence.
24876SDValue
24877X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
24878 SelectionDAG &DAG) const {
24879  MachineFunction &MF = DAG.getMachineFunction();
24880  bool SplitStack = MF.shouldSplitStack();
24881 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
24882 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24883 SplitStack || EmitStackProbeCall;
24884 SDLoc dl(Op);
24885
24886 // Get the inputs.
24887 SDNode *Node = Op.getNode();
24888 SDValue Chain = Op.getOperand(0);
24889 SDValue Size = Op.getOperand(1);
24890 MaybeAlign Alignment(Op.getConstantOperandVal(2));
24891 EVT VT = Node->getValueType(0);
24892
24893 // Chain the dynamic stack allocation so that it doesn't modify the stack
24894 // pointer when other instructions are using the stack.
24895 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24896
24897 bool Is64Bit = Subtarget.is64Bit();
24898 MVT SPTy = getPointerTy(DAG.getDataLayout());
24899
24900  SDValue Result;
24901 if (!Lower) {
24902 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24903    Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24904    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
24905 " not tell us which reg is the stack pointer!");
24906
24907 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24908 const Align StackAlign = TFI.getStackAlign();
24909 if (hasInlineStackProbe(MF)) {
24910      MachineRegisterInfo &MRI = MF.getRegInfo();
24911
24912 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24913 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24914 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24915 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24916 DAG.getRegister(Vreg, SPTy));
24917 } else {
24918 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24919 Chain = SP.getValue(1);
24920 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24921 }
24922 if (Alignment && *Alignment > StackAlign)
24923 Result =
24924 DAG.getNode(ISD::AND, dl, VT, Result,
24925 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24926 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
24927 } else if (SplitStack) {
24928    MachineRegisterInfo &MRI = MF.getRegInfo();
24929
24930 if (Is64Bit) {
24931      // The 64-bit implementation of segmented stacks needs to clobber both r10
24932      // and r11. This makes it impossible to use it along with nested parameters.
24933 const Function &F = MF.getFunction();
24934 for (const auto &A : F.args()) {
24935 if (A.hasNestAttr())
24936 report_fatal_error("Cannot use segmented stacks with functions that "
24937 "have nested arguments.");
24938 }
24939 }
24940
24941 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24942 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24943 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24944 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
24945 DAG.getRegister(Vreg, SPTy));
24946 } else {
24947 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
24948 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
24949 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
24950
24951 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24952 Register SPReg = RegInfo->getStackRegister();
24953 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
24954 Chain = SP.getValue(1);
24955
24956 if (Alignment) {
24957 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
24958 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24959 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
24960 }
24961
24962 Result = SP;
24963 }
24964
24965 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
24966
24967 SDValue Ops[2] = {Result, Chain};
24968 return DAG.getMergeValues(Ops, dl);
24969}
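// Editor's note: an illustrative sketch (not part of this file) of the
// over-alignment step above: after subtracting the allocation size from the
// stack pointer, the result is rounded down to the requested alignment with a
// mask, exactly like the ISD::AND with ~(Alignment - 1) emitted here. The
// helper name is hypothetical.
//
//   #include <cstdint>
//   uint64_t alignDownSP(uint64_t SP, uint64_t Size, uint64_t Alignment) {
//     uint64_t Result = SP - Size;      // grow the stack downward
//     return Result & ~(Alignment - 1); // Alignment must be a power of two
//   }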
24970
24971SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
24972  MachineFunction &MF = DAG.getMachineFunction();
24973  auto PtrVT = getPointerTy(MF.getDataLayout());
24974  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
24975
24976 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24977 SDLoc DL(Op);
24978
24979 if (!Subtarget.is64Bit() ||
24980 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
24981 // vastart just stores the address of the VarArgsFrameIndex slot into the
24982 // memory location argument.
24983 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24984 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
24985 MachinePointerInfo(SV));
24986 }
24987
24988 // __va_list_tag:
24989 // gp_offset (0 - 6 * 8)
24990 // fp_offset (48 - 48 + 8 * 16)
24991 // overflow_arg_area (point to parameters coming in memory).
24992 // reg_save_area
24993  SmallVector<SDValue, 8> MemOps;
24994  SDValue FIN = Op.getOperand(1);
24995 // Store gp_offset
24996 SDValue Store = DAG.getStore(
24997 Op.getOperand(0), DL,
24998 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
24999 MachinePointerInfo(SV));
25000 MemOps.push_back(Store);
25001
25002 // Store fp_offset
25003 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25004 Store = DAG.getStore(
25005 Op.getOperand(0), DL,
25006 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25007 MachinePointerInfo(SV, 4));
25008 MemOps.push_back(Store);
25009
25010 // Store ptr to overflow_arg_area
25011 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25012 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25013 Store =
25014 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25015 MemOps.push_back(Store);
25016
25017 // Store ptr to reg_save_area.
25018 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25019 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25020 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25021 Store = DAG.getStore(
25022 Op.getOperand(0), DL, RSFIN, FIN,
25023 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25024 MemOps.push_back(Store);
25025 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25026}
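// Editor's note: for reference only, a plain-C view of the SysV x86-64 va_list
// record that the four stores above populate (offsets 0, 4, 8 and 16 for LP64;
// the two pointer fields sit at 8 and 12 under ILP32/X32). This is the standard
// ABI layout, reproduced as an illustration; the struct name is hypothetical.
//
//   struct VaListTag64 {          // matches __va_list_tag on LP64
//     unsigned gp_offset;         // byte 0:  next GPR slot (0..48)
//     unsigned fp_offset;         // byte 4:  next XMM slot (48..176)
//     void *overflow_arg_area;    // byte 8:  stack-passed arguments
//     void *reg_save_area;        // byte 16: spilled register save area
//   };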
25027
25028SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25029 assert(Subtarget.is64Bit() &&
25030 "LowerVAARG only handles 64-bit va_arg!");
25031 assert(Op.getNumOperands() == 4);
25032
25033  MachineFunction &MF = DAG.getMachineFunction();
25034  if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25035 // The Win64 ABI uses char* instead of a structure.
25036 return DAG.expandVAArg(Op.getNode());
25037
25038 SDValue Chain = Op.getOperand(0);
25039 SDValue SrcPtr = Op.getOperand(1);
25040 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25041 unsigned Align = Op.getConstantOperandVal(3);
25042 SDLoc dl(Op);
25043
25044 EVT ArgVT = Op.getNode()->getValueType(0);
25045 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25046 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25047 uint8_t ArgMode;
25048
25049 // Decide which area this value should be read from.
25050 // TODO: Implement the AMD64 ABI in its entirety. This simple
25051 // selection mechanism works only for the basic types.
25052 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25053 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25054 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25055 } else {
25056 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25057 "Unhandled argument type in LowerVAARG");
25058 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25059 }
25060
25061 if (ArgMode == 2) {
25062 // Make sure using fp_offset makes sense.
25063 assert(!Subtarget.useSoftFloat() &&
25064 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25065 Subtarget.hasSSE1());
25066 }
25067
25068 // Insert VAARG node into the DAG
25069 // VAARG returns two values: Variable Argument Address, Chain
25070 SDValue InstOps[] = {Chain, SrcPtr,
25071 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25072 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25073 DAG.getTargetConstant(Align, dl, MVT::i32)};
25074 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
25075  SDValue VAARG = DAG.getMemIntrinsicNode(
25076      Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
25077      VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
25078      /*Alignment=*/std::nullopt,
25079      MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
25080  Chain = VAARG.getValue(1);
25081
25082 // Load the next argument and return it
25083 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
25084}
25085
25086static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
25087 SelectionDAG &DAG) {
25088 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25089 // where a va_list is still an i8*.
25090 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25091 if (Subtarget.isCallingConvWin64(
25092          DAG.getMachineFunction().getFunction().getCallingConv()))
25093    // Probably a Win64 va_copy.
25094 return DAG.expandVACopy(Op.getNode());
25095
25096 SDValue Chain = Op.getOperand(0);
25097 SDValue DstPtr = Op.getOperand(1);
25098 SDValue SrcPtr = Op.getOperand(2);
25099 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25100 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25101 SDLoc DL(Op);
25102
25103 return DAG.getMemcpy(
25104 Chain, DL, DstPtr, SrcPtr,
25105 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
25106 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
25107 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
25108}
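// Editor's note: an equivalent plain-C view (illustrative only, hypothetical
// helper name) of the lowering above: on LP64 a va_copy is just a 24-byte copy
// of the __va_list_tag record, 16 bytes under the ILP32/X32 ABI.
//
//   #include <cstring>
//   void vaCopyLP64(void *Dst, const void *Src) {
//     std::memcpy(Dst, Src, 24); // sizeof(__va_list_tag) on LP64
//   }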
25109
25110// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
25111static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
25112 switch (Opc) {
25113 case ISD::SHL:
25114 case X86ISD::VSHL:
25115 case X86ISD::VSHLI:
25116 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
25117 case ISD::SRL:
25118 case X86ISD::VSRL:
25119 case X86ISD::VSRLI:
25120 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
25121 case ISD::SRA:
25122 case X86ISD::VSRA:
25123 case X86ISD::VSRAI:
25124 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
25125 }
25126 llvm_unreachable("Unknown target vector shift node");
25127}
25128
25129/// Handle vector element shifts where the shift amount is a constant.
25130/// Takes immediate version of shift as input.
25131static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
25132 SDValue SrcOp, uint64_t ShiftAmt,
25133 SelectionDAG &DAG) {
25134 MVT ElementType = VT.getVectorElementType();
25135
25136 // Bitcast the source vector to the output type, this is mainly necessary for
25137 // vXi8/vXi64 shifts.
25138 if (VT != SrcOp.getSimpleValueType())
25139 SrcOp = DAG.getBitcast(VT, SrcOp);
25140
25141 // Fold this packed shift into its first operand if ShiftAmt is 0.
25142 if (ShiftAmt == 0)
25143 return SrcOp;
25144
25145 // Check for ShiftAmt >= element width
25146 if (ShiftAmt >= ElementType.getSizeInBits()) {
25147 if (Opc == X86ISD::VSRAI)
25148 ShiftAmt = ElementType.getSizeInBits() - 1;
25149 else
25150 return DAG.getConstant(0, dl, VT);
25151 }
25152
25153 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
25154 && "Unknown target vector shift-by-constant node");
25155
25156 // Fold this packed vector shift into a build vector if SrcOp is a
25157 // vector of Constants or UNDEFs.
25158  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
25159    unsigned ShiftOpc;
25160 switch (Opc) {
25161 default: llvm_unreachable("Unknown opcode!");
25162 case X86ISD::VSHLI:
25163 ShiftOpc = ISD::SHL;
25164 break;
25165 case X86ISD::VSRLI:
25166 ShiftOpc = ISD::SRL;
25167 break;
25168 case X86ISD::VSRAI:
25169 ShiftOpc = ISD::SRA;
25170 break;
25171 }
25172
25173 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
25174 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
25175 return C;
25176 }
25177
25178 return DAG.getNode(Opc, dl, VT, SrcOp,
25179 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
25180}
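// Editor's note: a scalar sketch (not part of this file) of the out-of-range
// handling above: logical shifts by >= the element width fold to zero, while
// arithmetic right shifts are clamped to width - 1 so every result bit becomes
// a copy of the sign bit. The helper is hypothetical and assumes the arithmetic
// right shift behavior of x86.
//
//   #include <cstdint>
//   int32_t foldSraByConst(int32_t X, uint64_t Amt) {
//     if (Amt >= 32)
//       Amt = 31;      // VSRAI clamps instead of producing zero
//     return X >> Amt; // arithmetic shift, as on x86
//   }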
25181
25182/// Handle vector element shifts by a splat shift amount
25183static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
25184 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
25185 const X86Subtarget &Subtarget,
25186 SelectionDAG &DAG) {
25187 MVT AmtVT = ShAmt.getSimpleValueType();
25188 assert(AmtVT.isVector() && "Vector shift type mismatch");
25189 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
25190 "Illegal vector splat index");
25191
25192 // Move the splat element to the bottom element.
25193 if (ShAmtIdx != 0) {
25194 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
25195 Mask[0] = ShAmtIdx;
25196 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
25197 }
25198
25199 // Peek through any zext node if we can get back to a 128-bit source.
25200 if (AmtVT.getScalarSizeInBits() == 64 &&
25201 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
25202       ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
25203      ShAmt.getOperand(0).getValueType().isSimple() &&
25204 ShAmt.getOperand(0).getValueType().is128BitVector()) {
25205 ShAmt = ShAmt.getOperand(0);
25206 AmtVT = ShAmt.getSimpleValueType();
25207 }
25208
25209 // See if we can mask off the upper elements using the existing source node.
25210 // The shift uses the entire lower 64-bits of the amount vector, so no need to
25211 // do this for vXi64 types.
25212 bool IsMasked = false;
25213 if (AmtVT.getScalarSizeInBits() < 64) {
25214 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
25215 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
25216 // If the shift amount has come from a scalar, then zero-extend the scalar
25217 // before moving to the vector.
25218 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
25219 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25220 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
25221 AmtVT = MVT::v4i32;
25222 IsMasked = true;
25223 } else if (ShAmt.getOpcode() == ISD::AND) {
25224 // See if the shift amount is already masked (e.g. for rotation modulo),
25225 // then we can zero-extend it by setting all the other mask elements to
25226 // zero.
25227 SmallVector<SDValue> MaskElts(
25228 AmtVT.getVectorNumElements(),
25229 DAG.getConstant(0, dl, AmtVT.getScalarType()));
25230 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
25231 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
25232 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
25233 {ShAmt.getOperand(1), Mask}))) {
25234 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
25235 IsMasked = true;
25236 }
25237 }
25238 }
25239
25240 // Extract if the shift amount vector is larger than 128-bits.
25241 if (AmtVT.getSizeInBits() > 128) {
25242 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
25243 AmtVT = ShAmt.getSimpleValueType();
25244 }
25245
25246 // Zero-extend bottom element to v2i64 vector type, either by extension or
25247 // shuffle masking.
25248 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
25249 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
25250 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
25251 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25252 } else if (Subtarget.hasSSE41()) {
25253 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25254 MVT::v2i64, ShAmt);
25255 } else {
25256 SDValue ByteShift = DAG.getTargetConstant(
25257 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25258 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25259 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25260 ByteShift);
25261 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25262 ByteShift);
25263 }
25264 }
25265
25266 // Change opcode to non-immediate version.
25267 Opc = getTargetVShiftUniformOpcode(Opc, true);
25268
25269 // The return type has to be a 128-bit type with the same element
25270 // type as the input type.
25271 MVT EltVT = VT.getVectorElementType();
25272 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25273
25274 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25275 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25276}
25277
25278/// Return Mask with the necessary casting or extending
25279/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25280static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25281 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25282 const SDLoc &dl) {
25283
25284 if (isAllOnesConstant(Mask))
25285 return DAG.getConstant(1, dl, MaskVT);
25286 if (X86::isZeroNode(Mask))
25287 return DAG.getConstant(0, dl, MaskVT);
25288
25289 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25290
25291 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25292 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25293 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25294    // In 32-bit mode, a bitcast of i64 is illegal, so split it into two halves.
25295 SDValue Lo, Hi;
25296 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
25297 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25298 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25299 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25300 } else {
25301 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25302 Mask.getSimpleValueType().getSizeInBits());
25303    // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
25304    // are extracted by EXTRACT_SUBVECTOR.
25305 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25306 DAG.getBitcast(BitcastVT, Mask),
25307 DAG.getIntPtrConstant(0, dl));
25308 }
25309}
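// Editor's note: a small illustration (not part of this file) of the 32-bit
// split path above: a 64-bit mask value is cut into two 32-bit halves that
// become the low and high v32i1 pieces of the final v64i1 mask. The helper
// name is hypothetical.
//
//   #include <cstdint>
//   #include <utility>
//   std::pair<uint32_t, uint32_t> splitMask64(uint64_t Mask) {
//     uint32_t Lo = static_cast<uint32_t>(Mask);       // bits 0..31  -> low v32i1
//     uint32_t Hi = static_cast<uint32_t>(Mask >> 32); // bits 32..63 -> high v32i1
//     return {Lo, Hi};
//   }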
25310
25311/// Return (and \p Op, \p Mask) for compare instructions or
25312/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25313/// necessary casting or extending for \p Mask when lowering masking intrinsics
25314 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25315                                     SDValue PreservedSrc,
25316 const X86Subtarget &Subtarget,
25317 SelectionDAG &DAG) {
25318 MVT VT = Op.getSimpleValueType();
25319 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25320 unsigned OpcodeSelect = ISD::VSELECT;
25321 SDLoc dl(Op);
25322
25323 if (isAllOnesConstant(Mask))
25324 return Op;
25325
25326 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25327
25328 if (PreservedSrc.isUndef())
25329 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25330 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25331}
25332
25333/// Creates an SDNode for a predicated scalar operation.
25334/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25335/// The mask is coming as MVT::i8 and it should be transformed
25336/// to MVT::v1i1 while lowering masking intrinsics.
25337/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25338/// "X86select" instead of "vselect". We just can't create the "vselect" node
25339/// for a scalar instruction.
25340 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25341                                     SDValue PreservedSrc,
25342 const X86Subtarget &Subtarget,
25343 SelectionDAG &DAG) {
25344
25345 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25346 if (MaskConst->getZExtValue() & 0x1)
25347 return Op;
25348
25349 MVT VT = Op.getSimpleValueType();
25350 SDLoc dl(Op);
25351
25352  assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
25353 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25354 DAG.getBitcast(MVT::v8i1, Mask),
25355 DAG.getIntPtrConstant(0, dl));
25356 if (Op.getOpcode() == X86ISD::FSETCCM ||
25357 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25358 Op.getOpcode() == X86ISD::VFPCLASSS)
25359 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25360
25361 if (PreservedSrc.isUndef())
25362 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25363 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25364}
25365
25366 static int getSEHRegistrationNodeSize(const Function *Fn) {
25367  if (!Fn->hasPersonalityFn())
25368    report_fatal_error(
25369        "querying registration node size for function without personality");
25370 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25371 // WinEHStatePass for the full struct definition.
25372 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25373 case EHPersonality::MSVC_X86SEH: return 24;
25374 case EHPersonality::MSVC_CXX: return 16;
25375 default: break;
25376 }
25377  report_fatal_error(
25378      "can only recover FP for 32-bit MSVC EH personality functions");
25379}
25380
25381/// When the MSVC runtime transfers control to us, either to an outlined
25382/// function or when returning to a parent frame after catching an exception, we
25383/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25384/// Here's the math:
25385/// RegNodeBase = EntryEBP - RegNodeSize
25386/// ParentFP = RegNodeBase - ParentFrameOffset
25387/// Subtracting RegNodeSize takes us to the offset of the registration node, and
25388/// subtracting the offset (negative on x86) takes us back to the parent FP.
25389 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25390                                    SDValue EntryEBP) {
25391  MachineFunction &MF = DAG.getMachineFunction();
25392  SDLoc dl;
25393
25394 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25395 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25396
25397 // It's possible that the parent function no longer has a personality function
25398 // if the exceptional code was optimized away, in which case we just return
25399 // the incoming EBP.
25400 if (!Fn->hasPersonalityFn())
25401 return EntryEBP;
25402
25403 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25404 // registration, or the .set_setframe offset.
25405 MCSymbol *OffsetSym =
25408 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25409 SDValue ParentFrameOffset =
25410 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25411
25412 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25413 // prologue to RBP in the parent function.
25414 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
25415 if (Subtarget.is64Bit())
25416 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25417
25418 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25419 // RegNodeBase = EntryEBP - RegNodeSize
25420 // ParentFP = RegNodeBase - ParentFrameOffset
25421 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25422 DAG.getConstant(RegNodeSize, dl, PtrVT));
25423 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25424}
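// Editor's note: a worked example (illustrative only) of the recovery math in
// the comment above, using the 32-bit MSVC C++ EH registration node size of 16:
// with EntryEBP = 0x1000 and ParentFrameOffset = -64 (negative on x86),
// RegNodeBase = 0x1000 - 16 = 0xFF0 and ParentFP = 0xFF0 - (-64) = 0x1030.
// The helper name is hypothetical.
//
//   #include <cstdint>
//   uint32_t recoverParentFP(uint32_t EntryEBP, int32_t ParentFrameOffset,
//                            uint32_t RegNodeSize) {
//     uint32_t RegNodeBase = EntryEBP - RegNodeSize;
//     return RegNodeBase - ParentFrameOffset; // subtracting a negative offset
//   }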
25425
25426SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25427 SelectionDAG &DAG) const {
25428 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25429 auto isRoundModeCurDirection = [](SDValue Rnd) {
25430 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25431 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25432
25433 return false;
25434 };
25435 auto isRoundModeSAE = [](SDValue Rnd) {
25436 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25437      unsigned RC = C->getZExtValue();
25438      if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25439        // Clear the NO_EXC bit and check remaining bits.
25440        RC ^= X86::STATIC_ROUNDING::NO_EXC;
25441 // As a convenience we allow no other bits or explicitly
25442 // current direction.
25443 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25444 }
25445 }
25446
25447 return false;
25448 };
25449 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25450 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25451      RC = C->getZExtValue();
25452      if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25453        // Clear the NO_EXC bit and check remaining bits.
25454        RC ^= X86::STATIC_ROUNDING::NO_EXC;
25455        return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25456               RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25457               RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25458               RC == X86::STATIC_ROUNDING::TO_ZERO;
25459      }
25460 }
25461
25462 return false;
25463 };
25464
25465 SDLoc dl(Op);
25466 unsigned IntNo = Op.getConstantOperandVal(0);
25467 MVT VT = Op.getSimpleValueType();
25468 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25469
25470 // Propagate flags from original node to transformed node(s).
25471 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25472
25473 if (IntrData) {
25474 switch(IntrData->Type) {
25475 case INTR_TYPE_1OP: {
25476 // We specify 2 possible opcodes for intrinsics with rounding modes.
25477 // First, we check if the intrinsic may have non-default rounding mode,
25478 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25479 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25480 if (IntrWithRoundingModeOpcode != 0) {
25481 SDValue Rnd = Op.getOperand(2);
25482 unsigned RC = 0;
25483 if (isRoundModeSAEToX(Rnd, RC))
25484 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25485 Op.getOperand(1),
25486 DAG.getTargetConstant(RC, dl, MVT::i32));
25487 if (!isRoundModeCurDirection(Rnd))
25488 return SDValue();
25489 }
25490 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25491 Op.getOperand(1));
25492 }
25493 case INTR_TYPE_1OP_SAE: {
25494 SDValue Sae = Op.getOperand(2);
25495
25496 unsigned Opc;
25497 if (isRoundModeCurDirection(Sae))
25498 Opc = IntrData->Opc0;
25499 else if (isRoundModeSAE(Sae))
25500 Opc = IntrData->Opc1;
25501 else
25502 return SDValue();
25503
25504 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25505 }
25506 case INTR_TYPE_2OP: {
25507 SDValue Src2 = Op.getOperand(2);
25508
25509 // We specify 2 possible opcodes for intrinsics with rounding modes.
25510 // First, we check if the intrinsic may have non-default rounding mode,
25511 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25512 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25513 if (IntrWithRoundingModeOpcode != 0) {
25514 SDValue Rnd = Op.getOperand(3);
25515 unsigned RC = 0;
25516 if (isRoundModeSAEToX(Rnd, RC))
25517 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25518 Op.getOperand(1), Src2,
25519 DAG.getTargetConstant(RC, dl, MVT::i32));
25520 if (!isRoundModeCurDirection(Rnd))
25521 return SDValue();
25522 }
25523
25524 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25525 Op.getOperand(1), Src2);
25526 }
25527 case INTR_TYPE_2OP_SAE: {
25528 SDValue Sae = Op.getOperand(3);
25529
25530 unsigned Opc;
25531 if (isRoundModeCurDirection(Sae))
25532 Opc = IntrData->Opc0;
25533 else if (isRoundModeSAE(Sae))
25534 Opc = IntrData->Opc1;
25535 else
25536 return SDValue();
25537
25538 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25539 Op.getOperand(2));
25540 }
25541 case INTR_TYPE_3OP:
25542 case INTR_TYPE_3OP_IMM8: {
25543 SDValue Src1 = Op.getOperand(1);
25544 SDValue Src2 = Op.getOperand(2);
25545 SDValue Src3 = Op.getOperand(3);
25546
25547 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25548 Src3.getValueType() != MVT::i8) {
25549 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
25550 }
25551
25552 // We specify 2 possible opcodes for intrinsics with rounding modes.
25553 // First, we check if the intrinsic may have non-default rounding mode,
25554 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25555 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25556 if (IntrWithRoundingModeOpcode != 0) {
25557 SDValue Rnd = Op.getOperand(4);
25558 unsigned RC = 0;
25559 if (isRoundModeSAEToX(Rnd, RC))
25560 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25561 Src1, Src2, Src3,
25562 DAG.getTargetConstant(RC, dl, MVT::i32));
25563 if (!isRoundModeCurDirection(Rnd))
25564 return SDValue();
25565 }
25566
25567 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25568 {Src1, Src2, Src3});
25569 }
25570 case INTR_TYPE_4OP_IMM8: {
25571 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25572 SDValue Src4 = Op.getOperand(4);
25573 if (Src4.getValueType() != MVT::i8) {
25574 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
25575 }
25576
25577 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25578 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25579 Src4);
25580 }
25581 case INTR_TYPE_1OP_MASK: {
25582 SDValue Src = Op.getOperand(1);
25583 SDValue PassThru = Op.getOperand(2);
25584 SDValue Mask = Op.getOperand(3);
25585 // We add rounding mode to the Node when
25586 // - RC Opcode is specified and
25587 // - RC is not "current direction".
25588 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25589 if (IntrWithRoundingModeOpcode != 0) {
25590 SDValue Rnd = Op.getOperand(4);
25591 unsigned RC = 0;
25592 if (isRoundModeSAEToX(Rnd, RC))
25593 return getVectorMaskingNode(
25594 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25595 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25596 Mask, PassThru, Subtarget, DAG);
25597 if (!isRoundModeCurDirection(Rnd))
25598 return SDValue();
25599 }
25600 return getVectorMaskingNode(
25601 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25602 Subtarget, DAG);
25603 }
25604    case INTR_TYPE_1OP_MASK_SAE: {
25605      SDValue Src = Op.getOperand(1);
25606 SDValue PassThru = Op.getOperand(2);
25607 SDValue Mask = Op.getOperand(3);
25608 SDValue Rnd = Op.getOperand(4);
25609
25610 unsigned Opc;
25611 if (isRoundModeCurDirection(Rnd))
25612 Opc = IntrData->Opc0;
25613 else if (isRoundModeSAE(Rnd))
25614 Opc = IntrData->Opc1;
25615 else
25616 return SDValue();
25617
25618 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25619 Subtarget, DAG);
25620 }
25621 case INTR_TYPE_SCALAR_MASK: {
25622 SDValue Src1 = Op.getOperand(1);
25623 SDValue Src2 = Op.getOperand(2);
25624 SDValue passThru = Op.getOperand(3);
25625 SDValue Mask = Op.getOperand(4);
25626 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25627 // There are 2 kinds of intrinsics in this group:
25628      // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
25629 // (2) With rounding mode and sae - 7 operands.
25630 bool HasRounding = IntrWithRoundingModeOpcode != 0;
25631 if (Op.getNumOperands() == (5U + HasRounding)) {
25632 if (HasRounding) {
25633 SDValue Rnd = Op.getOperand(5);
25634 unsigned RC = 0;
25635 if (isRoundModeSAEToX(Rnd, RC))
25636 return getScalarMaskingNode(
25637 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25638 DAG.getTargetConstant(RC, dl, MVT::i32)),
25639 Mask, passThru, Subtarget, DAG);
25640 if (!isRoundModeCurDirection(Rnd))
25641 return SDValue();
25642 }
25643 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25644 Src2),
25645 Mask, passThru, Subtarget, DAG);
25646 }
25647
25648 assert(Op.getNumOperands() == (6U + HasRounding) &&
25649 "Unexpected intrinsic form");
25650 SDValue RoundingMode = Op.getOperand(5);
25651 unsigned Opc = IntrData->Opc0;
25652 if (HasRounding) {
25653 SDValue Sae = Op.getOperand(6);
25654 if (isRoundModeSAE(Sae))
25655 Opc = IntrWithRoundingModeOpcode;
25656 else if (!isRoundModeCurDirection(Sae))
25657 return SDValue();
25658 }
25659 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25660 Src2, RoundingMode),
25661 Mask, passThru, Subtarget, DAG);
25662 }
25663    case INTR_TYPE_SCALAR_MASK_RND: {
25664      SDValue Src1 = Op.getOperand(1);
25665 SDValue Src2 = Op.getOperand(2);
25666 SDValue passThru = Op.getOperand(3);
25667 SDValue Mask = Op.getOperand(4);
25668 SDValue Rnd = Op.getOperand(5);
25669
25670 SDValue NewOp;
25671 unsigned RC = 0;
25672 if (isRoundModeCurDirection(Rnd))
25673 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25674 else if (isRoundModeSAEToX(Rnd, RC))
25675 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25676 DAG.getTargetConstant(RC, dl, MVT::i32));
25677 else
25678 return SDValue();
25679
25680 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25681 }
25682    case INTR_TYPE_SCALAR_MASK_SAE: {
25683      SDValue Src1 = Op.getOperand(1);
25684 SDValue Src2 = Op.getOperand(2);
25685 SDValue passThru = Op.getOperand(3);
25686 SDValue Mask = Op.getOperand(4);
25687 SDValue Sae = Op.getOperand(5);
25688 unsigned Opc;
25689 if (isRoundModeCurDirection(Sae))
25690 Opc = IntrData->Opc0;
25691 else if (isRoundModeSAE(Sae))
25692 Opc = IntrData->Opc1;
25693 else
25694 return SDValue();
25695
25696 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25697 Mask, passThru, Subtarget, DAG);
25698 }
25699 case INTR_TYPE_2OP_MASK: {
25700 SDValue Src1 = Op.getOperand(1);
25701 SDValue Src2 = Op.getOperand(2);
25702 SDValue PassThru = Op.getOperand(3);
25703 SDValue Mask = Op.getOperand(4);
25704 SDValue NewOp;
25705 if (IntrData->Opc1 != 0) {
25706 SDValue Rnd = Op.getOperand(5);
25707 unsigned RC = 0;
25708 if (isRoundModeSAEToX(Rnd, RC))
25709 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25710 DAG.getTargetConstant(RC, dl, MVT::i32));
25711 else if (!isRoundModeCurDirection(Rnd))
25712 return SDValue();
25713 }
25714 if (!NewOp)
25715 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25716 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25717 }
25718    case INTR_TYPE_2OP_MASK_SAE: {
25719      SDValue Src1 = Op.getOperand(1);
25720 SDValue Src2 = Op.getOperand(2);
25721 SDValue PassThru = Op.getOperand(3);
25722 SDValue Mask = Op.getOperand(4);
25723
25724 unsigned Opc = IntrData->Opc0;
25725 if (IntrData->Opc1 != 0) {
25726 SDValue Sae = Op.getOperand(5);
25727 if (isRoundModeSAE(Sae))
25728 Opc = IntrData->Opc1;
25729 else if (!isRoundModeCurDirection(Sae))
25730 return SDValue();
25731 }
25732
25733 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25734 Mask, PassThru, Subtarget, DAG);
25735 }
25736    case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25737      SDValue Src1 = Op.getOperand(1);
25738 SDValue Src2 = Op.getOperand(2);
25739 SDValue Src3 = Op.getOperand(3);
25740 SDValue PassThru = Op.getOperand(4);
25741 SDValue Mask = Op.getOperand(5);
25742 SDValue Sae = Op.getOperand(6);
25743 unsigned Opc;
25744 if (isRoundModeCurDirection(Sae))
25745 Opc = IntrData->Opc0;
25746 else if (isRoundModeSAE(Sae))
25747 Opc = IntrData->Opc1;
25748 else
25749 return SDValue();
25750
25751 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25752 Mask, PassThru, Subtarget, DAG);
25753 }
25754    case INTR_TYPE_3OP_MASK_SAE: {
25755      SDValue Src1 = Op.getOperand(1);
25756 SDValue Src2 = Op.getOperand(2);
25757 SDValue Src3 = Op.getOperand(3);
25758 SDValue PassThru = Op.getOperand(4);
25759 SDValue Mask = Op.getOperand(5);
25760
25761 unsigned Opc = IntrData->Opc0;
25762 if (IntrData->Opc1 != 0) {
25763 SDValue Sae = Op.getOperand(6);
25764 if (isRoundModeSAE(Sae))
25765 Opc = IntrData->Opc1;
25766 else if (!isRoundModeCurDirection(Sae))
25767 return SDValue();
25768 }
25769 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25770 Mask, PassThru, Subtarget, DAG);
25771 }
25772 case BLENDV: {
25773 SDValue Src1 = Op.getOperand(1);
25774 SDValue Src2 = Op.getOperand(2);
25775 SDValue Src3 = Op.getOperand(3);
25776
25777      EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25778      Src3 = DAG.getBitcast(MaskVT, Src3);
25779
25780 // Reverse the operands to match VSELECT order.
25781 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25782 }
25783 case VPERM_2OP : {
25784 SDValue Src1 = Op.getOperand(1);
25785 SDValue Src2 = Op.getOperand(2);
25786
25787 // Swap Src1 and Src2 in the node creation
25788 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
25789 }
25790 case CFMA_OP_MASKZ:
25791 case CFMA_OP_MASK: {
25792 SDValue Src1 = Op.getOperand(1);
25793 SDValue Src2 = Op.getOperand(2);
25794 SDValue Src3 = Op.getOperand(3);
25795 SDValue Mask = Op.getOperand(4);
25796 MVT VT = Op.getSimpleValueType();
25797
25798 SDValue PassThru = Src3;
25799 if (IntrData->Type == CFMA_OP_MASKZ)
25800 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
25801
25802 // We add rounding mode to the Node when
25803 // - RC Opcode is specified and
25804 // - RC is not "current direction".
25805 SDValue NewOp;
25806 if (IntrData->Opc1 != 0) {
25807 SDValue Rnd = Op.getOperand(5);
25808 unsigned RC = 0;
25809 if (isRoundModeSAEToX(Rnd, RC))
25810 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
25811 DAG.getTargetConstant(RC, dl, MVT::i32));
25812 else if (!isRoundModeCurDirection(Rnd))
25813 return SDValue();
25814 }
25815 if (!NewOp)
25816 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
25817 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25818 }
25819 case IFMA_OP:
25820 // NOTE: We need to swizzle the operands to pass the multiply operands
25821 // first.
25822 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25823 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25824 case FPCLASSS: {
25825 SDValue Src1 = Op.getOperand(1);
25826 SDValue Imm = Op.getOperand(2);
25827 SDValue Mask = Op.getOperand(3);
25828 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25829 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
25830 Subtarget, DAG);
25831 // Need to fill with zeros to ensure the bitcast will produce zeroes
25832 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25833 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25834 DAG.getConstant(0, dl, MVT::v8i1),
25835 FPclassMask, DAG.getIntPtrConstant(0, dl));
25836 return DAG.getBitcast(MVT::i8, Ins);
25837 }
25838
25839 case CMP_MASK_CC: {
25840 MVT MaskVT = Op.getSimpleValueType();
25841 SDValue CC = Op.getOperand(3);
25842 SDValue Mask = Op.getOperand(4);
25843 // We specify 2 possible opcodes for intrinsics with rounding modes.
25844 // First, we check if the intrinsic may have non-default rounding mode,
25845 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25846 if (IntrData->Opc1 != 0) {
25847 SDValue Sae = Op.getOperand(5);
25848 if (isRoundModeSAE(Sae))
25849 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25850 Op.getOperand(2), CC, Mask, Sae);
25851 if (!isRoundModeCurDirection(Sae))
25852 return SDValue();
25853 }
25854 //default rounding mode
25855 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25856 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25857 }
25858 case CMP_MASK_SCALAR_CC: {
25859 SDValue Src1 = Op.getOperand(1);
25860 SDValue Src2 = Op.getOperand(2);
25861 SDValue CC = Op.getOperand(3);
25862 SDValue Mask = Op.getOperand(4);
25863
25864 SDValue Cmp;
25865 if (IntrData->Opc1 != 0) {
25866 SDValue Sae = Op.getOperand(5);
25867 if (isRoundModeSAE(Sae))
25868 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25869 else if (!isRoundModeCurDirection(Sae))
25870 return SDValue();
25871 }
25872 //default rounding mode
25873 if (!Cmp.getNode())
25874 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25875
25876 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25877 Subtarget, DAG);
25878 // Need to fill with zeros to ensure the bitcast will produce zeroes
25879 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25880 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25881 DAG.getConstant(0, dl, MVT::v8i1),
25882 CmpMask, DAG.getIntPtrConstant(0, dl));
25883 return DAG.getBitcast(MVT::i8, Ins);
25884 }
25885 case COMI: { // Comparison intrinsics
25886 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25887 SDValue LHS = Op.getOperand(1);
25888 SDValue RHS = Op.getOperand(2);
25889 // Some conditions require the operands to be swapped.
25890 if (CC == ISD::SETLT || CC == ISD::SETLE)
25891 std::swap(LHS, RHS);
25892
25893 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25894 SDValue SetCC;
25895 switch (CC) {
25896 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
25897 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25898 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25899 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25900 break;
25901 }
25902 case ISD::SETNE: { // (ZF = 1 or PF = 1)
25903 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25904 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25905 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25906 break;
25907 }
25908 case ISD::SETGT: // (CF = 0 and ZF = 0)
25909 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25910 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25911 break;
25912 }
25913 case ISD::SETGE: // CF = 0
25914 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25915 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25916 break;
25917 default:
25918 llvm_unreachable("Unexpected illegal condition!");
25919 }
25920 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25921 }
25922 case COMI_RM: { // Comparison intrinsics with Sae
25923 SDValue LHS = Op.getOperand(1);
25924 SDValue RHS = Op.getOperand(2);
25925 unsigned CondVal = Op.getConstantOperandVal(3);
25926 SDValue Sae = Op.getOperand(4);
25927
25928 SDValue FCmp;
25929 if (isRoundModeCurDirection(Sae))
25930 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
25931 DAG.getTargetConstant(CondVal, dl, MVT::i8));
25932 else if (isRoundModeSAE(Sae))
25933 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
25934 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
25935 else
25936 return SDValue();
25937 // Need to fill with zeros to ensure the bitcast will produce zeroes
25938 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25939 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25940 DAG.getConstant(0, dl, MVT::v16i1),
25941 FCmp, DAG.getIntPtrConstant(0, dl));
25942 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
25943 DAG.getBitcast(MVT::i16, Ins));
25944 }
25945 case VSHIFT: {
25946 SDValue SrcOp = Op.getOperand(1);
25947 SDValue ShAmt = Op.getOperand(2);
25948 assert(ShAmt.getValueType() == MVT::i32 &&
25949 "Unexpected VSHIFT amount type");
25950
25951 // Catch shift-by-constant.
25952 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
25953 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
25954 Op.getSimpleValueType(), SrcOp,
25955 CShAmt->getZExtValue(), DAG);
25956
25957 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25958 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
25959 SrcOp, ShAmt, 0, Subtarget, DAG);
25960 }
25961  case COMPRESS_EXPAND_IN_REG: {
25962    SDValue Mask = Op.getOperand(3);
25963 SDValue DataToCompress = Op.getOperand(1);
25964 SDValue PassThru = Op.getOperand(2);
25965 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
25966 return Op.getOperand(1);
25967
25968 // Avoid false dependency.
25969 if (PassThru.isUndef())
25970 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
25971
25972 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
25973 Mask);
25974 }
25975 case FIXUPIMM:
25976 case FIXUPIMM_MASKZ: {
25977 SDValue Src1 = Op.getOperand(1);
25978 SDValue Src2 = Op.getOperand(2);
25979 SDValue Src3 = Op.getOperand(3);
25980 SDValue Imm = Op.getOperand(4);
25981 SDValue Mask = Op.getOperand(5);
25982 SDValue Passthru = (IntrData->Type == FIXUPIMM)
25983 ? Src1
25984 : getZeroVector(VT, Subtarget, DAG, dl);
25985
25986 unsigned Opc = IntrData->Opc0;
25987 if (IntrData->Opc1 != 0) {
25988 SDValue Sae = Op.getOperand(6);
25989 if (isRoundModeSAE(Sae))
25990 Opc = IntrData->Opc1;
25991 else if (!isRoundModeCurDirection(Sae))
25992 return SDValue();
25993 }
25994
25995 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
25996
25997 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
25998 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25999
26000 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26001 }
26002 case ROUNDP: {
26003 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26004 // Clear the upper bits of the rounding immediate so that the legacy
26005 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26006 uint64_t Round = Op.getConstantOperandVal(2);
26007 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26008 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26009 Op.getOperand(1), RoundingMode);
26010 }
26011 case ROUNDS: {
26012 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26013 // Clear the upper bits of the rounding immediate so that the legacy
26014 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26015 uint64_t Round = Op.getConstantOperandVal(3);
26016 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26017 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26018 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26019 }
26020 case BEXTRI: {
26021 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26022
26023 uint64_t Imm = Op.getConstantOperandVal(2);
26024 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26025 Op.getValueType());
26026 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26027 Op.getOperand(1), Control);
26028 }
26029 // ADC/SBB
26030 case ADX: {
26031 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26032 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26033
26034 SDValue Res;
26035 // If the carry in is zero, then we should just use ADD/SUB instead of
26036 // ADC/SBB.
26037 if (isNullConstant(Op.getOperand(1))) {
26038 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26039 Op.getOperand(3));
26040 } else {
26041 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26042 DAG.getConstant(-1, dl, MVT::i8));
26043 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26044 Op.getOperand(3), GenCF.getValue(1));
26045 }
26046 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26047 SDValue Results[] = { SetCC, Res };
26048 return DAG.getMergeValues(Results, dl);
26049 }
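  // Editor's note (illustrative, not part of the upstream source): for an
  // add-with-carry intrinsic such as _addcarry_u32(c, a, b, &out), a known
  // zero carry-in collapses to a plain ADD; otherwise the carry-in byte is
  // converted back into CF by adding it to -1 (any non-zero byte plus 0xFF
  // carries), and the ADC/SBB then consumes that flag. The outgoing carry is
  // read back with SETB and returned alongside the arithmetic result.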
26050 case CVTPD2PS_MASK:
26051 case CVTPD2DQ_MASK:
26052 case CVTQQ2PS_MASK:
26053 case TRUNCATE_TO_REG: {
26054 SDValue Src = Op.getOperand(1);
26055 SDValue PassThru = Op.getOperand(2);
26056 SDValue Mask = Op.getOperand(3);
26057
26058 if (isAllOnesConstant(Mask))
26059 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26060
26061 MVT SrcVT = Src.getSimpleValueType();
26062 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26063 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26064 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26065 {Src, PassThru, Mask});
26066 }
26067 case CVTPS2PH_MASK: {
26068 SDValue Src = Op.getOperand(1);
26069 SDValue Rnd = Op.getOperand(2);
26070 SDValue PassThru = Op.getOperand(3);
26071 SDValue Mask = Op.getOperand(4);
26072
26073 unsigned RC = 0;
26074 unsigned Opc = IntrData->Opc0;
26075 bool SAE = Src.getValueType().is512BitVector() &&
26076 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
26077 if (SAE) {
26079 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
26080 }
26081
26082 if (isAllOnesConstant(Mask))
26083 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
26084
26085 if (SAE)
26087 else
26088 Opc = IntrData->Opc1;
26089 MVT SrcVT = Src.getSimpleValueType();
26090 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26091 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26092 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
26093 }
26094 case CVTNEPS2BF16_MASK: {
26095 SDValue Src = Op.getOperand(1);
26096 SDValue PassThru = Op.getOperand(2);
26097 SDValue Mask = Op.getOperand(3);
26098
26099 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26100 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26101
26102 // Break false dependency.
26103 if (PassThru.isUndef())
26104 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
26105
26106 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26107 Mask);
26108 }
26109 default:
26110 break;
26111 }
26112 }
26113
26114 switch (IntNo) {
26115 default: return SDValue(); // Don't custom lower most intrinsics.
26116
26117  // ptest and testp intrinsics. The intrinsics these come from are designed to
26118  // return an integer value, not just an instruction, so lower them to the ptest
26119  // or testp pattern and a setcc for the result.
26120 case Intrinsic::x86_avx512_ktestc_b:
26121 case Intrinsic::x86_avx512_ktestc_w:
26122 case Intrinsic::x86_avx512_ktestc_d:
26123 case Intrinsic::x86_avx512_ktestc_q:
26124 case Intrinsic::x86_avx512_ktestz_b:
26125 case Intrinsic::x86_avx512_ktestz_w:
26126 case Intrinsic::x86_avx512_ktestz_d:
26127 case Intrinsic::x86_avx512_ktestz_q:
26128 case Intrinsic::x86_sse41_ptestz:
26129 case Intrinsic::x86_sse41_ptestc:
26130 case Intrinsic::x86_sse41_ptestnzc:
26131 case Intrinsic::x86_avx_ptestz_256:
26132 case Intrinsic::x86_avx_ptestc_256:
26133 case Intrinsic::x86_avx_ptestnzc_256:
26134 case Intrinsic::x86_avx_vtestz_ps:
26135 case Intrinsic::x86_avx_vtestc_ps:
26136 case Intrinsic::x86_avx_vtestnzc_ps:
26137 case Intrinsic::x86_avx_vtestz_pd:
26138 case Intrinsic::x86_avx_vtestc_pd:
26139 case Intrinsic::x86_avx_vtestnzc_pd:
26140 case Intrinsic::x86_avx_vtestz_ps_256:
26141 case Intrinsic::x86_avx_vtestc_ps_256:
26142 case Intrinsic::x86_avx_vtestnzc_ps_256:
26143 case Intrinsic::x86_avx_vtestz_pd_256:
26144 case Intrinsic::x86_avx_vtestc_pd_256:
26145 case Intrinsic::x86_avx_vtestnzc_pd_256: {
26146 unsigned TestOpc = X86ISD::PTEST;
26147 X86::CondCode X86CC;
26148 switch (IntNo) {
26149 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
26150 case Intrinsic::x86_avx512_ktestc_b:
26151 case Intrinsic::x86_avx512_ktestc_w:
26152 case Intrinsic::x86_avx512_ktestc_d:
26153 case Intrinsic::x86_avx512_ktestc_q:
26154 // CF = 1
26155 TestOpc = X86ISD::KTEST;
26156 X86CC = X86::COND_B;
26157 break;
26158 case Intrinsic::x86_avx512_ktestz_b:
26159 case Intrinsic::x86_avx512_ktestz_w:
26160 case Intrinsic::x86_avx512_ktestz_d:
26161 case Intrinsic::x86_avx512_ktestz_q:
26162 TestOpc = X86ISD::KTEST;
26163 X86CC = X86::COND_E;
26164 break;
26165 case Intrinsic::x86_avx_vtestz_ps:
26166 case Intrinsic::x86_avx_vtestz_pd:
26167 case Intrinsic::x86_avx_vtestz_ps_256:
26168 case Intrinsic::x86_avx_vtestz_pd_256:
26169 TestOpc = X86ISD::TESTP;
26170 [[fallthrough]];
26171 case Intrinsic::x86_sse41_ptestz:
26172 case Intrinsic::x86_avx_ptestz_256:
26173 // ZF = 1
26174 X86CC = X86::COND_E;
26175 break;
26176 case Intrinsic::x86_avx_vtestc_ps:
26177 case Intrinsic::x86_avx_vtestc_pd:
26178 case Intrinsic::x86_avx_vtestc_ps_256:
26179 case Intrinsic::x86_avx_vtestc_pd_256:
26180 TestOpc = X86ISD::TESTP;
26181 [[fallthrough]];
26182 case Intrinsic::x86_sse41_ptestc:
26183 case Intrinsic::x86_avx_ptestc_256:
26184 // CF = 1
26185 X86CC = X86::COND_B;
26186 break;
26187 case Intrinsic::x86_avx_vtestnzc_ps:
26188 case Intrinsic::x86_avx_vtestnzc_pd:
26189 case Intrinsic::x86_avx_vtestnzc_ps_256:
26190 case Intrinsic::x86_avx_vtestnzc_pd_256:
26191 TestOpc = X86ISD::TESTP;
26192 [[fallthrough]];
26193 case Intrinsic::x86_sse41_ptestnzc:
26194 case Intrinsic::x86_avx_ptestnzc_256:
26195 // ZF and CF = 0
26196 X86CC = X86::COND_A;
26197 break;
26198 }
26199
26200 SDValue LHS = Op.getOperand(1);
26201 SDValue RHS = Op.getOperand(2);
26202 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
26203 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
26204 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26205 }
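  // Editor's note (illustrative, not part of the upstream source): e.g.
  // _mm_testz_si128(a, b) becomes PTEST a, b followed by SETE and a zero
  // extend to i32. The vtest* forms use TESTP (a sign-bit test on FP lanes)
  // and the AVX-512 ktest* forms use KTEST on mask registers; only the flag
  // being read differs (ZF, CF, or "both ZF and CF clear").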
26206
26207 case Intrinsic::x86_sse42_pcmpistria128:
26208 case Intrinsic::x86_sse42_pcmpestria128:
26209 case Intrinsic::x86_sse42_pcmpistric128:
26210 case Intrinsic::x86_sse42_pcmpestric128:
26211 case Intrinsic::x86_sse42_pcmpistrio128:
26212 case Intrinsic::x86_sse42_pcmpestrio128:
26213 case Intrinsic::x86_sse42_pcmpistris128:
26214 case Intrinsic::x86_sse42_pcmpestris128:
26215 case Intrinsic::x86_sse42_pcmpistriz128:
26216 case Intrinsic::x86_sse42_pcmpestriz128: {
26217 unsigned Opcode;
26218 X86::CondCode X86CC;
26219 switch (IntNo) {
26220 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26221 case Intrinsic::x86_sse42_pcmpistria128:
26222 Opcode = X86ISD::PCMPISTR;
26223 X86CC = X86::COND_A;
26224 break;
26225 case Intrinsic::x86_sse42_pcmpestria128:
26226 Opcode = X86ISD::PCMPESTR;
26227 X86CC = X86::COND_A;
26228 break;
26229 case Intrinsic::x86_sse42_pcmpistric128:
26230 Opcode = X86ISD::PCMPISTR;
26231 X86CC = X86::COND_B;
26232 break;
26233 case Intrinsic::x86_sse42_pcmpestric128:
26234 Opcode = X86ISD::PCMPESTR;
26235 X86CC = X86::COND_B;
26236 break;
26237 case Intrinsic::x86_sse42_pcmpistrio128:
26238 Opcode = X86ISD::PCMPISTR;
26239 X86CC = X86::COND_O;
26240 break;
26241 case Intrinsic::x86_sse42_pcmpestrio128:
26242 Opcode = X86ISD::PCMPESTR;
26243 X86CC = X86::COND_O;
26244 break;
26245 case Intrinsic::x86_sse42_pcmpistris128:
26246 Opcode = X86ISD::PCMPISTR;
26247 X86CC = X86::COND_S;
26248 break;
26249 case Intrinsic::x86_sse42_pcmpestris128:
26250 Opcode = X86ISD::PCMPESTR;
26251 X86CC = X86::COND_S;
26252 break;
26253 case Intrinsic::x86_sse42_pcmpistriz128:
26254 Opcode = X86ISD::PCMPISTR;
26255 X86CC = X86::COND_E;
26256 break;
26257 case Intrinsic::x86_sse42_pcmpestriz128:
26258 Opcode = X86ISD::PCMPESTR;
26259 X86CC = X86::COND_E;
26260 break;
26261 }
26263 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26264 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
26265 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
26266 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26267 }
26268
26269 case Intrinsic::x86_sse42_pcmpistri128:
26270 case Intrinsic::x86_sse42_pcmpestri128: {
26271 unsigned Opcode;
26272 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26273 Opcode = X86ISD::PCMPISTR;
26274 else
26275 Opcode = X86ISD::PCMPESTR;
26276
26278 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26279 return DAG.getNode(Opcode, dl, VTs, NewOps);
26280 }
26281
26282 case Intrinsic::x86_sse42_pcmpistrm128:
26283 case Intrinsic::x86_sse42_pcmpestrm128: {
26284 unsigned Opcode;
26285 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26286 Opcode = X86ISD::PCMPISTR;
26287 else
26288 Opcode = X86ISD::PCMPESTR;
26289
26291 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26292 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26293 }
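  // Editor's note (not part of the upstream source): all of the PCMPxSTRx
  // lowerings above share one node shape with three results,
  // (i32 index, v16i8 mask, i32 EFLAGS): the *i intrinsics return result 0,
  // the *m intrinsics return result 1, and the flag-query intrinsics apply a
  // setcc to result 2.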
26294
26295 case Intrinsic::eh_sjlj_lsda: {
26297 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26298 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26299 auto &Context = MF.getMMI().getContext();
26300 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26301 Twine(MF.getFunctionNumber()));
26302 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
26303 DAG.getMCSymbol(S, PtrVT));
26304 }
26305
26306 case Intrinsic::x86_seh_lsda: {
26307 // Compute the symbol for the LSDA. We know it'll get emitted later.
26309 SDValue Op1 = Op.getOperand(1);
26310 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26313
26314 // Generate a simple absolute symbol reference. This intrinsic is only
26315 // supported on 32-bit Windows, which isn't PIC.
26316 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26317 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26318 }
26319
26320 case Intrinsic::eh_recoverfp: {
26321 SDValue FnOp = Op.getOperand(1);
26322 SDValue IncomingFPOp = Op.getOperand(2);
26323 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26324 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26325 if (!Fn)
26327 "llvm.eh.recoverfp must take a function as the first argument");
26328 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26329 }
26330
26331 case Intrinsic::localaddress: {
26332 // Returns one of the stack, base, or frame pointer registers, depending on
26333 // which is used to reference local variables.
26335 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26336 unsigned Reg;
26337 if (RegInfo->hasBasePointer(MF))
26338 Reg = RegInfo->getBaseRegister();
26339 else { // Handles the SP or FP case.
26340 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26341 if (CantUseFP)
26342 Reg = RegInfo->getPtrSizedStackRegister(MF);
26343 else
26344 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26345 }
26346 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26347 }
26348 case Intrinsic::x86_avx512_vp2intersect_q_512:
26349 case Intrinsic::x86_avx512_vp2intersect_q_256:
26350 case Intrinsic::x86_avx512_vp2intersect_q_128:
26351 case Intrinsic::x86_avx512_vp2intersect_d_512:
26352 case Intrinsic::x86_avx512_vp2intersect_d_256:
26353 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26354 MVT MaskVT = Op.getSimpleValueType();
26355
26356 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26357 SDLoc DL(Op);
26358
26361 Op->getOperand(1), Op->getOperand(2));
26362
26363 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26364 MaskVT, Operation);
26365 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26366 MaskVT, Operation);
26367 return DAG.getMergeValues({Result0, Result1}, DL);
26368 }
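  // Editor's note (not part of the upstream source): VP2INTERSECT writes a
  // pair of mask registers, modelled here as a single MVT::Untyped value.
  // The two k-register halves are extracted with the sub_mask_0/sub_mask_1
  // subregister indices and returned as a two-element merge.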
26369 case Intrinsic::x86_mmx_pslli_w:
26370 case Intrinsic::x86_mmx_pslli_d:
26371 case Intrinsic::x86_mmx_pslli_q:
26372 case Intrinsic::x86_mmx_psrli_w:
26373 case Intrinsic::x86_mmx_psrli_d:
26374 case Intrinsic::x86_mmx_psrli_q:
26375 case Intrinsic::x86_mmx_psrai_w:
26376 case Intrinsic::x86_mmx_psrai_d: {
26377 SDLoc DL(Op);
26378 SDValue ShAmt = Op.getOperand(2);
26379 // If the argument is a constant, convert it to a target constant.
26380 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26381      // Clamp out-of-bounds shift amounts since they will otherwise be masked
26382      // to 8 bits, which may make them no longer out of bounds.
26383 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26384 if (ShiftAmount == 0)
26385 return Op.getOperand(1);
26386
26387 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26388 Op.getOperand(0), Op.getOperand(1),
26389 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26390 }
26391
26392 unsigned NewIntrinsic;
26393 switch (IntNo) {
26394 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26395 case Intrinsic::x86_mmx_pslli_w:
26396 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26397 break;
26398 case Intrinsic::x86_mmx_pslli_d:
26399 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26400 break;
26401 case Intrinsic::x86_mmx_pslli_q:
26402 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26403 break;
26404 case Intrinsic::x86_mmx_psrli_w:
26405 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26406 break;
26407 case Intrinsic::x86_mmx_psrli_d:
26408 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26409 break;
26410 case Intrinsic::x86_mmx_psrli_q:
26411 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26412 break;
26413 case Intrinsic::x86_mmx_psrai_w:
26414 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26415 break;
26416 case Intrinsic::x86_mmx_psrai_d:
26417 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26418 break;
26419 }
26420
26421    // The vector shift intrinsics with scalar shift amounts use 32-bit values,
26422    // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
26423    // MMX register.
26424 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26425 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26426 DAG.getTargetConstant(NewIntrinsic, DL,
26428 Op.getOperand(1), ShAmt);
26429 }
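  // Editor's note (illustrative, not part of the upstream source): e.g.
  // _mm_slli_pi16(v, 3) keeps the immediate form, with the clamped count as a
  // target constant, while a variable count is copied into an MMX register
  // via MMX_MOVW2D and the call is rewritten to the corresponding
  // register-count intrinsic (x86_mmx_psll_w and friends) per the table above.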
26430 case Intrinsic::thread_pointer: {
26431 if (Subtarget.isTargetELF()) {
26432 SDLoc dl(Op);
26433 EVT PtrVT = getPointerTy(DAG.getDataLayout());
26434 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
26436 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
26437 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26439 }
26441 "Target OS doesn't support __builtin_thread_pointer() yet.");
26442 }
26443 }
26444}
26445
26447 SDValue Src, SDValue Mask, SDValue Base,
26448 SDValue Index, SDValue ScaleOp, SDValue Chain,
26449 const X86Subtarget &Subtarget) {
26450 SDLoc dl(Op);
26451 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26452 // Scale must be constant.
26453 if (!C)
26454 return SDValue();
26455 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26456 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26457 TLI.getPointerTy(DAG.getDataLayout()));
26458 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26459 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26460 // If source is undef or we know it won't be used, use a zero vector
26461 // to break register dependency.
26462 // TODO: use undef instead and let BreakFalseDeps deal with it?
26463 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26464 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26465
26466 // Cast mask to an integer type.
26467 Mask = DAG.getBitcast(MaskVT, Mask);
26468
26469 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26470
26471 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26472 SDValue Res =
26473 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26474 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26475 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26476}
26477
26479 SDValue Src, SDValue Mask, SDValue Base,
26480 SDValue Index, SDValue ScaleOp, SDValue Chain,
26481 const X86Subtarget &Subtarget) {
26482 MVT VT = Op.getSimpleValueType();
26483 SDLoc dl(Op);
26484 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26485 // Scale must be constant.
26486 if (!C)
26487 return SDValue();
26488 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26489 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26490 TLI.getPointerTy(DAG.getDataLayout()));
26491 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26493 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26494
26495  // We support two versions of the gather intrinsics: one with a scalar mask
26496  // and one with a vXi1 mask. Convert the scalar to vXi1 if necessary.
26497 if (Mask.getValueType() != MaskVT)
26498 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26499
26500 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26501 // If source is undef or we know it won't be used, use a zero vector
26502 // to break register dependency.
26503 // TODO: use undef instead and let BreakFalseDeps deal with it?
26504 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26505 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26506
26507 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26508
26509 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26510 SDValue Res =
26511 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26512 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26513 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26514}
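// Editor's note (not part of the upstream source): both gather helpers above
// replace a source that is undef, or known to be fully overwritten because the
// mask is all ones, with a zero vector so the gather does not inherit a false
// dependency on the previous contents of the destination register; the second
// variant additionally accepts either a scalar bitmask or a vXi1 mask and
// normalizes it through getMaskNode.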
26515
26516static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26517 SDValue Src, SDValue Mask, SDValue Base,
26518 SDValue Index, SDValue ScaleOp, SDValue Chain,
26519 const X86Subtarget &Subtarget) {
26520 SDLoc dl(Op);
26521 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26522 // Scale must be constant.
26523 if (!C)
26524 return SDValue();
26525 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26526 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26527 TLI.getPointerTy(DAG.getDataLayout()));
26528 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26529 Src.getSimpleValueType().getVectorNumElements());
26530 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26531
26532  // We support two versions of the scatter intrinsics: one with a scalar mask
26533  // and one with a vXi1 mask. Convert the scalar to vXi1 if necessary.
26534 if (Mask.getValueType() != MaskVT)
26535 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26536
26537 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26538
26539 SDVTList VTs = DAG.getVTList(MVT::Other);
26540 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26541 SDValue Res =
26542 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26543 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26544 return Res;
26545}
26546
26547static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26549 SDValue ScaleOp, SDValue Chain,
26550 const X86Subtarget &Subtarget) {
26551 SDLoc dl(Op);
26552 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26553 // Scale must be constant.
26554 if (!C)
26555 return SDValue();
26556 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26557 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26558 TLI.getPointerTy(DAG.getDataLayout()));
26559 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26560 SDValue Segment = DAG.getRegister(0, MVT::i32);
26561 MVT MaskVT =
26562 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26563 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26564 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26565 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26566 return SDValue(Res, 0);
26567}
26568
26569/// Handles the lowering of builtin intrinsics with chain that return their
26570/// value into registers EDX:EAX.
26571/// If operand SrcReg is a valid register identifier, then operand 2 of N is
26572/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26573/// TargetOpcode.
26574/// Returns a Glue value which can be used to add extra copy-from-reg if the
26575/// expanded intrinsics implicitly define extra registers (i.e. not just
26576/// EDX:EAX).
26578 SelectionDAG &DAG,
26579 unsigned TargetOpcode,
26580 unsigned SrcReg,
26581 const X86Subtarget &Subtarget,
26583 SDValue Chain = N->getOperand(0);
26584 SDValue Glue;
26585
26586 if (SrcReg) {
26587 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26588 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26589 Glue = Chain.getValue(1);
26590 }
26591
26592 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26593 SDValue N1Ops[] = {Chain, Glue};
26594 SDNode *N1 = DAG.getMachineNode(
26595 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26596 Chain = SDValue(N1, 0);
26597
26598  // Read the instruction's result, which is returned in registers EDX:EAX.
26599 SDValue LO, HI;
26600 if (Subtarget.is64Bit()) {
26601 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26602 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26603 LO.getValue(2));
26604 } else {
26605 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26606 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26607 LO.getValue(2));
26608 }
26609 Chain = HI.getValue(1);
26610 Glue = HI.getValue(2);
26611
26612 if (Subtarget.is64Bit()) {
26613 // Merge the two 32-bit values into a 64-bit one.
26614 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26615 DAG.getConstant(32, DL, MVT::i8));
26616 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26617 Results.push_back(Chain);
26618 return Glue;
26619 }
26620
26621 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26622 SDValue Ops[] = { LO, HI };
26623 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26624 Results.push_back(Pair);
26625 Results.push_back(Chain);
26626 return Glue;
26627}
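// Editor's note (illustrative sketch, not part of the upstream source): for
// XGETBV, for example, the caller passes the machine opcode and SrcReg=ECX;
// the XCR index is copied into ECX, the instruction is emitted with a glued
// chain, and the EDX:EAX (or RDX:RAX) halves are read back and recombined into
// a 64-bit value with SHL+OR on 64-bit targets or BUILD_PAIR on 32-bit targets.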
26628
26629/// Handles the lowering of builtin intrinsics that read the time stamp counter
26630/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26631/// READCYCLECOUNTER nodes.
26632static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26633 SelectionDAG &DAG,
26634 const X86Subtarget &Subtarget,
26636 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26637 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26638 // and the EAX register is loaded with the low-order 32 bits.
26639 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26640 /* NoRegister */0, Subtarget,
26641 Results);
26642 if (Opcode != X86::RDTSCP)
26643 return;
26644
26645 SDValue Chain = Results[1];
26646  // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
26647 // the ECX register. Add 'ecx' explicitly to the chain.
26648 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26649 Results[1] = ecx;
26650 Results.push_back(ecx.getValue(1));
26651}
26652
26654 SelectionDAG &DAG) {
26656 SDLoc DL(Op);
26657 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26658 Results);
26659 return DAG.getMergeValues(Results, DL);
26660}
26661
26664 SDValue Chain = Op.getOperand(0);
26665 SDValue RegNode = Op.getOperand(2);
26666 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26667 if (!EHInfo)
26668 report_fatal_error("EH registrations only live in functions using WinEH");
26669
26670 // Cast the operand to an alloca, and remember the frame index.
26671 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26672 if (!FINode)
26673 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26674 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26675
26676 // Return the chain operand without making any DAG nodes.
26677 return Chain;
26678}
26679
26682 SDValue Chain = Op.getOperand(0);
26683 SDValue EHGuard = Op.getOperand(2);
26684 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26685 if (!EHInfo)
26686 report_fatal_error("EHGuard only live in functions using WinEH");
26687
26688 // Cast the operand to an alloca, and remember the frame index.
26689 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26690 if (!FINode)
26691 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26692 EHInfo->EHGuardFrameIndex = FINode->getIndex();
26693
26694 // Return the chain operand without making any DAG nodes.
26695 return Chain;
26696}
26697
26698/// Emit Truncating Store with signed or unsigned saturation.
26699static SDValue
26700EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
26701 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26702 SelectionDAG &DAG) {
26703 SDVTList VTs = DAG.getVTList(MVT::Other);
26704 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26705 SDValue Ops[] = { Chain, Val, Ptr, Undef };
26706 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26707 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26708}
26709
26710/// Emit Masked Truncating Store with signed or unsigned saturation.
26711static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
26712 const SDLoc &DL,
26713 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26714 MachineMemOperand *MMO, SelectionDAG &DAG) {
26715 SDVTList VTs = DAG.getVTList(MVT::Other);
26716 SDValue Ops[] = { Chain, Val, Ptr, Mask };
26717 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26718 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26719}
26720
26722 const MachineFunction &MF) {
26723 if (!Subtarget.is64Bit())
26724 return false;
26725 // 64-bit targets support extended Swift async frame setup,
26726  // except for targets that use the Windows 64 prologue.
26727 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
26728}
26729
26731 SelectionDAG &DAG) {
26732 unsigned IntNo = Op.getConstantOperandVal(1);
26733 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26734 if (!IntrData) {
26735 switch (IntNo) {
26736
26737 case Intrinsic::swift_async_context_addr: {
26738 SDLoc dl(Op);
26739 auto &MF = DAG.getMachineFunction();
26740 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26741 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
26743 X86FI->setHasSwiftAsyncContext(true);
26744 SDValue Chain = Op->getOperand(0);
26745 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
26746 SDValue Result =
26747 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
26748 DAG.getTargetConstant(8, dl, MVT::i32)),
26749 0);
26750 // Return { result, chain }.
26751 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26752 CopyRBP.getValue(1));
26753 } else {
26754 // No special extended frame, create or reuse an existing stack slot.
26755 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
26756 if (!X86FI->getSwiftAsyncContextFrameIdx())
26757 X86FI->setSwiftAsyncContextFrameIdx(
26758 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
26759 false));
26760 SDValue Result =
26761 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
26762 PtrSize == 8 ? MVT::i64 : MVT::i32);
26763 // Return { result, chain }.
26764 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26765 Op->getOperand(0));
26766 }
26767 }
26768
26769 case llvm::Intrinsic::x86_seh_ehregnode:
26770 return MarkEHRegistrationNode(Op, DAG);
26771 case llvm::Intrinsic::x86_seh_ehguard:
26772 return MarkEHGuard(Op, DAG);
26773 case llvm::Intrinsic::x86_rdpkru: {
26774 SDLoc dl(Op);
26775 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26776 // Create a RDPKRU node and pass 0 to the ECX parameter.
26777 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26778 DAG.getConstant(0, dl, MVT::i32));
26779 }
26780 case llvm::Intrinsic::x86_wrpkru: {
26781 SDLoc dl(Op);
26782 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
26783 // to the EDX and ECX parameters.
26784 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26785 Op.getOperand(0), Op.getOperand(2),
26786 DAG.getConstant(0, dl, MVT::i32),
26787 DAG.getConstant(0, dl, MVT::i32));
26788 }
26789 case llvm::Intrinsic::asan_check_memaccess: {
26790 // Mark this as adjustsStack because it will be lowered to a call.
26792 // Don't do anything here, we will expand these intrinsics out later.
26793 return Op;
26794 }
26795 case llvm::Intrinsic::x86_flags_read_u32:
26796 case llvm::Intrinsic::x86_flags_read_u64:
26797 case llvm::Intrinsic::x86_flags_write_u32:
26798 case llvm::Intrinsic::x86_flags_write_u64: {
26799 // We need a frame pointer because this will get lowered to a PUSH/POP
26800 // sequence.
26803 // Don't do anything here, we will expand these intrinsics out later
26804 // during FinalizeISel in EmitInstrWithCustomInserter.
26805 return Op;
26806 }
26807 case Intrinsic::x86_lwpins32:
26808 case Intrinsic::x86_lwpins64:
26809 case Intrinsic::x86_umwait:
26810 case Intrinsic::x86_tpause: {
26811 SDLoc dl(Op);
26812 SDValue Chain = Op->getOperand(0);
26813 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26814 unsigned Opcode;
26815
26816 switch (IntNo) {
26817 default: llvm_unreachable("Impossible intrinsic");
26818 case Intrinsic::x86_umwait:
26819 Opcode = X86ISD::UMWAIT;
26820 break;
26821 case Intrinsic::x86_tpause:
26822 Opcode = X86ISD::TPAUSE;
26823 break;
26824 case Intrinsic::x86_lwpins32:
26825 case Intrinsic::x86_lwpins64:
26826 Opcode = X86ISD::LWPINS;
26827 break;
26828 }
26829
26831 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26832 Op->getOperand(3), Op->getOperand(4));
26833 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26834 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26835 Operation.getValue(1));
26836 }
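    // Editor's note (not part of the upstream source): UMWAIT, TPAUSE and
    // LWPINS all report their status in CF, which is why the lowering above
    // reads the flag result with SETB and merges it with the chain.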
26837 case Intrinsic::x86_enqcmd:
26838 case Intrinsic::x86_enqcmds: {
26839 SDLoc dl(Op);
26840 SDValue Chain = Op.getOperand(0);
26841 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26842 unsigned Opcode;
26843 switch (IntNo) {
26844 default: llvm_unreachable("Impossible intrinsic!");
26845 case Intrinsic::x86_enqcmd:
26846 Opcode = X86ISD::ENQCMD;
26847 break;
26848 case Intrinsic::x86_enqcmds:
26849 Opcode = X86ISD::ENQCMDS;
26850 break;
26851 }
26852 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26853 Op.getOperand(3));
26854 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26855 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26856 Operation.getValue(1));
26857 }
26858 case Intrinsic::x86_aesenc128kl:
26859 case Intrinsic::x86_aesdec128kl:
26860 case Intrinsic::x86_aesenc256kl:
26861 case Intrinsic::x86_aesdec256kl: {
26862 SDLoc DL(Op);
26863 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26864 SDValue Chain = Op.getOperand(0);
26865 unsigned Opcode;
26866
26867 switch (IntNo) {
26868 default: llvm_unreachable("Impossible intrinsic");
26869 case Intrinsic::x86_aesenc128kl:
26870 Opcode = X86ISD::AESENC128KL;
26871 break;
26872 case Intrinsic::x86_aesdec128kl:
26873 Opcode = X86ISD::AESDEC128KL;
26874 break;
26875 case Intrinsic::x86_aesenc256kl:
26876 Opcode = X86ISD::AESENC256KL;
26877 break;
26878 case Intrinsic::x86_aesdec256kl:
26879 Opcode = X86ISD::AESDEC256KL;
26880 break;
26881 }
26882
26883 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26884 MachineMemOperand *MMO = MemIntr->getMemOperand();
26885 EVT MemVT = MemIntr->getMemoryVT();
26887 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26888 MMO);
26889 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26890
26891 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26892 {ZF, Operation.getValue(0), Operation.getValue(2)});
26893 }
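    // Editor's note (not part of the upstream source): the *KL node above
    // yields {v2i64 data, i32 EFLAGS, chain}; the flags are folded to a ZF
    // test with SETE and the intrinsic's aggregate return is rebuilt as
    // {ZF, data, chain} via MERGE_VALUES.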
26894 case Intrinsic::x86_aesencwide128kl:
26895 case Intrinsic::x86_aesdecwide128kl:
26896 case Intrinsic::x86_aesencwide256kl:
26897 case Intrinsic::x86_aesdecwide256kl: {
26898 SDLoc DL(Op);
26899 SDVTList VTs = DAG.getVTList(
26900 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26901 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26902 SDValue Chain = Op.getOperand(0);
26903 unsigned Opcode;
26904
26905 switch (IntNo) {
26906 default: llvm_unreachable("Impossible intrinsic");
26907 case Intrinsic::x86_aesencwide128kl:
26908 Opcode = X86ISD::AESENCWIDE128KL;
26909 break;
26910 case Intrinsic::x86_aesdecwide128kl:
26911 Opcode = X86ISD::AESDECWIDE128KL;
26912 break;
26913 case Intrinsic::x86_aesencwide256kl:
26914 Opcode = X86ISD::AESENCWIDE256KL;
26915 break;
26916 case Intrinsic::x86_aesdecwide256kl:
26917 Opcode = X86ISD::AESDECWIDE256KL;
26918 break;
26919 }
26920
26921 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26922 MachineMemOperand *MMO = MemIntr->getMemOperand();
26923 EVT MemVT = MemIntr->getMemoryVT();
26925 Opcode, DL, VTs,
26926 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
26927 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
26928 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
26929 MemVT, MMO);
26930 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
26931
26932 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26933 {ZF, Operation.getValue(1), Operation.getValue(2),
26934 Operation.getValue(3), Operation.getValue(4),
26935 Operation.getValue(5), Operation.getValue(6),
26936 Operation.getValue(7), Operation.getValue(8),
26937 Operation.getValue(9)});
26938 }
26939 case Intrinsic::x86_testui: {
26940 SDLoc dl(Op);
26941 SDValue Chain = Op.getOperand(0);
26942 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26943 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
26944 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26945 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26946 Operation.getValue(1));
26947 }
26948 case Intrinsic::x86_atomic_bts_rm:
26949 case Intrinsic::x86_atomic_btc_rm:
26950 case Intrinsic::x86_atomic_btr_rm: {
26951 SDLoc DL(Op);
26952 MVT VT = Op.getSimpleValueType();
26953 SDValue Chain = Op.getOperand(0);
26954 SDValue Op1 = Op.getOperand(2);
26955 SDValue Op2 = Op.getOperand(3);
26956 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
26957 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
26959 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26960 SDValue Res =
26961 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
26962 {Chain, Op1, Op2}, VT, MMO);
26963 Chain = Res.getValue(1);
26964 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
26965 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
26966 }
26967 case Intrinsic::x86_atomic_bts:
26968 case Intrinsic::x86_atomic_btc:
26969 case Intrinsic::x86_atomic_btr: {
26970 SDLoc DL(Op);
26971 MVT VT = Op.getSimpleValueType();
26972 SDValue Chain = Op.getOperand(0);
26973 SDValue Op1 = Op.getOperand(2);
26974 SDValue Op2 = Op.getOperand(3);
26975 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
26976 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
26977 : X86ISD::LBTR;
26978 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
26979 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26980 SDValue Res =
26981 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
26982 {Chain, Op1, Op2, Size}, VT, MMO);
26983 Chain = Res.getValue(1);
26984 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
26985 unsigned Imm = Op2->getAsZExtVal();
26986 if (Imm)
26987 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
26988 DAG.getShiftAmountConstant(Imm, VT, DL));
26989 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
26990 }
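    // Editor's note (not part of the upstream source): the locked bit-test
    // nodes leave the original bit value in CF. SETB extracts it, the result
    // is zero-extended to the operand width, and for a non-zero bit-index
    // immediate the bit is shifted back into its original position so the
    // returned value has the tested bit in place rather than in bit 0.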
26991 case Intrinsic::x86_cmpccxadd32:
26992 case Intrinsic::x86_cmpccxadd64: {
26993 SDLoc DL(Op);
26994 SDValue Chain = Op.getOperand(0);
26995 SDValue Addr = Op.getOperand(2);
26996 SDValue Src1 = Op.getOperand(3);
26997 SDValue Src2 = Op.getOperand(4);
26998 SDValue CC = Op.getOperand(5);
26999 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27001 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
27002 MVT::i32, MMO);
27003 return Operation;
27004 }
27005 case Intrinsic::x86_aadd32:
27006 case Intrinsic::x86_aadd64:
27007 case Intrinsic::x86_aand32:
27008 case Intrinsic::x86_aand64:
27009 case Intrinsic::x86_aor32:
27010 case Intrinsic::x86_aor64:
27011 case Intrinsic::x86_axor32:
27012 case Intrinsic::x86_axor64: {
27013 SDLoc DL(Op);
27014 SDValue Chain = Op.getOperand(0);
27015 SDValue Op1 = Op.getOperand(2);
27016 SDValue Op2 = Op.getOperand(3);
27017 MVT VT = Op2.getSimpleValueType();
27018 unsigned Opc = 0;
27019 switch (IntNo) {
27020 default:
27021 llvm_unreachable("Unknown Intrinsic");
27022 case Intrinsic::x86_aadd32:
27023 case Intrinsic::x86_aadd64:
27024 Opc = X86ISD::AADD;
27025 break;
27026 case Intrinsic::x86_aand32:
27027 case Intrinsic::x86_aand64:
27028 Opc = X86ISD::AAND;
27029 break;
27030 case Intrinsic::x86_aor32:
27031 case Intrinsic::x86_aor64:
27032 Opc = X86ISD::AOR;
27033 break;
27034 case Intrinsic::x86_axor32:
27035 case Intrinsic::x86_axor64:
27036 Opc = X86ISD::AXOR;
27037 break;
27038 }
27039 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
27040 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
27041 {Chain, Op1, Op2}, VT, MMO);
27042 }
27043 case Intrinsic::x86_atomic_add_cc:
27044 case Intrinsic::x86_atomic_sub_cc:
27045 case Intrinsic::x86_atomic_or_cc:
27046 case Intrinsic::x86_atomic_and_cc:
27047 case Intrinsic::x86_atomic_xor_cc: {
27048 SDLoc DL(Op);
27049 SDValue Chain = Op.getOperand(0);
27050 SDValue Op1 = Op.getOperand(2);
27051 SDValue Op2 = Op.getOperand(3);
27052 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
27053 MVT VT = Op2.getSimpleValueType();
27054 unsigned Opc = 0;
27055 switch (IntNo) {
27056 default:
27057 llvm_unreachable("Unknown Intrinsic");
27058 case Intrinsic::x86_atomic_add_cc:
27059 Opc = X86ISD::LADD;
27060 break;
27061 case Intrinsic::x86_atomic_sub_cc:
27062 Opc = X86ISD::LSUB;
27063 break;
27064 case Intrinsic::x86_atomic_or_cc:
27065 Opc = X86ISD::LOR;
27066 break;
27067 case Intrinsic::x86_atomic_and_cc:
27068 Opc = X86ISD::LAND;
27069 break;
27070 case Intrinsic::x86_atomic_xor_cc:
27071 Opc = X86ISD::LXOR;
27072 break;
27073 }
27074 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27075 SDValue LockArith =
27076 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27077 {Chain, Op1, Op2}, VT, MMO);
27078 Chain = LockArith.getValue(1);
27079 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
27080 }
27081 }
27082 return SDValue();
27083 }
27084
27085 SDLoc dl(Op);
27086 switch(IntrData->Type) {
27087 default: llvm_unreachable("Unknown Intrinsic Type");
27088 case RDSEED:
27089 case RDRAND: {
27090 // Emit the node with the right value type.
27091 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27092 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27093
27094 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
27095    // Otherwise return the value from Rand, which is always 0, cast to i32.
27096 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27097 DAG.getConstant(1, dl, Op->getValueType(1)),
27098 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
27099 SDValue(Result.getNode(), 1)};
27100 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27101
27102 // Return { result, isValid, chain }.
27103 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27104 SDValue(Result.getNode(), 2));
27105 }
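  // Editor's note (illustrative, not part of the upstream source): for
  // _rdrand32_step(&r), RDRAND reports success in CF, so the CMOV above
  // selects the constant 1 when CF is set and the random value (always 0 on
  // failure) otherwise; that selected value becomes the intrinsic's status
  // result, returned next to the random data and the chain.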
27106 case GATHER_AVX2: {
27107 SDValue Chain = Op.getOperand(0);
27108 SDValue Src = Op.getOperand(2);
27109 SDValue Base = Op.getOperand(3);
27110 SDValue Index = Op.getOperand(4);
27111 SDValue Mask = Op.getOperand(5);
27112 SDValue Scale = Op.getOperand(6);
27113 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27114 Scale, Chain, Subtarget);
27115 }
27116 case GATHER: {
27117 //gather(v1, mask, index, base, scale);
27118 SDValue Chain = Op.getOperand(0);
27119 SDValue Src = Op.getOperand(2);
27120 SDValue Base = Op.getOperand(3);
27121 SDValue Index = Op.getOperand(4);
27122 SDValue Mask = Op.getOperand(5);
27123 SDValue Scale = Op.getOperand(6);
27124 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
27125 Chain, Subtarget);
27126 }
27127 case SCATTER: {
27128 //scatter(base, mask, index, v1, scale);
27129 SDValue Chain = Op.getOperand(0);
27130 SDValue Base = Op.getOperand(2);
27131 SDValue Mask = Op.getOperand(3);
27132 SDValue Index = Op.getOperand(4);
27133 SDValue Src = Op.getOperand(5);
27134 SDValue Scale = Op.getOperand(6);
27135 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27136 Scale, Chain, Subtarget);
27137 }
27138 case PREFETCH: {
27139 const APInt &HintVal = Op.getConstantOperandAPInt(6);
27140 assert((HintVal == 2 || HintVal == 3) &&
27141 "Wrong prefetch hint in intrinsic: should be 2 or 3");
27142 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27143 SDValue Chain = Op.getOperand(0);
27144 SDValue Mask = Op.getOperand(2);
27145 SDValue Index = Op.getOperand(3);
27146 SDValue Base = Op.getOperand(4);
27147 SDValue Scale = Op.getOperand(5);
27148 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
27149 Subtarget);
27150 }
27151 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
27152 case RDTSC: {
27154 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27155 Results);
27156 return DAG.getMergeValues(Results, dl);
27157 }
27158 // Read Performance Monitoring Counters.
27159 case RDPMC:
27160 // Read Processor Register.
27161 case RDPRU:
27162  // Get Extended Control Register.
27163 case XGETBV: {
27165
27166 // RDPMC uses ECX to select the index of the performance counter to read.
27167 // RDPRU uses ECX to select the processor register to read.
27168 // XGETBV uses ECX to select the index of the XCR register to return.
27169 // The result is stored into registers EDX:EAX.
27170 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27171 Subtarget, Results);
27172 return DAG.getMergeValues(Results, dl);
27173 }
27174 // XTEST intrinsics.
27175 case XTEST: {
27176 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27177 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27178
27179 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
27180 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27181 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27182 Ret, SDValue(InTrans.getNode(), 1));
27183 }
27184  case TRUNCATE_TO_MEM_VI8:
27185  case TRUNCATE_TO_MEM_VI16:
27186  case TRUNCATE_TO_MEM_VI32: {
27187 SDValue Mask = Op.getOperand(4);
27188 SDValue DataToTruncate = Op.getOperand(3);
27189 SDValue Addr = Op.getOperand(2);
27190 SDValue Chain = Op.getOperand(0);
27191
27192 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
27193 assert(MemIntr && "Expected MemIntrinsicSDNode!");
27194
27195 EVT MemVT = MemIntr->getMemoryVT();
27196
27197 uint16_t TruncationOp = IntrData->Opc0;
27198 switch (TruncationOp) {
27199 case X86ISD::VTRUNC: {
27200 if (isAllOnesConstant(Mask)) // return just a truncate store
27201 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
27202 MemIntr->getMemOperand());
27203
27204 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27205 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27206 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
27207
27208 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
27209 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27210 true /* truncating */);
27211 }
27212 case X86ISD::VTRUNCUS:
27213 case X86ISD::VTRUNCS: {
27214 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
27215 if (isAllOnesConstant(Mask))
27216 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
27217 MemIntr->getMemOperand(), DAG);
27218
27219 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27220 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27221
27222 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
27223 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27224 }
27225 default:
27226 llvm_unreachable("Unsupported truncstore intrinsic");
27227 }
27228 }
27229 }
27230}
27231
27232SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
27233 SelectionDAG &DAG) const {
27235 MFI.setReturnAddressIsTaken(true);
27236
27238 return SDValue();
27239
27240 unsigned Depth = Op.getConstantOperandVal(0);
27241 SDLoc dl(Op);
27242 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27243
27244 if (Depth > 0) {
27245 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
27246 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27247 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27248 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27249 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
27251 }
27252
27253 // Just load the return address.
27254 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
27255 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
27257}
27258
27259SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
27260 SelectionDAG &DAG) const {
27262 return getReturnAddressFrameIndex(DAG);
27263}
27264
27265SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
27267 MachineFrameInfo &MFI = MF.getFrameInfo();
27269 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27270 EVT VT = Op.getValueType();
27271
27272 MFI.setFrameAddressIsTaken(true);
27273
27274 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27275 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
27276 // is not possible to crawl up the stack without looking at the unwind codes
27277 // simultaneously.
27278 int FrameAddrIndex = FuncInfo->getFAIndex();
27279 if (!FrameAddrIndex) {
27280 // Set up a frame object for the return address.
27281 unsigned SlotSize = RegInfo->getSlotSize();
27282 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
27283 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
27284 FuncInfo->setFAIndex(FrameAddrIndex);
27285 }
27286 return DAG.getFrameIndex(FrameAddrIndex, VT);
27287 }
27288
27289 unsigned FrameReg =
27290 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27291 SDLoc dl(Op); // FIXME probably not meaningful
27292 unsigned Depth = Op.getConstantOperandVal(0);
27293 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
27294 (FrameReg == X86::EBP && VT == MVT::i32)) &&
27295 "Invalid Frame Register!");
27296 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
27297 while (Depth--)
27298 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27300 return FrameAddr;
27301}
27302
27303// FIXME? Maybe this could be a TableGen attribute on some registers and
27304// this table could be generated automatically from RegInfo.
27306 const MachineFunction &MF) const {
27307 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27308
27310 .Case("esp", X86::ESP)
27311 .Case("rsp", X86::RSP)
27312 .Case("ebp", X86::EBP)
27313 .Case("rbp", X86::RBP)
27314 .Case("r14", X86::R14)
27315 .Case("r15", X86::R15)
27316 .Default(0);
27317
27318 if (Reg == X86::EBP || Reg == X86::RBP) {
27319 if (!TFI.hasFP(MF))
27320 report_fatal_error("register " + StringRef(RegName) +
27321 " is allocatable: function has no frame pointer");
27322#ifndef NDEBUG
27323 else {
27324 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27325 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27326 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27327 "Invalid Frame Register!");
27328 }
27329#endif
27330 }
27331
27332 if (Reg)
27333 return Reg;
27334
27335 report_fatal_error("Invalid register name global variable");
27336}
27337
27338SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27339 SelectionDAG &DAG) const {
27340 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27341 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27342}
27343
27345 const Constant *PersonalityFn) const {
27346 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27347 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27348
27349 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27350}
27351
27353 const Constant *PersonalityFn) const {
27354 // Funclet personalities don't use selectors (the runtime does the selection).
27356 return X86::NoRegister;
27357 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27358}
27359
27361 return Subtarget.isTargetWin64();
27362}
27363
27364SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
27365 SDValue Chain = Op.getOperand(0);
27366 SDValue Offset = Op.getOperand(1);
27367 SDValue Handler = Op.getOperand(2);
27368 SDLoc dl (Op);
27369
27370 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27371 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27372 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
27373 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
27374 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
27375 "Invalid Frame Register!");
27376 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
27377 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
27378
27379 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
27380 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
27381 dl));
27382 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
27383 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
27384 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
27385
27386 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
27387 DAG.getRegister(StoreAddrReg, PtrVT));
27388}
27389
27390SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
27391 SelectionDAG &DAG) const {
27392 SDLoc DL(Op);
27393 // If the subtarget is not 64bit, we may need the global base reg
27394 // after isel expand pseudo, i.e., after CGBR pass ran.
27395 // Therefore, ask for the GlobalBaseReg now, so that the pass
27396 // inserts the code for us in case we need it.
27397 // Otherwise, we will end up in a situation where we will
27398 // reference a virtual register that is not defined!
27399 if (!Subtarget.is64Bit()) {
27400 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27401 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
27402 }
27403 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
27404 DAG.getVTList(MVT::i32, MVT::Other),
27405 Op.getOperand(0), Op.getOperand(1));
27406}
27407
27408SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
27409 SelectionDAG &DAG) const {
27410 SDLoc DL(Op);
27411 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
27412 Op.getOperand(0), Op.getOperand(1));
27413}
27414
27415SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
27416 SelectionDAG &DAG) const {
27417 SDLoc DL(Op);
27418 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
27419 Op.getOperand(0));
27420}
27421
27423 return Op.getOperand(0);
27424}
27425
27426SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
27427 SelectionDAG &DAG) const {
27428 SDValue Root = Op.getOperand(0);
27429 SDValue Trmp = Op.getOperand(1); // trampoline
27430 SDValue FPtr = Op.getOperand(2); // nested function
27431 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
27432 SDLoc dl (Op);
27433
27434 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27435 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27436
27437 if (Subtarget.is64Bit()) {
27438 SDValue OutChains[6];
27439
27440 // Large code-model.
27441 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
27442 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
27443
27444 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
27445 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
27446
27447 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
27448
27449 // Load the pointer to the nested function into R11.
27450 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
27451 SDValue Addr = Trmp;
27452 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27453 Addr, MachinePointerInfo(TrmpAddr));
27454
27455 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27456 DAG.getConstant(2, dl, MVT::i64));
27457 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27458 MachinePointerInfo(TrmpAddr, 2), Align(2));
27459
27460 // Load the 'nest' parameter value into R10.
27461 // R10 is specified in X86CallingConv.td
27462 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27463 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27464 DAG.getConstant(10, dl, MVT::i64));
27465 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27466 Addr, MachinePointerInfo(TrmpAddr, 10));
27467
27468 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27469 DAG.getConstant(12, dl, MVT::i64));
27470 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27471 MachinePointerInfo(TrmpAddr, 12), Align(2));
27472
27473 // Jump to the nested function.
27474 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27475 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27476 DAG.getConstant(20, dl, MVT::i64));
27477 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27478 Addr, MachinePointerInfo(TrmpAddr, 20));
27479
27480 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27481 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27482 DAG.getConstant(22, dl, MVT::i64));
27483 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27484 Addr, MachinePointerInfo(TrmpAddr, 22));
27485
27486 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27487 } else {
27488 const Function *Func =
27489 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27490 CallingConv::ID CC = Func->getCallingConv();
27491 unsigned NestReg;
27492
27493 switch (CC) {
27494 default:
27495 llvm_unreachable("Unsupported calling convention");
27496     case CallingConv::C:
27497     case CallingConv::X86_StdCall: {
27498 // Pass 'nest' parameter in ECX.
27499 // Must be kept in sync with X86CallingConv.td
27500 NestReg = X86::ECX;
27501
27502 // Check that ECX wasn't needed by an 'inreg' parameter.
27503 FunctionType *FTy = Func->getFunctionType();
27504 const AttributeList &Attrs = Func->getAttributes();
27505
27506 if (!Attrs.isEmpty() && !Func->isVarArg()) {
27507 unsigned InRegCount = 0;
27508 unsigned Idx = 0;
27509
27510 for (FunctionType::param_iterator I = FTy->param_begin(),
27511 E = FTy->param_end(); I != E; ++I, ++Idx)
27512 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
27513 const DataLayout &DL = DAG.getDataLayout();
27514 // FIXME: should only count parameters that are lowered to integers.
27515 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
27516 }
27517
27518 if (InRegCount > 2) {
27519 report_fatal_error("Nest register in use - reduce number of inreg"
27520 " parameters!");
27521 }
27522 }
27523 break;
27524     }
27525     case CallingConv::X86_FastCall:
27526     case CallingConv::X86_ThisCall:
27527     case CallingConv::Fast:
27528     case CallingConv::Tail:
27529     case CallingConv::SwiftTail:
27530       // Pass 'nest' parameter in EAX.
27531 // Must be kept in sync with X86CallingConv.td
27532 NestReg = X86::EAX;
27533 break;
27534 }
27535
27536 SDValue OutChains[4];
27537 SDValue Addr, Disp;
27538
27539 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27540 DAG.getConstant(10, dl, MVT::i32));
27541 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
27542
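    // Editorial summary (derived from the stores below, not part of the
    // upstream source): the 10-byte trampoline written at Trmp is
    //   offset 0: B8+reg <imm32 Nest>   movl $Nest, %ecx (or %eax)
    //   offset 5: E9 <rel32 Disp>       jmp  FPtr
    // where Disp = FPtr - (Trmp + 10), i.e. relative to the end of the jmp.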
27543 // This is storing the opcode for MOV32ri.
27544 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
27545 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27546 OutChains[0] =
27547 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
27548 Trmp, MachinePointerInfo(TrmpAddr));
27549
27550 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27551 DAG.getConstant(1, dl, MVT::i32));
27552 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
27553 MachinePointerInfo(TrmpAddr, 1), Align(1));
27554
27555 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
27556 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27557 DAG.getConstant(5, dl, MVT::i32));
27558 OutChains[2] =
27559 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
27560 MachinePointerInfo(TrmpAddr, 5), Align(1));
27561
27562 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27563 DAG.getConstant(6, dl, MVT::i32));
27564 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
27565 MachinePointerInfo(TrmpAddr, 6), Align(1));
27566
27567 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27568 }
27569}
27570
27571SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
27572 SelectionDAG &DAG) const {
27573 /*
27574 The rounding mode is in bits 11:10 of FPSR, and has the following
27575 settings:
27576 00 Round to nearest
27577 01 Round to -inf
27578 10 Round to +inf
27579 11 Round to 0
27580
27581 GET_ROUNDING, on the other hand, expects the following:
27582 -1 Undefined
27583 0 Round to 0
27584 1 Round to nearest
27585 2 Round to +inf
27586 3 Round to -inf
27587
27588 To perform the conversion, we use a packed lookup table of the four 2-bit
27589   values that we can index by FPSR[11:10]
27590 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
27591
27592 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
27593 */
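  // Illustrative check (editorial note): if FPSR[11:10] = 01 (round to -inf),
  // then FPSR & 0xc00 = 0x400, the shift is 0x400 >> 9 = 2, and
  // (0x2d >> 2) & 3 = 3, which is GET_ROUNDING's encoding for round to -inf.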
27594
27595   MachineFunction &MF = DAG.getMachineFunction();
27596   MVT VT = Op.getSimpleValueType();
27597 SDLoc DL(Op);
27598
27599 // Save FP Control Word to stack slot
27600 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
27601 SDValue StackSlot =
27602 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
27603
27604   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
27605
27606   SDValue Chain = Op.getOperand(0);
27607   SDValue Ops[] = {Chain, StackSlot};
27608   Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
27609       DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
27610       Align(2), MachineMemOperand::MOStore);
27611
27612 // Load FP Control Word from stack slot
27613 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
27614 Chain = CWD.getValue(1);
27615
27616 // Mask and turn the control bits into a shift for the lookup table.
27617 SDValue Shift =
27618 DAG.getNode(ISD::SRL, DL, MVT::i16,
27619 DAG.getNode(ISD::AND, DL, MVT::i16,
27620 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
27621 DAG.getConstant(9, DL, MVT::i8));
27622 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
27623
27624 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27625 SDValue RetVal =
27626 DAG.getNode(ISD::AND, DL, MVT::i32,
27627 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27628 DAG.getConstant(3, DL, MVT::i32));
27629
27630 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27631
27632 return DAG.getMergeValues({RetVal, Chain}, DL);
27633}
27634
27635SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27636 SelectionDAG &DAG) const {
27637   MachineFunction &MF = DAG.getMachineFunction();
27638   SDLoc DL(Op);
27639 SDValue Chain = Op.getNode()->getOperand(0);
27640
27641 // FP control word may be set only from data in memory. So we need to allocate
27642 // stack space to save/load FP control word.
27643 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27644 SDValue StackSlot =
27645 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27646   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27647   MachineMemOperand *MMO =
27648       MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27649
27650 // Store FP control word into memory.
27651 SDValue Ops[] = {Chain, StackSlot};
27652 Chain = DAG.getMemIntrinsicNode(
27653 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27654
27655 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27656 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27657 Chain = CWD.getValue(1);
27658 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27659 DAG.getConstant(0xf3ff, DL, MVT::i16));
27660
27661 // Calculate new rounding mode.
27662 SDValue NewRM = Op.getNode()->getOperand(1);
27663 SDValue RMBits;
27664 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27665 uint64_t RM = CVal->getZExtValue();
27666 int FieldVal;
27667 switch (static_cast<RoundingMode>(RM)) {
27668 // clang-format off
27669 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27670 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
27671 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
27672 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
27673 default:
27674 llvm_unreachable("rounding mode is not supported by X86 hardware");
27675 // clang-format on
27676 }
27677 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27678 } else {
27679 // Need to convert argument into bits of control word:
27680 // 0 Round to 0 -> 11
27681 // 1 Round to nearest -> 00
27682 // 2 Round to +inf -> 10
27683 // 3 Round to -inf -> 01
27684 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
27685 // To make the conversion, put all these values into a value 0xc9 and shift
27686 // it left depending on the rounding mode:
27687 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27688 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
27689 // ...
27690 // (0xc9 << (2 * NewRM + 4)) & 0xc00
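    // Illustrative check (editorial note): for NewRM = 2 (round to +inf) the
    // shift is 2 * 2 + 4 = 8, and (0xc9 << 8) & 0xc00 = 0x800, i.e. bits
    // 11:10 = 10, the x87 encoding of rounding toward +inf.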
27691 SDValue ShiftValue =
27692 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27693 DAG.getNode(ISD::ADD, DL, MVT::i32,
27694 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27695 DAG.getConstant(1, DL, MVT::i8)),
27696 DAG.getConstant(4, DL, MVT::i32)));
27697 SDValue Shifted =
27698 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27699 ShiftValue);
27700 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27701 DAG.getConstant(0xc00, DL, MVT::i16));
27702 }
27703
27704 // Update rounding mode bits and store the new FP Control Word into stack.
27705 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27706 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
27707
27708 // Load FP control word from the slot.
27709 SDValue OpsLD[] = {Chain, StackSlot};
27710   MachineMemOperand *MMOL =
27711       MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27712   Chain = DAG.getMemIntrinsicNode(
27713 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27714
27715 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
27716 // same way but in bits 14:13.
27717 if (Subtarget.hasSSE1()) {
27718 // Store MXCSR into memory.
27719 Chain = DAG.getNode(
27720 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27721 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27722 StackSlot);
27723
27724 // Load MXCSR from stack slot and clear RM field (bits 14:13).
27725 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27726 Chain = CWD.getValue(1);
27727 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27728 DAG.getConstant(0xffff9fff, DL, MVT::i32));
27729
27730 // Shift X87 RM bits from 11:10 to 14:13.
27731 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27732 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27733 DAG.getConstant(3, DL, MVT::i8));
27734
27735 // Update rounding mode bits and store the new FP Control Word into stack.
27736 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27737 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
27738
27739 // Load MXCSR from the slot.
27740 Chain = DAG.getNode(
27741 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27742 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27743 StackSlot);
27744 }
27745
27746 return Chain;
27747}
27748
27749const unsigned X87StateSize = 28;
27750const unsigned FPStateSize = 32;
27751[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
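// Editorial note: FNSTENV/FLDENV operate on the 28-byte protected-mode x87
// environment, so the 32-byte FP state buffer leaves 4 bytes at offset
// X87StateSize for MXCSR (see the MXCSRAddr computations below).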
27752
27753SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
27754 SelectionDAG &DAG) const {
27755   MachineFunction &MF = DAG.getMachineFunction();
27756   SDLoc DL(Op);
27757 SDValue Chain = Op->getOperand(0);
27758 SDValue Ptr = Op->getOperand(1);
27759 auto *Node = cast<FPStateAccessSDNode>(Op);
27760 EVT MemVT = Node->getMemoryVT();
27762 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27763
27764   // Get the x87 state, if present.
27765 if (Subtarget.hasX87()) {
27766 Chain =
27767 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
27768 {Chain, Ptr}, MemVT, MMO);
27769
27770 // FNSTENV changes the exception mask, so load back the stored environment.
27771     MachineMemOperand::Flags NewFlags =
27772         MachineMemOperand::MOLoad |
27773         (MMO->getFlags() & ~MachineMemOperand::MOStore);
27774 MMO = MF.getMachineMemOperand(MMO, NewFlags);
27775 Chain =
27776 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27777 {Chain, Ptr}, MemVT, MMO);
27778 }
27779
27780 // If target supports SSE, get MXCSR as well.
27781 if (Subtarget.hasSSE1()) {
27782 // Get pointer to the MXCSR location in memory.
27783     MVT PtrVT = getPointerTy(DAG.getDataLayout());
27784     SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27785 DAG.getConstant(X87StateSize, DL, PtrVT));
27786 // Store MXCSR into memory.
27787 Chain = DAG.getNode(
27788 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27789 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27790 MXCSRAddr);
27791 }
27792
27793 return Chain;
27794}
27795
27796static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL,
27797                                 EVT MemVT, MachineMemOperand *MMO,
27798 SelectionDAG &DAG,
27799 const X86Subtarget &Subtarget) {
27800   // Set the x87 state, if present.
27801 if (Subtarget.hasX87())
27802 Chain =
27803 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27804 {Chain, Ptr}, MemVT, MMO);
27805 // If target supports SSE, set MXCSR as well.
27806 if (Subtarget.hasSSE1()) {
27807 // Get pointer to the MXCSR location in memory.
27808     MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
27809     SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27810 DAG.getConstant(X87StateSize, DL, PtrVT));
27811 // Load MXCSR from memory.
27812 Chain = DAG.getNode(
27813 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27814 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27815 MXCSRAddr);
27816 }
27817 return Chain;
27818}
27819
27820SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
27821 SelectionDAG &DAG) const {
27822 SDLoc DL(Op);
27823 SDValue Chain = Op->getOperand(0);
27824 SDValue Ptr = Op->getOperand(1);
27825 auto *Node = cast<FPStateAccessSDNode>(Op);
27826 EVT MemVT = Node->getMemoryVT();
27828 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27829 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
27830}
27831
27832SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
27833 SelectionDAG &DAG) const {
27834   MachineFunction &MF = DAG.getMachineFunction();
27835   SDLoc DL(Op);
27836 SDValue Chain = Op.getNode()->getOperand(0);
27837
27838 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
27839 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
27840   SmallVector<Constant *, 8> FPEnvVals;
27841
27842 // x87 FPU Control Word: mask all floating-point exceptions, sets rounding to
27843 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
27844 // for compatibility with glibc.
27845 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
27846 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
27847 Constant *Zero = ConstantInt::get(ItemTy, 0);
27848 for (unsigned I = 0; I < 6; ++I)
27849 FPEnvVals.push_back(Zero);
27850
27851 // MXCSR: mask all floating-point exceptions, sets rounding to nearest, clear
27852 // all exceptions, sets DAZ and FTZ to 0.
27853 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
27854   Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
27855   auto PtrVT = getPointerTy(DAG.getDataLayout());
27856   SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
27857   MachinePointerInfo MPI =
27858       MachinePointerInfo::getConstantPool(MF);
27859   MachineMemOperand *MMO = MF.getMachineMemOperand(
27860       MPI, MachineMemOperand::MOLoad, X87StateSize, Align(4));
27861
27862 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
27863}
27864
27865/// Lower a vector CTLZ using a natively supported vector CTLZ instruction.
27866//
27867// i8/i16 vectors are implemented using the dword LZCNT vector instruction
27868// ( sub(trunc(lzcnt(zext32(x))), 32 - bitwidth(x)) ). If zext32(x) is
27869// illegal, split the vector, perform the operation on its Lo and Hi parts
27870// and concatenate the results.
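// Editorial example: for an i8 element x = 0x10, zext32(x) has 27 leading
// zeros, and subtracting 32 - 8 = 24 gives 3, the correct i8 ctlz of 0x10.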
27871static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27872                                        const X86Subtarget &Subtarget) {
27873 assert(Op.getOpcode() == ISD::CTLZ);
27874 SDLoc dl(Op);
27875 MVT VT = Op.getSimpleValueType();
27876 MVT EltVT = VT.getVectorElementType();
27877 unsigned NumElems = VT.getVectorNumElements();
27878
27879 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
27880 "Unsupported element type");
27881
27882   // Split the vector; its Lo and Hi parts will be handled in the next iteration.
27883 if (NumElems > 16 ||
27884 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27885 return splitVectorIntUnary(Op, DAG, dl);
27886
27887 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27888 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
27889 "Unsupported value type for operation");
27890
27891   // Use the natively supported vector instruction vplzcntd.
27892 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27893 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27894 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
27895 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27896
27897 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27898}
27899
27900// Lower CTLZ using a PSHUFB lookup table implementation.
27901static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27902                                          const X86Subtarget &Subtarget,
27903 SelectionDAG &DAG) {
27904 MVT VT = Op.getSimpleValueType();
27905 int NumElts = VT.getVectorNumElements();
27906 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27907 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27908
27909 // Per-nibble leading zero PSHUFB lookup table.
27910 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27911 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27912 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27913 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
27914
27915   SmallVector<SDValue, 64> LUTVec;
27916   for (int i = 0; i < NumBytes; ++i)
27917 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27918 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27919
27920 // Begin by bitcasting the input to byte vector, then split those bytes
27921 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27922 // If the hi input nibble is zero then we add both results together, otherwise
27923 // we just take the hi result (by masking the lo result to zero before the
27924 // add).
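  // Editorial example: for the byte 0x1a the hi nibble 0x1 maps to 3 and the
  // lo nibble 0xa maps to 0; since the hi nibble is non-zero the lo result is
  // masked to zero, so the sum is 3, the leading-zero count of 0x1a.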
27925 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27926 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
27927
27928 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
27929 SDValue Lo = Op0;
27930 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
27931 SDValue HiZ;
27932 if (CurrVT.is512BitVector()) {
27933 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27934 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
27935 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27936 } else {
27937 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
27938 }
27939
27940 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
27941 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
27942 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
27943 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
27944
27945   // Merge the result from vXi8 back to VT, working on the lo/hi halves
27946 // of the current vector width in the same way we did for the nibbles.
27947 // If the upper half of the input element is zero then add the halves'
27948 // leading zero counts together, otherwise just use the upper half's.
27949 // Double the width of the result until we are at target width.
27950 while (CurrVT != VT) {
27951 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
27952 int CurrNumElts = CurrVT.getVectorNumElements();
27953 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
27954 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
27955 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
27956
27957 // Check if the upper half of the input element is zero.
27958 if (CurrVT.is512BitVector()) {
27959 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27960 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
27961 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27962 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27963 } else {
27964 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
27965 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27966 }
27967 HiZ = DAG.getBitcast(NextVT, HiZ);
27968
27969 // Move the upper/lower halves to the lower bits as we'll be extending to
27970 // NextVT. Mask the lower result to zero if HiZ is true and add the results
27971 // together.
27972 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
27973 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
27974 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
27975 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
27976 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
27977 CurrVT = NextVT;
27978 }
27979
27980 return Res;
27981}
27982
27983static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
27984                                   const X86Subtarget &Subtarget,
27985 SelectionDAG &DAG) {
27986 MVT VT = Op.getSimpleValueType();
27987
27988 if (Subtarget.hasCDI() &&
27989 // vXi8 vectors need to be promoted to 512-bits for vXi32.
27990 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
27991 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
27992
27993 // Decompose 256-bit ops into smaller 128-bit ops.
27994 if (VT.is256BitVector() && !Subtarget.hasInt256())
27995 return splitVectorIntUnary(Op, DAG, DL);
27996
27997 // Decompose 512-bit ops into smaller 256-bit ops.
27998 if (VT.is512BitVector() && !Subtarget.hasBWI())
27999 return splitVectorIntUnary(Op, DAG, DL);
28000
28001 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
28002 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
28003}
28004
28005static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
28006 SelectionDAG &DAG) {
28007 MVT VT = Op.getSimpleValueType();
28008 MVT OpVT = VT;
28009 unsigned NumBits = VT.getSizeInBits();
28010 SDLoc dl(Op);
28011 unsigned Opc = Op.getOpcode();
28012
28013 if (VT.isVector())
28014 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
28015
28016 Op = Op.getOperand(0);
28017 if (VT == MVT::i8) {
28018 // Zero extend to i32 since there is not an i8 bsr.
28019 OpVT = MVT::i32;
28020 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
28021 }
28022
28023 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
28024 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
28025 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
28026
28027 if (Opc == ISD::CTLZ) {
28028 // If src is zero (i.e. bsr sets ZF), returns NumBits.
28029 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28030 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28031 Op.getValue(1)};
28032 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
28033 }
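  // Editorial note: bsr yields the index of the highest set bit; for a
  // power-of-two width, (NumBits - 1) - index == (NumBits - 1) ^ index, so the
  // xor below converts it to a leading-zero count (and maps the zero-input
  // CMOV value 2*NumBits - 1 to NumBits).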
28034
28035 // Finally xor with NumBits-1.
28036 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
28037 DAG.getConstant(NumBits - 1, dl, OpVT));
28038
28039 if (VT == MVT::i8)
28040 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
28041 return Op;
28042}
28043
28044static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
28045 SelectionDAG &DAG) {
28046 MVT VT = Op.getSimpleValueType();
28047 unsigned NumBits = VT.getScalarSizeInBits();
28048 SDValue N0 = Op.getOperand(0);
28049 SDLoc dl(Op);
28050
28051 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
28052 "Only scalar CTTZ requires custom lowering");
28053
28054 // Issue a bsf (scan bits forward) which also sets EFLAGS.
28055 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28056 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
28057
28058 // If src is known never zero we can skip the CMOV.
28059 if (DAG.isKnownNeverZero(N0))
28060 return Op;
28061
28062 // If src is zero (i.e. bsf sets ZF), returns NumBits.
28063 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
28064 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28065 Op.getValue(1)};
28066 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
28067}
28068
28069static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
28070                              const X86Subtarget &Subtarget) {
28071 MVT VT = Op.getSimpleValueType();
28072 SDLoc DL(Op);
28073
28074 if (VT == MVT::i16 || VT == MVT::i32)
28075 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
28076
28077 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28078 return splitVectorIntBinary(Op, DAG, DL);
28079
28080 assert(Op.getSimpleValueType().is256BitVector() &&
28081 Op.getSimpleValueType().isInteger() &&
28082 "Only handle AVX 256-bit vector integer operation");
28083 return splitVectorIntBinary(Op, DAG, DL);
28084}
28085
28086static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
28087                                         const X86Subtarget &Subtarget) {
28088 MVT VT = Op.getSimpleValueType();
28089 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
28090 unsigned Opcode = Op.getOpcode();
28091 SDLoc DL(Op);
28092
28093 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
28094 (VT.is256BitVector() && !Subtarget.hasInt256())) {
28095 assert(Op.getSimpleValueType().isInteger() &&
28096 "Only handle AVX vector integer operation");
28097 return splitVectorIntBinary(Op, DAG, DL);
28098 }
28099
28100 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
28101 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28102 EVT SetCCResultType =
28103 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28104
28105 unsigned BitWidth = VT.getScalarSizeInBits();
28106 if (Opcode == ISD::USUBSAT) {
28107 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
28108 // Handle a special-case with a bit-hack instead of cmp+select:
28109 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28110 // If the target can use VPTERNLOG, DAGToDAG will match this as
28111 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
28112 // "broadcast" constant load.
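      // Editorial note: the identity holds because X (unsigned) >= SMIN exactly
      // when the sign bit of X is set; then X s>> (BW-1) is all-ones and
      // X ^ SMIN == X - SMIN, while otherwise the shift is zero and the AND
      // gives the saturated result 0.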
28113       ConstantSDNode *C = isConstOrConstSplat(Y, true);
28114       if (C && C->getAPIntValue().isSignMask()) {
28115 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28116 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28117 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
28118 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
28119 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
28120 }
28121 }
28122 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
28123 // usubsat X, Y --> (X >u Y) ? X - Y : 0
28124 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
28125 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
28126 // TODO: Move this to DAGCombiner?
28127 if (SetCCResultType == VT &&
28128 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
28129 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
28130 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
28131 }
28132 }
28133
28134 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
28135       (!VT.isVector() || VT == MVT::v2i64)) {
28136     APInt MinVal = APInt::getSignedMinValue(BitWidth);
28137     APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
28138     SDValue Zero = DAG.getConstant(0, DL, VT);
28139 SDValue Result =
28140 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
28141 DAG.getVTList(VT, SetCCResultType), X, Y);
28142 SDValue SumDiff = Result.getValue(0);
28143 SDValue Overflow = Result.getValue(1);
28144 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
28145 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
28146 SDValue SumNeg =
28147 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
28148 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
28149 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
28150 }
28151
28152 // Use default expansion.
28153 return SDValue();
28154}
28155
28156static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
28157 SelectionDAG &DAG) {
28158 MVT VT = Op.getSimpleValueType();
28159 SDLoc DL(Op);
28160
28161 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
28162 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28163 // 8-bit integer abs to NEG and CMOV.
28164 SDValue N0 = Op.getOperand(0);
28165 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28166 DAG.getConstant(0, DL, VT), N0);
28167 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
28168 SDValue(Neg.getNode(), 1)};
28169 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
28170 }
28171
28172 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28173 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
28174 SDValue Src = Op.getOperand(0);
28175 SDValue Neg = DAG.getNegative(Src, DL, VT);
28176 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
28177 }
28178
28179 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
28180 assert(VT.isInteger() &&
28181 "Only handle AVX 256-bit vector integer operation");
28182 return splitVectorIntUnary(Op, DAG, DL);
28183 }
28184
28185 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28186 return splitVectorIntUnary(Op, DAG, DL);
28187
28188 // Default to expand.
28189 return SDValue();
28190}
28191
28192static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
28193 SelectionDAG &DAG) {
28194 MVT VT = Op.getSimpleValueType();
28195 SDLoc DL(Op);
28196
28197 // For AVX1 cases, split to use legal ops.
28198 if (VT.is256BitVector() && !Subtarget.hasInt256())
28199 return splitVectorIntBinary(Op, DAG, DL);
28200
28201 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28202 return splitVectorIntBinary(Op, DAG, DL);
28203
28204 // Default to expand.
28205 return SDValue();
28206}
28207
28208static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
28209 SelectionDAG &DAG) {
28210 MVT VT = Op.getSimpleValueType();
28211 SDLoc DL(Op);
28212
28213 // For AVX1 cases, split to use legal ops.
28214 if (VT.is256BitVector() && !Subtarget.hasInt256())
28215 return splitVectorIntBinary(Op, DAG, DL);
28216
28217 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28218 return splitVectorIntBinary(Op, DAG, DL);
28219
28220 // Default to expand.
28221 return SDValue();
28222}
28223
28224static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
28225                                      SelectionDAG &DAG) {
28226 assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) &&
28227 "Expected FMAXIMUM or FMINIMUM opcode");
28228 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28229 EVT VT = Op.getValueType();
28230 SDValue X = Op.getOperand(0);
28231 SDValue Y = Op.getOperand(1);
28232 SDLoc DL(Op);
28233 uint64_t SizeInBits = VT.getScalarSizeInBits();
28234 APInt PreferredZero = APInt::getZero(SizeInBits);
28235 APInt OppositeZero = PreferredZero;
28236 EVT IVT = VT.changeTypeToInteger();
28237 X86ISD::NodeType MinMaxOp;
28238 if (Op.getOpcode() == ISD::FMAXIMUM) {
28239 MinMaxOp = X86ISD::FMAX;
28240 OppositeZero.setSignBit();
28241 } else {
28242 PreferredZero.setSignBit();
28243 MinMaxOp = X86ISD::FMIN;
28244 }
28245 EVT SetCCType =
28246 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28247
28248 // The tables below show the expected result of Max in cases of NaN and
28249 // signed zeros.
28250 //
28251 // Y Y
28252 // Num xNaN +0 -0
28253 // --------------- ---------------
28254 // Num | Max | Y | +0 | +0 | +0 |
28255 // X --------------- X ---------------
28256 // xNaN | X | X/Y | -0 | +0 | -0 |
28257 // --------------- ---------------
28258 //
28259 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
28260 // reordering.
28261 //
28262 // We check if any of operands is NaN and return NaN. Then we check if any of
28263 // operands is zero or negative zero (for fmaximum and fminimum respectively)
28264 // to ensure the correct zero is returned.
28265 auto MatchesZero = [](SDValue Op, APInt Zero) {
28266     Op = peekThroughBitcasts(Op);
28267     if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
28268 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
28269 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
28270 return CstOp->getAPIntValue() == Zero;
28271 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
28272 Op->getOpcode() == ISD::SPLAT_VECTOR) {
28273 for (const SDValue &OpVal : Op->op_values()) {
28274 if (OpVal.isUndef())
28275 continue;
28276 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
28277 if (!CstOp)
28278 return false;
28279 if (!CstOp->getValueAPF().isZero())
28280 continue;
28281 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
28282 return false;
28283 }
28284 return true;
28285 }
28286 return false;
28287 };
28288
28289 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
28290 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
28291 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
28292 Op->getFlags().hasNoSignedZeros() ||
28293                           DAG.isKnownNeverZeroFloat(X) ||
28294                           DAG.isKnownNeverZeroFloat(Y);
28295   SDValue NewX, NewY;
28296 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
28297 MatchesZero(X, OppositeZero)) {
28298 // Operands are already in right order or order does not matter.
28299 NewX = X;
28300 NewY = Y;
28301 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
28302 NewX = Y;
28303 NewY = X;
28304 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
28305 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
28306 if (IsXNeverNaN)
28307 std::swap(X, Y);
28308     // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
28309     // to an xmm register.
28310 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
28311     SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
28312     // Bits of classes:
28313 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
28314 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
28315 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
28316 DL, MVT::i32);
28317 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
28318 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
28319 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
28320 DAG.getIntPtrConstant(0, DL));
28321 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
28322 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
28323 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
28324 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28325 } else {
28326 SDValue IsXSigned;
28327 if (Subtarget.is64Bit() || VT != MVT::f64) {
28328 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
28329 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
28330 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
28331 } else {
28332 assert(VT == MVT::f64);
28333 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
28334 DAG.getConstantFP(0, DL, MVT::v2f64), X,
28335 DAG.getIntPtrConstant(0, DL));
28336 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
28337 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
28338 DAG.getIntPtrConstant(1, DL));
28339 Hi = DAG.getBitcast(MVT::i32, Hi);
28340 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
28341 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
28342 *DAG.getContext(), MVT::i32);
28343 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
28344 }
28345 if (MinMaxOp == X86ISD::FMAX) {
28346 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28347 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28348 } else {
28349 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28350 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28351 }
28352 }
28353
28354 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
28355 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
28356
28357   // If we did not order the operands for signed-zero handling, we still need
28358   // to handle NaN; if we know the second operand is not NaN, put it in the
28359   // first operand so we do not need to post-process NaN after the max/min.
28360 if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
28361 std::swap(NewX, NewY);
28362
28363 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28364
28365 if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
28366 return MinMax;
28367
28368 SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
28369 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
28370}
28371
28372static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
28373 SelectionDAG &DAG) {
28374 MVT VT = Op.getSimpleValueType();
28375 SDLoc dl(Op);
28376
28377 // For AVX1 cases, split to use legal ops.
28378 if (VT.is256BitVector() && !Subtarget.hasInt256())
28379 return splitVectorIntBinary(Op, DAG, dl);
28380
28381 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
28382 return splitVectorIntBinary(Op, DAG, dl);
28383
28384 bool IsSigned = Op.getOpcode() == ISD::ABDS;
28385 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28386
28387 // TODO: Move to TargetLowering expandABD() once we have ABD promotion.
28388 if (VT.isScalarInteger()) {
28389 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
28390 MVT WideVT = MVT::getIntegerVT(WideBits);
28391 if (TLI.isTypeLegal(WideVT)) {
28392 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
28393 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
28394 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28395 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
28396 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
28397 SDValue Diff = DAG.getNode(ISD::SUB, dl, WideVT, LHS, RHS);
28398 SDValue AbsDiff = DAG.getNode(ISD::ABS, dl, WideVT, Diff);
28399 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
28400 }
28401 }
28402
28403 // TODO: Move to TargetLowering expandABD().
28404 if (!Subtarget.hasSSE41() &&
28405 ((IsSigned && VT == MVT::v16i8) || VT == MVT::v4i32)) {
28406 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
28407 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
28409 SDValue Cmp = DAG.getSetCC(dl, VT, LHS, RHS, CC);
28410 SDValue Diff0 = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
28411 SDValue Diff1 = DAG.getNode(ISD::SUB, dl, VT, RHS, LHS);
28412 return getBitSelect(dl, VT, Diff0, Diff1, Cmp, DAG);
28413 }
28414
28415 // Default to expand.
28416 return SDValue();
28417}
28418
28419static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
28420 SelectionDAG &DAG) {
28421 SDLoc dl(Op);
28422 MVT VT = Op.getSimpleValueType();
28423
28424 // Decompose 256-bit ops into 128-bit ops.
28425 if (VT.is256BitVector() && !Subtarget.hasInt256())
28426 return splitVectorIntBinary(Op, DAG, dl);
28427
28428 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28429 return splitVectorIntBinary(Op, DAG, dl);
28430
28431 SDValue A = Op.getOperand(0);
28432 SDValue B = Op.getOperand(1);
28433
28434 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
28435 // vector pairs, multiply and truncate.
28436 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
28437 unsigned NumElts = VT.getVectorNumElements();
28438
28439 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28440 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28441 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
28442 return DAG.getNode(
28443 ISD::TRUNCATE, dl, VT,
28444 DAG.getNode(ISD::MUL, dl, ExVT,
28445 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
28446 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
28447 }
28448
28449 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28450
28451     // Extract the lo/hi parts and any-extend them to i16.
28452 // We're going to mask off the low byte of each result element of the
28453 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
28454 // element.
28455 SDValue Undef = DAG.getUNDEF(VT);
28456 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
28457 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
28458
28459 SDValue BLo, BHi;
28460 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28461 // If the RHS is a constant, manually unpackl/unpackh.
28462 SmallVector<SDValue, 16> LoOps, HiOps;
28463 for (unsigned i = 0; i != NumElts; i += 16) {
28464 for (unsigned j = 0; j != 8; ++j) {
28465 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
28466 MVT::i16));
28467 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
28468 MVT::i16));
28469 }
28470 }
28471
28472 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28473 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28474 } else {
28475 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
28476 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
28477 }
28478
28479 // Multiply, mask the lower 8bits of the lo/hi results and pack.
28480 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
28481 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
28482 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28483 }
28484
28485 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
28486 if (VT == MVT::v4i32) {
28487 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
28488 "Should not custom lower when pmulld is available!");
28489
28490 // Extract the odd parts.
28491 static const int UnpackMask[] = { 1, -1, 3, -1 };
28492 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
28493 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
28494
28495 // Multiply the even parts.
28496 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28497 DAG.getBitcast(MVT::v2i64, A),
28498 DAG.getBitcast(MVT::v2i64, B));
28499 // Now multiply odd parts.
28500 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28501 DAG.getBitcast(MVT::v2i64, Aodds),
28502 DAG.getBitcast(MVT::v2i64, Bodds));
28503
28504 Evens = DAG.getBitcast(VT, Evens);
28505 Odds = DAG.getBitcast(VT, Odds);
28506
28507 // Merge the two vectors back together with a shuffle. This expands into 2
28508 // shuffles.
28509 static const int ShufMask[] = { 0, 4, 2, 6 };
28510 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
28511 }
28512
28513 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
28514 "Only know how to lower V2I64/V4I64/V8I64 multiply");
28515 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
28516
28517 // Ahi = psrlqi(a, 32);
28518 // Bhi = psrlqi(b, 32);
28519 //
28520 // AloBlo = pmuludq(a, b);
28521 // AloBhi = pmuludq(a, Bhi);
28522 // AhiBlo = pmuludq(Ahi, b);
28523 //
28524 // Hi = psllqi(AloBhi + AhiBlo, 32);
28525 // return AloBlo + Hi;
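  // Editorial note: this is the usual 64x64 split a = Alo + 2^32*Ahi,
  // b = Blo + 2^32*Bhi, so modulo 2^64:
  //   a*b = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo)
  // with each partial product computed by a 32x32->64 PMULUDQ.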
28526 KnownBits AKnown = DAG.computeKnownBits(A);
28527 KnownBits BKnown = DAG.computeKnownBits(B);
28528
28529 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
28530 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
28531 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
28532
28533 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
28534 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
28535 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
28536
28537 SDValue Zero = DAG.getConstant(0, dl, VT);
28538
28539 // Only multiply lo/hi halves that aren't known to be zero.
28540 SDValue AloBlo = Zero;
28541 if (!ALoIsZero && !BLoIsZero)
28542 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
28543
28544 SDValue AloBhi = Zero;
28545 if (!ALoIsZero && !BHiIsZero) {
28546 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
28547 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
28548 }
28549
28550 SDValue AhiBlo = Zero;
28551 if (!AHiIsZero && !BLoIsZero) {
28552 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
28553 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
28554 }
28555
28556 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
28557 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
28558
28559 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
28560}
28561
28562static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
28563                                          MVT VT, bool IsSigned,
28564 const X86Subtarget &Subtarget,
28565 SelectionDAG &DAG,
28566 SDValue *Low = nullptr) {
28567 unsigned NumElts = VT.getVectorNumElements();
28568
28569 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
28570 // to a vXi16 type. Do the multiplies, shift the results and pack the half
28571 // lane results back together.
28572
28573 // We'll take different approaches for signed and unsigned.
28574 // For unsigned we'll use punpcklbw/punpckhbw to put zero extend the bytes
28575 // and use pmullw to calculate the full 16-bit product.
28576   // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
28577 // shift them left into the upper byte of each word. This allows us to use
28578 // pmulhw to calculate the full 16-bit product. This trick means we don't
28579 // need to sign extend the bytes to use pmullw.
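  // Editorial note: the signed trick works because (a << 8) * (b << 8) ==
  // (a * b) << 16 in 32 bits, so pmulhw (the high 16 bits of that product)
  // recovers the full 16-bit product a * b exactly.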
28580
28581 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28582 SDValue Zero = DAG.getConstant(0, dl, VT);
28583
28584 SDValue ALo, AHi;
28585 if (IsSigned) {
28586 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
28587 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
28588 } else {
28589 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
28590 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
28591 }
28592
28593 SDValue BLo, BHi;
28594 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28595 // If the RHS is a constant, manually unpackl/unpackh and extend.
28596 SmallVector<SDValue, 16> LoOps, HiOps;
28597 for (unsigned i = 0; i != NumElts; i += 16) {
28598 for (unsigned j = 0; j != 8; ++j) {
28599 SDValue LoOp = B.getOperand(i + j);
28600 SDValue HiOp = B.getOperand(i + j + 8);
28601
28602 if (IsSigned) {
28603 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
28604 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
28605 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
28606 DAG.getConstant(8, dl, MVT::i16));
28607 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
28608 DAG.getConstant(8, dl, MVT::i16));
28609 } else {
28610 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
28611 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
28612 }
28613
28614 LoOps.push_back(LoOp);
28615 HiOps.push_back(HiOp);
28616 }
28617 }
28618
28619 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28620 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28621 } else if (IsSigned) {
28622 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
28623 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
28624 } else {
28625 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
28626 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
28627 }
28628
28629 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
28630 // pack back to vXi8.
28631 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
28632 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
28633 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
28634
28635 if (Low)
28636 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28637
28638 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
28639}
28640
28641static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
28642 SelectionDAG &DAG) {
28643 SDLoc dl(Op);
28644 MVT VT = Op.getSimpleValueType();
28645 bool IsSigned = Op->getOpcode() == ISD::MULHS;
28646 unsigned NumElts = VT.getVectorNumElements();
28647 SDValue A = Op.getOperand(0);
28648 SDValue B = Op.getOperand(1);
28649
28650 // Decompose 256-bit ops into 128-bit ops.
28651 if (VT.is256BitVector() && !Subtarget.hasInt256())
28652 return splitVectorIntBinary(Op, DAG, dl);
28653
28654 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28655 return splitVectorIntBinary(Op, DAG, dl);
28656
28657 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
28658 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
28659 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
28660 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
28661
28662 // PMULxD operations multiply each even value (starting at 0) of LHS with
28663   // the related value of RHS and produce a widened result.
28664 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28665 // => <2 x i64> <ae|cg>
28666 //
28667   // In other words, to have all the results, we need to perform two PMULxD:
28668 // 1. one with the even values.
28669 // 2. one with the odd values.
28670   // To achieve #2, we need to place the odd values at an even position.
28671 //
28672 // Place the odd value at an even position (basically, shift all values 1
28673 // step to the left):
28674 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
28675 9, -1, 11, -1, 13, -1, 15, -1};
28676 // <a|b|c|d> => <b|undef|d|undef>
28677 SDValue Odd0 =
28678 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
28679 // <e|f|g|h> => <f|undef|h|undef>
28680 SDValue Odd1 =
28681 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
28682
28683 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
28684 // ints.
28685 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
28686 unsigned Opcode =
28687 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
28688 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28689 // => <2 x i64> <ae|cg>
28690 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28691 DAG.getBitcast(MulVT, A),
28692 DAG.getBitcast(MulVT, B)));
28693 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
28694 // => <2 x i64> <bf|dh>
28695 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28696 DAG.getBitcast(MulVT, Odd0),
28697 DAG.getBitcast(MulVT, Odd1)));
28698
28699 // Shuffle it back into the right order.
28700 SmallVector<int, 16> ShufMask(NumElts);
28701 for (int i = 0; i != (int)NumElts; ++i)
28702 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
28703
28704 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
28705
28706 // If we have a signed multiply but no PMULDQ fix up the result of an
28707 // unsigned multiply.
28708 if (IsSigned && !Subtarget.hasSSE41()) {
28709 SDValue Zero = DAG.getConstant(0, dl, VT);
28710 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
28711 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
28712 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
28713 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
28714
28715 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
28716 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
28717 }
28718
28719 return Res;
28720 }
28721
28722 // Only i8 vectors should need custom lowering after this.
28723 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28724 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28725 "Unsupported vector type");
28726
28727 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
28728 // logical shift down the upper half and pack back to i8.
28729
28730 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
28731 // and then ashr/lshr the upper bits down to the lower bits before multiply.
28732
28733 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28734 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28735 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28736 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28737 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28738 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28739 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28740 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28741 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28742 }
28743
28744 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
28745}
28746
28747// Custom lowering for SMULO/UMULO.
28748static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
28749 SelectionDAG &DAG) {
28750 MVT VT = Op.getSimpleValueType();
28751
28752 // Scalars defer to LowerXALUO.
28753 if (!VT.isVector())
28754 return LowerXALUO(Op, DAG);
28755
28756 SDLoc dl(Op);
28757 bool IsSigned = Op->getOpcode() == ISD::SMULO;
28758 SDValue A = Op.getOperand(0);
28759 SDValue B = Op.getOperand(1);
28760 EVT OvfVT = Op->getValueType(1);
28761
28762 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
28763 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
28764 // Extract the LHS Lo/Hi vectors
28765 SDValue LHSLo, LHSHi;
28766 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
28767
28768 // Extract the RHS Lo/Hi vectors
28769 SDValue RHSLo, RHSHi;
28770 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
28771
28772 EVT LoOvfVT, HiOvfVT;
28773 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
28774 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
28775 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
28776
28777 // Issue the split operations.
28778 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
28779 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
28780
28781 // Join the separate data results and the overflow results.
28782 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28783 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
28784 Hi.getValue(1));
28785
28786 return DAG.getMergeValues({Res, Ovf}, dl);
28787 }
28788
28789 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28790 EVT SetccVT =
28791 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28792
28793 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28794 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28795 unsigned NumElts = VT.getVectorNumElements();
28796 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28797 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28798 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28799 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28800 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28801
28802 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28803
28804 SDValue Ovf;
28805 if (IsSigned) {
28806 SDValue High, LowSign;
28807 if (OvfVT.getVectorElementType() == MVT::i1 &&
28808 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28809         // Rather than truncating, try to do the compare on vXi16 or vXi32.
28810 // Shift the high down filling with sign bits.
28811 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
28812 // Fill all 16 bits with the sign bit from the low.
28813 LowSign =
28814 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
28815 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
28816 15, DAG);
28817 SetccVT = OvfVT;
28818 if (!Subtarget.hasBWI()) {
28819 // We can't do a vXi16 compare so sign extend to v16i32.
28820 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
28821 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
28822 }
28823 } else {
28824 // Otherwise do the compare at vXi8.
28825 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28826 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28827 LowSign =
28828 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28829 }
28830
28831 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28832 } else {
28833 SDValue High =
28834 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28835 if (OvfVT.getVectorElementType() == MVT::i1 &&
28836 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28837         // Rather than truncating, try to do the compare on vXi16 or vXi32.
28838 SetccVT = OvfVT;
28839 if (!Subtarget.hasBWI()) {
28840 // We can't do a vXi16 compare so sign extend to v16i32.
28841 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
28842 }
28843 } else {
28844 // Otherwise do the compare at vXi8.
28845 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28846 }
28847
28848 Ovf =
28849 DAG.getSetCC(dl, SetccVT, High,
28850 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
28851 }
28852
28853 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28854
28855 return DAG.getMergeValues({Low, Ovf}, dl);
28856 }
28857
28858 SDValue Low;
28859 SDValue High =
28860 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
28861
28862 SDValue Ovf;
28863 if (IsSigned) {
28864 // SMULO overflows if the high bits don't match the sign of the low.
28865 SDValue LowSign =
28866 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28867 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28868 } else {
28869 // UMULO overflows if the high bits are non-zero.
28870 Ovf =
28871 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
28872 }
28873
28874 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28875
28876 return DAG.getMergeValues({Low, Ovf}, dl);
28877}
28878
28879SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
28880 assert(Subtarget.isTargetWin64() && "Unexpected target");
28881 EVT VT = Op.getValueType();
28882 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28883 "Unexpected return type for lowering");
28884
28885 if (isa<ConstantSDNode>(Op->getOperand(1))) {
28886     SmallVector<SDValue> Result;
28887     if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
28888 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
28889 }
28890
28891 RTLIB::Libcall LC;
28892 bool isSigned;
28893 switch (Op->getOpcode()) {
28894 // clang-format off
28895 default: llvm_unreachable("Unexpected request for libcall!");
28896 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
28897 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
28898 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
28899 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
28900 // clang-format on
28901 }
28902
28903 SDLoc dl(Op);
28904 SDValue InChain = DAG.getEntryNode();
28905
28906   TargetLowering::ArgListTy Args;
28907   TargetLowering::ArgListEntry Entry;
28908   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
28909 EVT ArgVT = Op->getOperand(i).getValueType();
28910 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28911 "Unexpected argument type for lowering");
28912 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28913 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28914     MachinePointerInfo MPI =
28915         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28916     Entry.Node = StackPtr;
28917 InChain =
28918 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
28919 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
28920 Entry.Ty = PointerType::get(ArgTy,0);
28921 Entry.IsSExt = false;
28922 Entry.IsZExt = false;
28923 Args.push_back(Entry);
28924 }
28925
28926   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
28927                                          getPointerTy(DAG.getDataLayout()));
28928
28929   TargetLowering::CallLoweringInfo CLI(DAG);
28930   CLI.setDebugLoc(dl)
28931       .setChain(InChain)
28932       .setLibCallee(
28933           getLibcallCallingConv(LC),
28934           static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
28935 std::move(Args))
28936 .setInRegister()
28937 .setSExtResult(isSigned)
28938 .setZExtResult(!isSigned);
28939
28940 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
28941 return DAG.getBitcast(VT, CallInfo.first);
28942}
28943
28944SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
28945 SelectionDAG &DAG,
28946 SDValue &Chain) const {
28947 assert(Subtarget.isTargetWin64() && "Unexpected target");
28948 EVT VT = Op.getValueType();
28949 bool IsStrict = Op->isStrictFPOpcode();
28950
28951 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
28952 EVT ArgVT = Arg.getValueType();
28953
28954 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28955 "Unexpected return type for lowering");
28956
28957 RTLIB::Libcall LC;
28958 if (Op->getOpcode() == ISD::FP_TO_SINT ||
28959 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
28960 LC = RTLIB::getFPTOSINT(ArgVT, VT);
28961 else
28962 LC = RTLIB::getFPTOUINT(ArgVT, VT);
28963 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
28964
28965 SDLoc dl(Op);
28966 MakeLibCallOptions CallOptions;
28967 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
28968
28969   SDValue Result;
28970   // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
28971 // expected VT (i128).
28972 std::tie(Result, Chain) =
28973 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
28974 Result = DAG.getBitcast(VT, Result);
28975 return Result;
28976}
28977
28978SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
28979 SelectionDAG &DAG) const {
28980 assert(Subtarget.isTargetWin64() && "Unexpected target");
28981 EVT VT = Op.getValueType();
28982 bool IsStrict = Op->isStrictFPOpcode();
28983
28984 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
28985 EVT ArgVT = Arg.getValueType();
28986
28987 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28988 "Unexpected argument type for lowering");
28989
28990 RTLIB::Libcall LC;
28991 if (Op->getOpcode() == ISD::SINT_TO_FP ||
28992 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
28993 LC = RTLIB::getSINTTOFP(ArgVT, VT);
28994 else
28995 LC = RTLIB::getUINTTOFP(ArgVT, VT);
28996 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
28997
28998 SDLoc dl(Op);
28999 MakeLibCallOptions CallOptions;
29000 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29001
29002 // Pass the i128 argument as an indirect argument on the stack.
29003 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29004 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29005   MachinePointerInfo MPI =
29006       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29007   Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
29008
29009   SDValue Result;
29010   std::tie(Result, Chain) =
29011 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
29012 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
29013}
29014
29015// Return true if the required (according to Opcode) shift-imm form is natively
29016// supported by the Subtarget
29017static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
29018 unsigned Opcode) {
29019 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29020 "Unexpected shift opcode");
29021
29022 if (!VT.isSimple())
29023 return false;
29024
29025 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29026 return false;
29027
29028 if (VT.getScalarSizeInBits() < 16)
29029 return false;
29030
29031 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
29032 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
29033 return true;
29034
29035 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
29036 (VT.is256BitVector() && Subtarget.hasInt256());
29037
29038 bool AShift = LShift && (Subtarget.hasAVX512() ||
29039 (VT != MVT::v2i64 && VT != MVT::v4i64));
29040 return (Opcode == ISD::SRA) ? AShift : LShift;
29041}
29042
29043// The shift amount is a variable, but it is the same for all vector lanes.
29044// These instructions are defined together with shift-immediate.
29045 static
29046 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
29047                                       unsigned Opcode) {
29048 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
29049}
29050
29051// Return true if the required (according to Opcode) variable-shift form is
29052// natively supported by the Subtarget
29053static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
29054 unsigned Opcode) {
29055 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29056 "Unexpected shift opcode");
29057
29058 if (!VT.isSimple())
29059 return false;
29060
29061 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29062 return false;
29063
29064 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
29065 return false;
29066
29067 // vXi16 supported only on AVX-512, BWI
29068 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
29069 return false;
29070
29071 if (Subtarget.hasAVX512() &&
29072 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
29073 return true;
29074
29075 bool LShift = VT.is128BitVector() || VT.is256BitVector();
29076 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
29077 return (Opcode == ISD::SRA) ? AShift : LShift;
29078}
29079
29080 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
29081                                            const X86Subtarget &Subtarget) {
29082 MVT VT = Op.getSimpleValueType();
29083 SDLoc dl(Op);
29084 SDValue R = Op.getOperand(0);
29085 SDValue Amt = Op.getOperand(1);
29086 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
29087 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29088
29089 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
29090 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
29091 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
29092 SDValue Ex = DAG.getBitcast(ExVT, R);
29093
29094 // ashr(R, 63) === cmp_slt(R, 0)
29095 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
29096 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
29097 "Unsupported PCMPGT op");
29098 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
29099 }
29100
29101 if (ShiftAmt >= 32) {
29102 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
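      // Viewing each i64 lane as a {lo, hi} pair of i32s, the shuffles below
      // build {sra(hi, ShiftAmt - 32), sra(hi, 31)} per lane: the shifted high
      // word becomes the new low word and the splatted sign bit fills the new
      // high word, which is exactly a 64-bit arithmetic shift by >= 32.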
29103 SDValue Upper =
29104           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
29105       SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29106                                                  ShiftAmt - 32, DAG);
29107 if (VT == MVT::v2i64)
29108 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
29109 if (VT == MVT::v4i64)
29110 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29111 {9, 1, 11, 3, 13, 5, 15, 7});
29112 } else {
29113       // SRA upper i32, SRL whole i64 and select lower i32.
29114       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29115                                                  ShiftAmt, DAG);
29116 SDValue Lower =
29117 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
29118 Lower = DAG.getBitcast(ExVT, Lower);
29119 if (VT == MVT::v2i64)
29120 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
29121 if (VT == MVT::v4i64)
29122 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29123 {8, 1, 10, 3, 12, 5, 14, 7});
29124 }
29125 return DAG.getBitcast(VT, Ex);
29126 };
29127
29128 // Optimize shl/srl/sra with constant shift amount.
29129 APInt APIntShiftAmt;
29130 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
29131 return SDValue();
29132
29133 // If the shift amount is out of range, return undef.
29134 if (APIntShiftAmt.uge(EltSizeInBits))
29135 return DAG.getUNDEF(VT);
29136
29137 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
29138
29139 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
29140     // Hardware support for vector shifts is sparse, which makes us scalarize
29141     // the vector operations in many cases. Also, on Sandy Bridge ADD is faster
29142     // than shl: (shl V, 1) -> (add (freeze V), (freeze V))
29143 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29144 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29145 // must be 0). (add undef, undef) however can be any value. To make this
29146 // safe, we must freeze R to ensure that register allocation uses the same
29147 // register for an undefined value. This ensures that the result will
29148 // still be even and preserves the original semantics.
29149 R = DAG.getFreeze(R);
29150 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29151 }
29152
29153 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
29154 }
29155
29156 // i64 SRA needs to be performed as partial shifts.
29157 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
29158 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
29159 Op.getOpcode() == ISD::SRA)
29160 return ArithmeticShiftRight64(ShiftAmt);
29161
29162   // If we're logical shifting an all-signbits value then we can just perform
29163   // this as a mask.
29164 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
29165 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
29166 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
29167 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
29168 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
29169 }
29170
29171 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29172 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
29173 unsigned NumElts = VT.getVectorNumElements();
29174 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29175
29176 // Simple i8 add case
29177 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29178 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29179 // must be 0). (add undef, undef) however can be any value. To make this
29180 // safe, we must freeze R to ensure that register allocation uses the same
29181 // register for an undefined value. This ensures that the result will
29182 // still be even and preserves the original semantics.
29183 R = DAG.getFreeze(R);
29184 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29185 }
29186
29187 // ashr(R, 7) === cmp_slt(R, 0)
29188 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
29189 SDValue Zeros = DAG.getConstant(0, dl, VT);
29190 if (VT.is512BitVector()) {
29191 assert(VT == MVT::v64i8 && "Unexpected element type!");
29192 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
29193 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
29194 }
29195 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
29196 }
29197
29198 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
29199 if (VT == MVT::v16i8 && Subtarget.hasXOP())
29200 return SDValue();
29201
29202 if (Op.getOpcode() == ISD::SHL) {
29203 // Make a large shift.
29204 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
29205 ShiftAmt, DAG);
29206 SHL = DAG.getBitcast(VT, SHL);
29207 // Zero out the rightmost bits.
29208 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
29209 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
29210 }
29211 if (Op.getOpcode() == ISD::SRL) {
29212 // Make a large shift.
29213 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
29214 ShiftAmt, DAG);
29215 SRL = DAG.getBitcast(VT, SRL);
29216 // Zero out the leftmost bits.
29217 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
29218 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
29219 }
29220 if (Op.getOpcode() == ISD::SRA) {
29221 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
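      // Worked example: for i8 R = 0xF0 (-16) and ShiftAmt = 2, lshr gives
      // 0x3C and Mask = 128 >> 2 = 0x20; the xor yields 0x1C and the sub
      // yields 0x1C - 0x20 = 0xFC (-4), which matches ashr(-16, 2). For
      // non-negative inputs the shifted sign-bit position is clear, so the
      // xor/sub pair cancels and the logical shift result is unchanged.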
29222 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29223
29224 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
29225 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
29226 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
29227 return Res;
29228 }
29229 llvm_unreachable("Unknown shift opcode.");
29230 }
29231
29232 return SDValue();
29233}
29234
29235 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
29236                                           const X86Subtarget &Subtarget) {
29237 MVT VT = Op.getSimpleValueType();
29238 SDLoc dl(Op);
29239 SDValue R = Op.getOperand(0);
29240 SDValue Amt = Op.getOperand(1);
29241 unsigned Opcode = Op.getOpcode();
29242 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
29243
29244 int BaseShAmtIdx = -1;
29245 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
29246 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
29247 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
29248 Subtarget, DAG);
29249
29250 // vXi8 shifts - shift as v8i16 + mask result.
29251 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
29252 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
29253 VT == MVT::v64i8) &&
29254 !Subtarget.hasXOP()) {
29255 unsigned NumElts = VT.getVectorNumElements();
29256 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29257 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
29258 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
29259 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
29260
29261 // Create the mask using vXi16 shifts. For shift-rights we need to move
29262 // the upper byte down before splatting the vXi8 mask.
29263 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
29264 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
29265 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
29266 if (Opcode != ISD::SHL)
29267 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
29268 8, DAG);
29269 BitMask = DAG.getBitcast(VT, BitMask);
29270 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
29271 SmallVector<int, 64>(NumElts, 0));
29272
29273 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
29274 DAG.getBitcast(ExtVT, R), BaseShAmt,
29275 BaseShAmtIdx, Subtarget, DAG);
29276 Res = DAG.getBitcast(VT, Res);
29277 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
29278
29279 if (Opcode == ISD::SRA) {
29280 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
29281 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
29282 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
29283 SignMask =
29284 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
29285 BaseShAmtIdx, Subtarget, DAG);
29286 SignMask = DAG.getBitcast(VT, SignMask);
29287 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
29288 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
29289 }
29290 return Res;
29291 }
29292 }
29293 }
29294
29295 return SDValue();
29296}
29297
29298// Convert a shift/rotate left amount to a multiplication scale factor.
29299 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
29300                                        const X86Subtarget &Subtarget,
29301 SelectionDAG &DAG) {
29302 MVT VT = Amt.getSimpleValueType();
29303 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
29304 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
29305 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
29306 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
29307 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29308 (Subtarget.hasBWI() && VT == MVT::v64i8)))
29309 return SDValue();
29310
29311 MVT SVT = VT.getVectorElementType();
29312 unsigned SVTBits = SVT.getSizeInBits();
29313 unsigned NumElems = VT.getVectorNumElements();
29314
29315 APInt UndefElts;
29316 SmallVector<APInt> EltBits;
29317 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
29318 APInt One(SVTBits, 1);
29319 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
29320 for (unsigned I = 0; I != NumElems; ++I) {
29321 if (UndefElts[I] || EltBits[I].uge(SVTBits))
29322 continue;
29323 uint64_t ShAmt = EltBits[I].getZExtValue();
29324 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
29325 }
29326 return DAG.getBuildVector(VT, dl, Elts);
29327 }
29328
29329 // If the target doesn't support variable shifts, use either FP conversion
29330 // or integer multiplication to avoid shifting each element individually.
29331 if (VT == MVT::v4i32) {
29332 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
29333 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
29334 DAG.getConstant(0x3f800000U, dl, VT));
29335 Amt = DAG.getBitcast(MVT::v4f32, Amt);
29336 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
29337 }
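  // The v4i32 path above is a float-exponent trick: 0x3f800000 is the IEEE-754
  // encoding of 1.0f (biased exponent 127), so adding (Amt << 23) bumps the
  // exponent by Amt and the bitcast value is exactly 2^Amt. For example
  // Amt = 5 gives the bits 0x42000000 == 32.0f, and the FP_TO_SINT (CVTTPS2DQ
  // on x86) then produces the desired scale factor 1 << 5. For Amt == 31 the
  // value is out of signed i32 range and CVTTPS2DQ returns 0x80000000, which
  // happens to be the required 1 << 31 bit pattern anyway.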
29338
29339 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
29340 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
29341 SDValue Z = DAG.getConstant(0, dl, VT);
29342 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
29343 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
29344 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
29345 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
29346 if (Subtarget.hasSSE41())
29347 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29348 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
29349 }
29350
29351 return SDValue();
29352}
29353
29354static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
29355 SelectionDAG &DAG) {
29356 MVT VT = Op.getSimpleValueType();
29357 SDLoc dl(Op);
29358 SDValue R = Op.getOperand(0);
29359 SDValue Amt = Op.getOperand(1);
29360 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29361 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29362
29363 unsigned Opc = Op.getOpcode();
29364 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
29365 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
29366
29367 assert(VT.isVector() && "Custom lowering only for vector shifts!");
29368 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
29369
29370 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
29371 return V;
29372
29373 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
29374 return V;
29375
29376 if (supportedVectorVarShift(VT, Subtarget, Opc))
29377 return Op;
29378
29379 // i64 vector arithmetic shift can be emulated with the transform:
29380 // M = lshr(SIGN_MASK, Amt)
29381 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
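  // Pre-AVX-512 SSE/AVX has no 64-bit arithmetic shift instruction (PSRAQ only
  // exists as an AVX-512 instruction, and XOP's VPSHAQ is handled separately
  // below), so the arithmetic shift is rebuilt from logical shifts: M isolates
  // the position the sign bit lands in after the shift, and the xor/sub pair
  // sign-extends it across the vacated upper bits.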
29382 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
29383 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
29384 Opc == ISD::SRA) {
29385 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
29386 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
29387 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29388 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
29389 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
29390 return R;
29391 }
29392
29393 // XOP has 128-bit variable logical/arithmetic shifts.
29394 // +ve/-ve Amt = shift left/right.
29395 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
29396 VT == MVT::v8i16 || VT == MVT::v16i8)) {
29397 if (Opc == ISD::SRL || Opc == ISD::SRA)
29398 Amt = DAG.getNegative(Amt, dl, VT);
29399 if (Opc == ISD::SHL || Opc == ISD::SRL)
29400 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
29401 if (Opc == ISD::SRA)
29402 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
29403 }
29404
29405 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
29406 // shifts per-lane and then shuffle the partial results back together.
29407 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
29408 // Splat the shift amounts so the scalar shifts above will catch it.
29409 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
29410 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
29411 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
29412 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
29413 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
29414 }
29415
29416 // If possible, lower this shift as a sequence of two shifts by
29417 // constant plus a BLENDing shuffle instead of scalarizing it.
29418 // Example:
29419 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
29420 //
29421 // Could be rewritten as:
29422 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
29423 //
29424 // The advantage is that the two shifts from the example would be
29425 // lowered as X86ISD::VSRLI nodes in parallel before blending.
29426 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
29427 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29428 SDValue Amt1, Amt2;
29429 unsigned NumElts = VT.getVectorNumElements();
29430 SmallVector<int, 8> ShuffleMask;
29431 for (unsigned i = 0; i != NumElts; ++i) {
29432 SDValue A = Amt->getOperand(i);
29433 if (A.isUndef()) {
29434 ShuffleMask.push_back(SM_SentinelUndef);
29435 continue;
29436 }
29437 if (!Amt1 || Amt1 == A) {
29438 ShuffleMask.push_back(i);
29439 Amt1 = A;
29440 continue;
29441 }
29442 if (!Amt2 || Amt2 == A) {
29443 ShuffleMask.push_back(i + NumElts);
29444 Amt2 = A;
29445 continue;
29446 }
29447 break;
29448 }
29449
29450 // Only perform this blend if we can perform it without loading a mask.
29451 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
29452 (VT != MVT::v16i16 ||
29453 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
29454 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
29455 canWidenShuffleElements(ShuffleMask))) {
29456 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
29457 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
29458 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
29459 Cst2->getAPIntValue().ult(EltSizeInBits)) {
29460 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29461 Cst1->getZExtValue(), DAG);
29462 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29463 Cst2->getZExtValue(), DAG);
29464 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
29465 }
29466 }
29467 }
29468
29469 // If possible, lower this packed shift into a vector multiply instead of
29470 // expanding it into a sequence of scalar shifts.
29471 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
29472 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
29473 Subtarget.canExtendTo512BW())))
29474 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
29475 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
29476
29477 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
29478 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
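  // This works for 0 < Amt < 16 because mulhu(x, 1 << (16 - Amt)) computes
  // (x * 2^(16 - Amt)) >> 16 == x >> Amt; e.g. srl(0xF000, 4) ==
  // mulhu(0xF000, 0x1000) == 0x0F00. Amt == 0 would need a 2^16 scale factor,
  // which is why the lowering below selects R directly for zero amounts.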
29479 if (Opc == ISD::SRL && ConstantAmt &&
29480 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29481 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29482 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29483 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29484 SDValue Zero = DAG.getConstant(0, dl, VT);
29485 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
29486 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
29487 return DAG.getSelect(dl, VT, ZAmt, R, Res);
29488 }
29489 }
29490
29491 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
29492 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
29493 // TODO: Special case handling for shift by 0/1, really we can afford either
29494 // of these cases in pre-SSE41/XOP/AVX512 but not both.
29495 if (Opc == ISD::SRA && ConstantAmt &&
29496 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
29497 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
29498 !Subtarget.hasAVX512()) ||
29499 DAG.isKnownNeverZero(Amt))) {
29500 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29501 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29502 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29503 SDValue Amt0 =
29504 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
29505 SDValue Amt1 =
29506 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
29507 SDValue Sra1 =
29508 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
29509 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
29510 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
29511 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
29512 }
29513 }
29514
29515 // v4i32 Non Uniform Shifts.
29516 // If the shift amount is constant we can shift each lane using the SSE2
29517 // immediate shifts, else we need to zero-extend each lane to the lower i64
29518 // and shift using the SSE2 variable shifts.
29519 // The separate results can then be blended together.
29520 if (VT == MVT::v4i32) {
29521 SDValue Amt0, Amt1, Amt2, Amt3;
29522 if (ConstantAmt) {
29523 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
29524 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
29525 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
29526 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
29527 } else {
29528 // The SSE2 shifts use the lower i64 as the same shift amount for
29529 // all lanes and the upper i64 is ignored. On AVX we're better off
29530 // just zero-extending, but for SSE just duplicating the top 16-bits is
29531 // cheaper and has the same effect for out of range values.
29532 if (Subtarget.hasAVX()) {
29533 SDValue Z = DAG.getConstant(0, dl, VT);
29534 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
29535 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
29536 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
29537 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
29538 } else {
29539 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
29540 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29541 {4, 5, 6, 7, -1, -1, -1, -1});
29542 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
29543 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
29544 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
29545 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
29546 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
29547 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
29548 }
29549 }
29550
29551 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
29552 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
29553 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
29554 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
29555 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
29556
29557 // Merge the shifted lane results optimally with/without PBLENDW.
29558 // TODO - ideally shuffle combining would handle this.
29559 if (Subtarget.hasSSE41()) {
29560 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
29561 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
29562 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
29563 }
29564 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
29565 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
29566 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
29567 }
29568
29569 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
29570 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
29571 // make the existing SSE solution better.
29572   // NOTE: We honor preferred vector width before promoting to 512-bits.
29573 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
29574 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
29575 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
29576 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
29577 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
29578 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
29579 "Unexpected vector type");
29580 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
29581 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
29582 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29583 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
29584 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
29585 return DAG.getNode(ISD::TRUNCATE, dl, VT,
29586 DAG.getNode(Opc, dl, ExtVT, R, Amt));
29587 }
29588
29589 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
29590 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
29591 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
29592 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29593 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29594 !Subtarget.hasXOP()) {
29595 int NumElts = VT.getVectorNumElements();
29596 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
29597
29598 // Extend constant shift amount to vXi16 (it doesn't matter if the type
29599 // isn't legal).
29600 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29601 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
29602 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
29603 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
29604     assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
29605            "Constant build vector expected");
29606
29607 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
29608 bool IsSigned = Opc == ISD::SRA;
29609 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
29610 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
29611 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
29612 return DAG.getZExtOrTrunc(R, dl, VT);
29613 }
29614
29615 SmallVector<SDValue, 16> LoAmt, HiAmt;
29616 for (int i = 0; i != NumElts; i += 16) {
29617 for (int j = 0; j != 8; ++j) {
29618 LoAmt.push_back(Amt.getOperand(i + j));
29619 HiAmt.push_back(Amt.getOperand(i + j + 8));
29620 }
29621 }
29622
29623 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
29624 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
29625 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
29626
29627 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
29628 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
29629 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
29630 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
29631 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
29632 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
29633 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
29634 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
29635 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
29636 }
29637
29638 if (VT == MVT::v16i8 ||
29639 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
29640 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
29641 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
29642
29643 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29644 if (VT.is512BitVector()) {
29645 // On AVX512BW targets we make use of the fact that VSELECT lowers
29646 // to a masked blend which selects bytes based just on the sign bit
29647 // extracted to a mask.
29648 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
29649 V0 = DAG.getBitcast(VT, V0);
29650 V1 = DAG.getBitcast(VT, V1);
29651 Sel = DAG.getBitcast(VT, Sel);
29652 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
29653 ISD::SETGT);
29654 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
29655 } else if (Subtarget.hasSSE41()) {
29656 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29657 // on the sign bit.
29658 V0 = DAG.getBitcast(VT, V0);
29659 V1 = DAG.getBitcast(VT, V1);
29660 Sel = DAG.getBitcast(VT, Sel);
29661 return DAG.getBitcast(SelVT,
29662 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
29663 }
29664 // On pre-SSE41 targets we test for the sign bit by comparing to
29665 // zero - a negative value will set all bits of the lanes to true
29666 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29667 SDValue Z = DAG.getConstant(0, dl, SelVT);
29668 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
29669 return DAG.getSelect(dl, SelVT, C, V0, V1);
29670 };
29671
29672 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29673 // We can safely do this using i16 shifts as we're only interested in
29674 // the 3 lower bits of each byte.
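    // After this shift, bit 2 of each amount byte (the '4' bit) sits in the
    // byte's sign bit, which is all that BLENDV/PCMPGT inspect. Each 'a += a'
    // below then promotes the next lower amount bit into the sign position, so
    // the three selects conditionally apply shift-by-4, shift-by-2 and
    // shift-by-1 stages that compose into any shift amount in [0, 7].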
29675 Amt = DAG.getBitcast(ExtVT, Amt);
29676 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
29677 Amt = DAG.getBitcast(VT, Amt);
29678
29679 if (Opc == ISD::SHL || Opc == ISD::SRL) {
29680 // r = VSELECT(r, shift(r, 4), a);
29681 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
29682 R = SignBitSelect(VT, Amt, M, R);
29683
29684 // a += a
29685 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29686
29687 // r = VSELECT(r, shift(r, 2), a);
29688 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
29689 R = SignBitSelect(VT, Amt, M, R);
29690
29691 // a += a
29692 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29693
29694 // return VSELECT(r, shift(r, 1), a);
29695 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
29696 R = SignBitSelect(VT, Amt, M, R);
29697 return R;
29698 }
29699
29700 if (Opc == ISD::SRA) {
29701 // For SRA we need to unpack each byte to the higher byte of a i16 vector
29702 // so we can correctly sign extend. We don't care what happens to the
29703 // lower byte.
29704 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29705 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29706 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
29707 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
29708 ALo = DAG.getBitcast(ExtVT, ALo);
29709 AHi = DAG.getBitcast(ExtVT, AHi);
29710 RLo = DAG.getBitcast(ExtVT, RLo);
29711 RHi = DAG.getBitcast(ExtVT, RHi);
29712
29713 // r = VSELECT(r, shift(r, 4), a);
29714 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
29715 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
29716 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29717 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29718
29719 // a += a
29720 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29721 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29722
29723 // r = VSELECT(r, shift(r, 2), a);
29724 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
29725 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
29726 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29727 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29728
29729 // a += a
29730 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29731 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29732
29733 // r = VSELECT(r, shift(r, 1), a);
29734 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
29735 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
29736 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29737 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29738
29739 // Logical shift the result back to the lower byte, leaving a zero upper
29740 // byte meaning that we can safely pack with PACKUSWB.
29741 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
29742 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
29743 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
29744 }
29745 }
29746
29747 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
29748 MVT ExtVT = MVT::v8i32;
29749 SDValue Z = DAG.getConstant(0, dl, VT);
29750 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
29751 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
29752 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
29753 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
29754 ALo = DAG.getBitcast(ExtVT, ALo);
29755 AHi = DAG.getBitcast(ExtVT, AHi);
29756 RLo = DAG.getBitcast(ExtVT, RLo);
29757 RHi = DAG.getBitcast(ExtVT, RHi);
29758 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
29759 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
29760 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
29761 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
29762 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29763 }
29764
29765 if (VT == MVT::v8i16) {
29766     // If we have a constant shift amount, the non-SSE41 path is best as
29767     // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
29768     bool UseSSE41 = Subtarget.hasSSE41() &&
29769                     !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29770
29771 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
29772 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
29773 // the sign bit.
29774 if (UseSSE41) {
29775 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
29776 V0 = DAG.getBitcast(ExtVT, V0);
29777 V1 = DAG.getBitcast(ExtVT, V1);
29778 Sel = DAG.getBitcast(ExtVT, Sel);
29779 return DAG.getBitcast(
29780 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
29781 }
29782 // On pre-SSE41 targets we splat the sign bit - a negative value will
29783 // set all bits of the lanes to true and VSELECT uses that in
29784 // its OR(AND(V0,C),AND(V1,~C)) lowering.
29785 SDValue C =
29786 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
29787 return DAG.getSelect(dl, VT, C, V0, V1);
29788 };
29789
29790 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
29791 if (UseSSE41) {
29792 // On SSE41 targets we need to replicate the shift mask in both
29793 // bytes for PBLENDVB.
29794 Amt = DAG.getNode(
29795 ISD::OR, dl, VT,
29796 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
29797 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
29798 } else {
29799 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
29800 }
29801
29802 // r = VSELECT(r, shift(r, 8), a);
29803 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
29804 R = SignBitSelect(Amt, M, R);
29805
29806 // a += a
29807 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29808
29809 // r = VSELECT(r, shift(r, 4), a);
29810 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
29811 R = SignBitSelect(Amt, M, R);
29812
29813 // a += a
29814 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29815
29816 // r = VSELECT(r, shift(r, 2), a);
29817 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
29818 R = SignBitSelect(Amt, M, R);
29819
29820 // a += a
29821 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29822
29823 // return VSELECT(r, shift(r, 1), a);
29824 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
29825 R = SignBitSelect(Amt, M, R);
29826 return R;
29827 }
29828
29829 // Decompose 256-bit shifts into 128-bit shifts.
29830 if (VT.is256BitVector())
29831 return splitVectorIntBinary(Op, DAG, dl);
29832
29833 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29834 return splitVectorIntBinary(Op, DAG, dl);
29835
29836 return SDValue();
29837}
29838
29839 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
29840                                 SelectionDAG &DAG) {
29841 MVT VT = Op.getSimpleValueType();
29842 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
29843 "Unexpected funnel shift opcode!");
29844
29845 SDLoc DL(Op);
29846 SDValue Op0 = Op.getOperand(0);
29847 SDValue Op1 = Op.getOperand(1);
29848 SDValue Amt = Op.getOperand(2);
29849 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29850 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
29851
29852 if (VT.isVector()) {
29853 APInt APIntShiftAmt;
29854 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
29855 unsigned NumElts = VT.getVectorNumElements();
29856
29857 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
29858 if (IsFSHR)
29859 std::swap(Op0, Op1);
29860
29861 if (IsCstSplat) {
29862 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29863 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
29864 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
29865 {Op0, Op1, Imm}, DAG, Subtarget);
29866 }
29867 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
29868 {Op0, Op1, Amt}, DAG, Subtarget);
29869 }
29870 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
29871 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
29872 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
29873 "Unexpected funnel shift type!");
29874
29875     // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
29876     // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
29877 if (IsCstSplat) {
29878 // TODO: Can't use generic expansion as UNDEF amt elements can be
29879 // converted to other values when folded to shift amounts, losing the
29880 // splat.
29881 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29882 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
29883 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
29884 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
29885
29886 if (EltSizeInBits == 8 && ShXAmt > 1 &&
29887 (Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT))) {
29888 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
29889 // bit-select - lower using vXi16 shifts and then perform the bitmask at
29890 // the original vector width to handle cases where we split.
29891 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29892 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
29893 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
29894 SDValue ShX =
29895 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
29896 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
29897 SDValue ShY =
29898 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
29899 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
29900 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
29901 DAG.getConstant(MaskX, DL, VT));
29902 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
29903 DAG.getConstant(MaskY, DL, VT));
29904 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
29905 }
29906
29907 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
29908 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
29909 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
29910 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
29911 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
29912 }
29913
29914 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
29915 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
29916 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
29917
29918 // Constant vXi16 funnel shifts can be efficiently handled by default.
29919 if (IsCst && EltSizeInBits == 16)
29920 return SDValue();
29921
29922 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
29923 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
29924 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
29925
29926 // Split 256-bit integers on XOP/pre-AVX2 targets.
29927 // Split 512-bit integers on non 512-bit BWI targets.
29928 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
29929 !Subtarget.hasAVX2())) ||
29930 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
29931 EltSizeInBits < 32)) {
29932 // Pre-mask the amount modulo using the wider vector.
29933 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
29934 return splitVectorOp(Op, DAG, DL);
29935 }
29936
29937 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
29938 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
29939 int ScalarAmtIdx = -1;
29940 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
29941 // Uniform vXi16 funnel shifts can be efficiently handled by default.
29942 if (EltSizeInBits == 16)
29943 return SDValue();
29944
29945 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
29946 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
29947 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
29948 ScalarAmtIdx, Subtarget, DAG);
29949 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
29950 ScalarAmtIdx, Subtarget, DAG);
29951 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
29952 }
29953 }
29954
29955 MVT WideSVT = MVT::getIntegerVT(
29956 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
29957 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
29958
29959 // If per-element shifts are legal, fallback to generic expansion.
29960 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
29961 return SDValue();
29962
29963 // Attempt to fold as:
29964 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
29965 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
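      // For example with i8 elements widened to i16: fshl(0x12, 0xAB, 3)
      // becomes ((0x1200 | 0x00AB) << 3) >> 8 = 0x9558 >> 8 = 0x95, i.e.
      // (x << 3) | (y >> 5); fshr(0x12, 0xAB, 3) instead keeps the low byte of
      // 0x12AB >> 3 = 0x0255, giving 0x55.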
29966 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
29967 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
29968 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
29969 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
29970 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
29971 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
29972 EltSizeInBits, DAG);
29973 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
29974 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
29975 if (!IsFSHR)
29976 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
29977 EltSizeInBits, DAG);
29978 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
29979 }
29980
29981 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
29982 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
29983 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
29984 SDValue Z = DAG.getConstant(0, DL, VT);
29985 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
29986 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
29987 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
29988 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
29989 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
29990 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
29991 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
29992 }
29993
29994 // Fallback to generic expansion.
29995 return SDValue();
29996 }
29997 assert(
29998 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
29999 "Unexpected funnel shift type!");
30000
30001 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
30002 bool OptForSize = DAG.shouldOptForSize();
30003 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
30004
30005 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30006 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30007 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
30008 !isa<ConstantSDNode>(Amt)) {
30009 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
30010 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
30011 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
30012 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
30013 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
30014 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
30015 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
30016 if (IsFSHR) {
30017 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
30018 } else {
30019 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
30020 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
30021 }
30022 return DAG.getZExtOrTrunc(Res, DL, VT);
30023 }
30024
30025 if (VT == MVT::i8 || ExpandFunnel)
30026 return SDValue();
30027
30028 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
30029 if (VT == MVT::i16) {
30030 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
30031 DAG.getConstant(15, DL, Amt.getValueType()));
30032 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
30033 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
30034 }
30035
30036 return Op;
30037}
30038
30039static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
30040 SelectionDAG &DAG) {
30041 MVT VT = Op.getSimpleValueType();
30042 assert(VT.isVector() && "Custom lowering only for vector rotates!");
30043
30044 SDLoc DL(Op);
30045 SDValue R = Op.getOperand(0);
30046 SDValue Amt = Op.getOperand(1);
30047 unsigned Opcode = Op.getOpcode();
30048 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30049 int NumElts = VT.getVectorNumElements();
30050 bool IsROTL = Opcode == ISD::ROTL;
30051
30052 // Check for constant splat rotation amount.
30053 APInt CstSplatValue;
30054 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
30055
30056 // Check for splat rotate by zero.
30057 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
30058 return R;
30059
30060 // AVX512 implicitly uses modulo rotation amounts.
30061 if ((Subtarget.hasVLX() ||
30062 (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) &&
30063 32 <= EltSizeInBits) {
30064 // Attempt to rotate by immediate.
30065 if (IsCstSplat) {
30066 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
30067 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30068 return DAG.getNode(RotOpc, DL, VT, R,
30069 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30070 }
30071
30072 // Else, fall-back on VPROLV/VPRORV.
30073 return Op;
30074 }
30075
30076 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
30077 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
30078 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30079 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30080 }
30081
30082 SDValue Z = DAG.getConstant(0, DL, VT);
30083
30084 if (!IsROTL) {
30085 // If the ISD::ROTR amount is constant, we're always better converting to
30086 // ISD::ROTL.
30087 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
30088 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
30089
30090     // XOP targets always prefer ISD::ROTL.
30091 if (Subtarget.hasXOP())
30092 return DAG.getNode(ISD::ROTL, DL, VT, R,
30093 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
30094 }
30095
30096 // Split 256-bit integers on XOP/pre-AVX2 targets.
30097 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
30098 return splitVectorIntBinary(Op, DAG, DL);
30099
30100 // XOP has 128-bit vector variable + immediate rotates.
30101 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
30102 // XOP implicitly uses modulo rotation amounts.
30103 if (Subtarget.hasXOP()) {
30104 assert(IsROTL && "Only ROTL expected");
30105 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
30106
30107 // Attempt to rotate by immediate.
30108 if (IsCstSplat) {
30109 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30110 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
30111 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30112 }
30113
30114 // Use general rotate by variable (per-element).
30115 return Op;
30116 }
30117
30118   // Rotate by a uniform constant - expand back to shifts.
30119 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
30120 // to other values when folded to shift amounts, losing the splat.
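  // For example, an i8 rotl by 3 expands to (x << 3) | (x >> 5): for
  // x = 0b10010011 the two halves are 0b10011000 and 0b00000100, which OR to
  // 0b10011100, the expected left-rotation.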
30121 if (IsCstSplat) {
30122 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30123 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
30124 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
30125 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
30126 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
30127 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
30128 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
30129 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
30130 }
30131
30132 // Split 512-bit integers on non 512-bit BWI targets.
30133 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
30134 return splitVectorIntBinary(Op, DAG, DL);
30135
30136 assert(
30137 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
30138 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
30139 Subtarget.hasAVX2()) ||
30140 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
30141 "Only vXi32/vXi16/vXi8 vector rotates supported");
30142
30143 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30144 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30145
30146 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30147 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30148
30149 // Attempt to fold as unpack(x,x) << zext(splat(y)):
30150 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30151 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30152 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
30153 int BaseRotAmtIdx = -1;
30154 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
30155 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
30156 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30157 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30158 }
30159 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
30160 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30161 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30162 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
30163 BaseRotAmtIdx, Subtarget, DAG);
30164 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
30165 BaseRotAmtIdx, Subtarget, DAG);
30166 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30167 }
30168 }
30169
30170 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30171 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
30172
30173 // Attempt to fold as unpack(x,x) << zext(y):
30174 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30175 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30176 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
30177 if (!(ConstantAmt && EltSizeInBits != 8) &&
30178 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
30179 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
30180 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30181 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30182 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30183 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30184 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30185 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30186 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30187 }
30188
30189 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
30190 // the amount bit.
30191 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
30192 if (EltSizeInBits == 8) {
30193 MVT WideVT =
30194 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
30195
30196 // Attempt to fold as:
30197 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
30198 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
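    // For example with BWI (i16 widening): x = 0xB1 is duplicated into 0xB1B1,
    // a rotl by 3 shifts this to 0x8D88, and the final >> 8 plus truncate
    // recovers 0x8D, which is 0xB1 rotated left by 3.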
30199 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30200 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30201 // If we're rotating by constant, just use default promotion.
30202 if (ConstantAmt)
30203 return SDValue();
30204 // See if we can perform this by widening to vXi16 or vXi32.
30205 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
30206 R = DAG.getNode(
30207 ISD::OR, DL, WideVT, R,
30208 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
30209 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30210 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
30211 if (IsROTL)
30212 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
30213 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
30214 }
30215
30216 // We don't need ModuloAmt here as we just peek at individual bits.
30217 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30218 if (Subtarget.hasSSE41()) {
30219 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30220 // on the sign bit.
30221 V0 = DAG.getBitcast(VT, V0);
30222 V1 = DAG.getBitcast(VT, V1);
30223 Sel = DAG.getBitcast(VT, Sel);
30224 return DAG.getBitcast(SelVT,
30225 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
30226 }
30227 // On pre-SSE41 targets we test for the sign bit by comparing to
30228 // zero - a negative value will set all bits of the lanes to true
30229 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30230 SDValue Z = DAG.getConstant(0, DL, SelVT);
30231 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
30232 return DAG.getSelect(DL, SelVT, C, V0, V1);
30233 };
30234
30235 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
30236 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
30237 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30238 IsROTL = true;
30239 }
30240
30241 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
30242 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
30243
30244 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30245 // We can safely do this using i16 shifts as we're only interested in
30246 // the 3 lower bits of each byte.
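// e.g. (illustrative): for an amount of 5 (0b101), a << 5 = 0b10100000 has the
// sign bit set, so the rot4 stage is taken; after a += a the sign bit is clear,
// so rot2 is skipped; after the next a += a it is set again, so rot1 is taken,
// giving a total rotate of 4 + 1 = 5.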
30247 Amt = DAG.getBitcast(ExtVT, Amt);
30248 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
30249 Amt = DAG.getBitcast(VT, Amt);
30250
30251 // r = VSELECT(r, rot(r, 4), a);
30252 SDValue M;
30253 M = DAG.getNode(
30254 ISD::OR, DL, VT,
30255 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
30256 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
30257 R = SignBitSelect(VT, Amt, M, R);
30258
30259 // a += a
30260 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30261
30262 // r = VSELECT(r, rot(r, 2), a);
30263 M = DAG.getNode(
30264 ISD::OR, DL, VT,
30265 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
30266 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
30267 R = SignBitSelect(VT, Amt, M, R);
30268
30269 // a += a
30270 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30271
30272 // return VSELECT(r, rot(r, 1), a);
30273 M = DAG.getNode(
30274 ISD::OR, DL, VT,
30275 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
30276 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
30277 return SignBitSelect(VT, Amt, M, R);
30278 }
30279
30280 bool IsSplatAmt = DAG.isSplatValue(Amt);
30281 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
30282 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
30283
30284 // Fallback for splats + all supported variable shifts.
30285 // Fallback for non-constant AVX2 vXi16 as well.
30286 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
30287 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30288 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
30289 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
30290 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
30291 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
30292 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
30293 }
30294
30295 // Everything below assumes ISD::ROTL.
30296 if (!IsROTL) {
30297 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30298 IsROTL = true;
30299 }
30300
30301 // ISD::ROT* uses modulo rotate amounts.
30302 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30303
30304 assert(IsROTL && "Only ROTL supported");
30305
30306 // As with shifts, attempt to convert the rotation amount to a multiplication
30307 // factor, fallback to general expansion.
30308 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
30309 if (!Scale)
30310 return SDValue();
30311
30312 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
30313 if (EltSizeInBits == 16) {
30314 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
30315 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
30316 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
30317 }
30318
30319 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
30320 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
30321 // that can then be OR'd with the lower 32-bits.
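// e.g. (illustrative): rotl(0x80000001, 1) scales by 2; the 64-bit product
// 0x00000001'00000002 holds the shifted value in its low half and the wrapped
// bit in its high half, and OR'ing the halves gives 0x00000003.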
30322 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
30323 static const int OddMask[] = {1, -1, 3, -1};
30324 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
30325 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
30326
30327 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30328 DAG.getBitcast(MVT::v2i64, R),
30329 DAG.getBitcast(MVT::v2i64, Scale));
30330 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30331 DAG.getBitcast(MVT::v2i64, R13),
30332 DAG.getBitcast(MVT::v2i64, Scale13));
30333 Res02 = DAG.getBitcast(VT, Res02);
30334 Res13 = DAG.getBitcast(VT, Res13);
30335
30336 return DAG.getNode(ISD::OR, DL, VT,
30337 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
30338 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
30339}
30340
30341/// Returns true if the operand type is exactly twice the native width, and
30342/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
30343/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
30344/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
30345bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
30346 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
30347
30348 if (OpWidth == 64)
30349 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
30350 if (OpWidth == 128)
30351 return Subtarget.canUseCMPXCHG16B();
30352
30353 return false;
30354}
30355
30356TargetLoweringBase::AtomicExpansionKind
30357X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
30358 Type *MemType = SI->getValueOperand()->getType();
30359
30360 bool NoImplicitFloatOps =
30361 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30362 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30363 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30364 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30365 return AtomicExpansionKind::None;
30366
30367 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
30368 : AtomicExpansionKind::None;
30369}
30370
30371// Note: this turns large loads into lock cmpxchg8b/16b.
30372// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
30373TargetLoweringBase::AtomicExpansionKind
30374X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
30375 Type *MemType = LI->getType();
30376
30377 // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
30378 // can use movq to do the load. If we have X87 we can load into an 80-bit
30379 // X87 register and store it to a stack temporary.
30380 bool NoImplicitFloatOps =
30381 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30382 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30383 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30384 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30385 return AtomicExpansionKind::None;
30386
30387 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30388 : AtomicExpansionKind::None;
30389}
30390
30391enum BitTestKind : unsigned {
30392 ConstantBit,
30393 NotConstantBit,
30394 ShiftBit,
30395 NotShiftBit,
30396 UndefBit
30397};
30398
30399static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
30400 using namespace llvm::PatternMatch;
30401 BitTestKind BTK = UndefBit;
30402 auto *C = dyn_cast<ConstantInt>(V);
30403 if (C) {
30404 // Check if V is a power of 2 or NOT power of 2.
30405 if (isPowerOf2_64(C->getZExtValue()))
30406 BTK = ConstantBit;
30407 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
30408 BTK = NotConstantBit;
30409 return {V, BTK};
30410 }
30411
30412 // Check if V is some power of 2 pattern known to be non-zero
30413 auto *I = dyn_cast<Instruction>(V);
30414 if (I) {
30415 bool Not = false;
30416 // Check if we have a NOT
30417 Value *PeekI;
30418 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
30419 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
30420 Not = true;
30421 I = dyn_cast<Instruction>(PeekI);
30422
30423 // If I is constant, it will fold and we can evaluate later. If it's an
30424 // argument or something of that nature, we can't analyze.
30425 if (I == nullptr)
30426 return {nullptr, UndefBit};
30427 }
30428 // We can only use 1 << X without more sophisticated analysis. C << X where
30429 // C is a power of 2 but not 1 can result in zero which cannot be translated
30430 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
30431 if (I->getOpcode() == Instruction::Shl) {
30432 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
30433 // -X` and some other provable power of 2 patterns that we can use CTZ on
30434 // may be profitable.
30435 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
30436 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
30437 // be provably a non-zero power of 2.
30438 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
30439 // transformable to bittest.
30440 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
30441 if (!ShiftVal)
30442 return {nullptr, UndefBit};
30443 if (ShiftVal->equalsInt(1))
30444 BTK = Not ? NotShiftBit : ShiftBit;
30445
30446 if (BTK == UndefBit)
30447 return {nullptr, UndefBit};
30448
30449 Value *BitV = I->getOperand(1);
30450
30451 Value *AndOp;
30452 const APInt *AndC;
30453 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
30454 // Read past a shiftmask instruction to find count
30455 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
30456 BitV = AndOp;
30457 }
30458 return {BitV, BTK};
30459 }
30460 }
30461 return {nullptr, UndefBit};
30462}
30463
30464TargetLoweringBase::AtomicExpansionKind
30465X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
30466 using namespace llvm::PatternMatch;
30467 // If the atomicrmw's result isn't actually used, we can just add a "lock"
30468 // prefix to a normal instruction for these operations.
30469 if (AI->use_empty())
30470 return AtomicExpansionKind::None;
30471
30472 if (AI->getOperation() == AtomicRMWInst::Xor) {
30473 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
30474 // preferable to both `cmpxchg` and `btc`.
30475 if (match(AI->getOperand(1), m_SignMask()))
30476 return AtomicExpansionKind::None;
30477 }
30478
30479 // If the atomicrmw's result is used by a single bit AND, we may use
30480 // bts/btr/btc instruction for these operations.
30481 // Note: InstCombinePass can cause a de-optimization here. It replaces the
30482 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
30483 // (depending on CC). This pattern can only use bts/btr/btc but we don't
30484 // detect it.
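// For example (illustrative IR): if %old = atomicrmw or ptr %p, i32 8 is only
// used by %m = and i32 %old, 8, both sides describe bit 3, so the whole
// sequence can become a single lock bts on %p with bit index 3.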
30485 Instruction *I = AI->user_back();
30486 auto BitChange = FindSingleBitChange(AI->getValOperand());
30487 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
30488 I->getOpcode() != Instruction::And ||
30489 AI->getType()->getPrimitiveSizeInBits() == 8 ||
30490 AI->getParent() != I->getParent())
30491 return AtomicExpansionKind::CmpXChg;
30492
30493 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
30494
30495 // This is a redundant AND, it should get cleaned up elsewhere.
30496 if (AI == I->getOperand(OtherIdx))
30497 return AtomicExpansionKind::CmpXChg;
30498
30499 // The following instruction must be an AND of a single bit.
30500 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
30501 auto *C1 = cast<ConstantInt>(AI->getValOperand());
30502 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
30503 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
30504 return AtomicExpansionKind::CmpXChg;
30505 }
30506 if (AI->getOperation() == AtomicRMWInst::And) {
30507 return ~C1->getValue() == C2->getValue()
30508 ? AtomicExpansionKind::BitTestIntrinsic
30509 : AtomicExpansionKind::CmpXChg;
30510 }
30511 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
30512 : AtomicExpansionKind::CmpXChg;
30513 }
30514
30515 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
30516
30517 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
30518 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
30519 return AtomicExpansionKind::CmpXChg;
30520
30521 assert(BitChange.first != nullptr && BitTested.first != nullptr);
30522
30523 // If shift amounts are not the same we can't use BitTestIntrinsic.
30524 if (BitChange.first != BitTested.first)
30525 return AtomicExpansionKind::CmpXChg;
30526
30527 // For an atomic AND, the operand must mask out (clear) exactly one bit and
30528 // the AND user must test the single bit that is unset in that mask.
30529 if (AI->getOperation() == AtomicRMWInst::And)
30530 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
30531 ? AtomicExpansionKind::BitTestIntrinsic
30532 : AtomicExpansionKind::CmpXChg;
30533
30534 // For an atomic XOR/OR, the operand must set and the AND user must test the same bit.
30535 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
30536 ? AtomicExpansionKind::BitTestIntrinsic
30537 : AtomicExpansionKind::CmpXChg;
30538}
30539
30540void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
30541 IRBuilder<> Builder(AI);
30542 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30543 Intrinsic::ID IID_C;
30544 Intrinsic::ID IID_I;
30545 switch (AI->getOperation()) {
30546 default:
30547 llvm_unreachable("Unknown atomic operation");
30548 case AtomicRMWInst::Or:
30549 IID_C = Intrinsic::x86_atomic_bts;
30550 IID_I = Intrinsic::x86_atomic_bts_rm;
30551 break;
30552 case AtomicRMWInst::Xor:
30553 IID_C = Intrinsic::x86_atomic_btc;
30554 IID_I = Intrinsic::x86_atomic_btc_rm;
30555 break;
30556 case AtomicRMWInst::And:
30557 IID_C = Intrinsic::x86_atomic_btr;
30558 IID_I = Intrinsic::x86_atomic_btr_rm;
30559 break;
30560 }
30561 Instruction *I = AI->user_back();
30562 LLVMContext &Ctx = AI->getContext();
30563 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30564 PointerType::getUnqual(Ctx));
30565 Function *BitTest = nullptr;
30566 Value *Result = nullptr;
30567 auto BitTested = FindSingleBitChange(AI->getValOperand());
30568 assert(BitTested.first != nullptr);
30569
30570 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
30571 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
30572
30573 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
30574
30575 unsigned Imm = llvm::countr_zero(C->getZExtValue());
30576 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
30577 } else {
30578 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
30579
30580 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
30581
30582 Value *SI = BitTested.first;
30583 assert(SI != nullptr);
30584
30585 // BT{S|R|C} on a memory operand doesn't reduce the bit position modulo the
30586 // operand width, so we need to mask it.
30587 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
30588 Value *BitPos =
30589 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
30590 // Todo(1): In many cases it may be provable that SI is less than
30591 // ShiftBits in which case this mask is unnecessary
30592 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
30593 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
30594 // favor of just a raw BT{S|R|C}.
30595
30596 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
30597 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
30598
30599 // If the result is only used for its zero/non-zero status then we don't need
30600 // to shift the value back. Otherwise do so.
30601 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
30602 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
30603 if (ICmp->isEquality()) {
30604 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
30605 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
30606 if (C0 || C1) {
30607 assert(C0 == nullptr || C1 == nullptr);
30608 if ((C0 ? C0 : C1)->isZero())
30609 continue;
30610 }
30611 }
30612 }
30613 Result = Builder.CreateShl(Result, BitPos);
30614 break;
30615 }
30616 }
30617
30618 I->replaceAllUsesWith(Result);
30619 I->eraseFromParent();
30620 AI->eraseFromParent();
30621}
30622
30623static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
30624 using namespace llvm::PatternMatch;
30625 if (!AI->hasOneUse())
30626 return false;
30627
30628 Value *Op = AI->getOperand(1);
30629 ICmpInst::Predicate Pred;
30630 Instruction *I = AI->user_back();
30631 AtomicRMWInst::BinOp Opc = AI->getOperation();
30632 if (Opc == AtomicRMWInst::Add) {
30633 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
30634 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30635 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
30636 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30637 return Pred == CmpInst::ICMP_SLT;
30638 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30639 return Pred == CmpInst::ICMP_SGT;
30640 }
30641 return false;
30642 }
30643 if (Opc == AtomicRMWInst::Sub) {
30644 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30645 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30646 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
30647 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30648 return Pred == CmpInst::ICMP_SLT;
30649 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30650 return Pred == CmpInst::ICMP_SGT;
30651 }
30652 return false;
30653 }
30654 if ((Opc == AtomicRMWInst::Or &&
30655 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
30656 (Opc == AtomicRMWInst::And &&
30657 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
30658 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30659 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
30660 Pred == CmpInst::ICMP_SLT;
30661 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30662 return Pred == CmpInst::ICMP_SGT;
30663 return false;
30664 }
30665 if (Opc == AtomicRMWInst::Xor) {
30666 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30667 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30668 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
30669 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30670 return Pred == CmpInst::ICMP_SLT;
30671 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30672 return Pred == CmpInst::ICMP_SGT;
30673 }
30674 return false;
30675 }
30676
30677 return false;
30678}
30679
30680void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
30681 AtomicRMWInst *AI) const {
30682 IRBuilder<> Builder(AI);
30683 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30684 Instruction *TempI = nullptr;
30685 LLVMContext &Ctx = AI->getContext();
30686 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
30687 if (!ICI) {
30688 TempI = AI->user_back();
30689 assert(TempI->hasOneUse() && "Must have one use");
30690 ICI = cast<ICmpInst>(TempI->user_back());
30691 }
30692 X86::CondCode CC;
30693 ICmpInst::Predicate Pred = ICI->getPredicate();
30694 switch (Pred) {
30695 default:
30696 llvm_unreachable("Not supported Pred");
30697 case CmpInst::ICMP_EQ:
30698 CC = X86::COND_E;
30699 break;
30700 case CmpInst::ICMP_NE:
30701 CC = X86::COND_NE;
30702 break;
30703 case CmpInst::ICMP_SLT:
30704 CC = X86::COND_S;
30705 break;
30706 case CmpInst::ICMP_SGT:
30707 CC = X86::COND_NS;
30708 break;
30709 }
30710 Intrinsic::ID IID;
30711 switch (AI->getOperation()) {
30712 default:
30713 llvm_unreachable("Unknown atomic operation");
30714 case AtomicRMWInst::Add:
30715 IID = Intrinsic::x86_atomic_add_cc;
30716 break;
30717 case AtomicRMWInst::Sub:
30718 IID = Intrinsic::x86_atomic_sub_cc;
30719 break;
30720 case AtomicRMWInst::Or:
30721 IID = Intrinsic::x86_atomic_or_cc;
30722 break;
30723 case AtomicRMWInst::And:
30724 IID = Intrinsic::x86_atomic_and_cc;
30725 break;
30726 case AtomicRMWInst::Xor:
30727 IID = Intrinsic::x86_atomic_xor_cc;
30728 break;
30729 }
30730 Function *CmpArith =
30731 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
30732 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30733 PointerType::getUnqual(Ctx));
30734 Value *Call = Builder.CreateCall(
30735 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
30736 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
30737 ICI->replaceAllUsesWith(Result);
30738 ICI->eraseFromParent();
30739 if (TempI)
30740 TempI->eraseFromParent();
30741 AI->eraseFromParent();
30742}
30743
30744TargetLowering::AtomicExpansionKind
30745X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
30746 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30747 Type *MemType = AI->getType();
30748
30749 // If the operand is too big, we must see if cmpxchg8/16b is available
30750 // and default to library calls otherwise.
30751 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
30752 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30753 : AtomicExpansionKind::None;
30754 }
30755
30756 AtomicRMWInst::BinOp Op = AI->getOperation();
30757 switch (Op) {
30758 case AtomicRMWInst::Xchg:
30759 return AtomicExpansionKind::None;
30760 case AtomicRMWInst::Add:
30761 case AtomicRMWInst::Sub:
30762 if (shouldExpandCmpArithRMWInIR(AI))
30763 return AtomicExpansionKind::CmpArithIntrinsic;
30764 // It's better to use xadd, xsub or xchg for these in other cases.
30765 return AtomicExpansionKind::None;
30766 case AtomicRMWInst::Or:
30767 case AtomicRMWInst::And:
30768 case AtomicRMWInst::Xor:
30769 if (shouldExpandCmpArithRMWInIR(AI))
30770 return AtomicExpansionKind::CmpArithIntrinsic;
30771 return shouldExpandLogicAtomicRMWInIR(AI);
30772 case AtomicRMWInst::Nand:
30773 case AtomicRMWInst::Max:
30774 case AtomicRMWInst::Min:
30775 case AtomicRMWInst::UMax:
30776 case AtomicRMWInst::UMin:
30777 case AtomicRMWInst::FAdd:
30778 case AtomicRMWInst::FSub:
30779 case AtomicRMWInst::FMax:
30780 case AtomicRMWInst::FMin:
30781 case AtomicRMWInst::UIncWrap:
30782 case AtomicRMWInst::UDecWrap:
30783 default:
30784 // These always require a non-trivial set of data operations on x86. We must
30785 // use a cmpxchg loop.
30786 return AtomicExpansionKind::CmpXChg;
30787 }
30788}
30789
30790LoadInst *
30791X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
30792 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30793 Type *MemType = AI->getType();
30794 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
30795 // there is no benefit in turning such RMWs into loads, and it is actually
30796 // harmful as it introduces an mfence.
30797 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
30798 return nullptr;
30799
30800 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
30801 // lowering available in lowerAtomicArith.
30802 // TODO: push more cases through this path.
30803 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
30804 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
30805 AI->use_empty())
30806 return nullptr;
30807
30808 IRBuilder<> Builder(AI);
30809 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30810 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
30811 auto SSID = AI->getSyncScopeID();
30812 // We must restrict the ordering to avoid generating loads with Release or
30813 // ReleaseAcquire orderings.
30814 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
30815
30816 // Before the load we need a fence. Here is an example lifted from
30817 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
30818 // is required:
30819 // Thread 0:
30820 // x.store(1, relaxed);
30821 // r1 = y.fetch_add(0, release);
30822 // Thread 1:
30823 // y.fetch_add(42, acquire);
30824 // r2 = x.load(relaxed);
30825 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
30826 // lowered to just a load without a fence. An mfence flushes the store buffer,
30827 // making the optimization clearly correct.
30828 // FIXME: it is required if isReleaseOrStronger(Order), but it is not clear
30829 // whether it is needed otherwise; we might be able to be more aggressive on relaxed idempotent
30830 // rmw. In practice, they do not look useful, so we don't try to be
30831 // especially clever.
30832 if (SSID == SyncScope::SingleThread)
30833 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
30834 // the IR level, so we must wrap it in an intrinsic.
30835 return nullptr;
30836
30837 if (!Subtarget.hasMFence())
30838 // FIXME: it might make sense to use a locked operation here but on a
30839 // different cache-line to prevent cache-line bouncing. In practice it
30840 // is probably a small win, and x86 processors without mfence are rare
30841 // enough that we do not bother.
30842 return nullptr;
30843
30844 Function *MFence =
30845 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
30846 Builder.CreateCall(MFence, {});
30847
30848 // Finally we can emit the atomic load.
30849 LoadInst *Loaded = Builder.CreateAlignedLoad(
30850 AI->getType(), AI->getPointerOperand(), AI->getAlign());
30851 Loaded->setAtomic(Order, SSID);
30852 AI->replaceAllUsesWith(Loaded);
30853 AI->eraseFromParent();
30854 return Loaded;
30855}
30856
30857/// Emit a locked operation on a stack location which does not change any
30858/// memory location, but does involve a lock prefix. Location is chosen to be
30859/// a) very likely accessed only by a single thread to minimize cache traffic,
30860/// and b) definitely dereferenceable. Returns the new Chain result.
30861static SDValue emitLockedStackOp(SelectionDAG &DAG,
30862 const X86Subtarget &Subtarget, SDValue Chain,
30863 const SDLoc &DL) {
30864 // Implementation notes:
30865 // 1) LOCK prefix creates a full read/write reordering barrier for memory
30866 // operations issued by the current processor. As such, the location
30867 // referenced is not relevant for the ordering properties of the instruction.
30868 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
30869 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
30870 // 2) Using an immediate operand appears to be the best encoding choice
30871 // here since it doesn't require an extra register.
30872 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
30873 // is small enough it might just be measurement noise.)
30874 // 4) When choosing offsets, there are several contributing factors:
30875 // a) If there's no redzone, we default to TOS. (We could allocate a cache
30876 // line aligned stack object to improve this case.)
30877 // b) To minimize our chances of introducing a false dependence, we prefer
30878 // to offset the stack usage from TOS slightly.
30879 // c) To minimize concerns about cross thread stack usage - in particular,
30880 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
30881 // captures state in the TOS frame and accesses it from many threads -
30882 // we want to use an offset such that the offset is in a distinct cache
30883 // line from the TOS frame.
30884 //
30885 // For a general discussion of the tradeoffs and benchmark results, see:
30886 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
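// In practice this typically materializes as something like
// "lock orl $0, -64(%rsp)" when a red zone is available (SPOffset = -64),
// or "lock orl $0, (%rsp)" / "(%esp)" otherwise.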
30887
30888 auto &MF = DAG.getMachineFunction();
30889 auto &TFL = *Subtarget.getFrameLowering();
30890 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
30891
30892 if (Subtarget.is64Bit()) {
30893 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30894 SDValue Ops[] = {
30895 DAG.getRegister(X86::RSP, MVT::i64), // Base
30896 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30897 DAG.getRegister(0, MVT::i64), // Index
30898 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30899 DAG.getRegister(0, MVT::i16), // Segment.
30900 Zero,
30901 Chain};
30902 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30903 MVT::Other, Ops);
30904 return SDValue(Res, 1);
30905 }
30906
30907 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30908 SDValue Ops[] = {
30909 DAG.getRegister(X86::ESP, MVT::i32), // Base
30910 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30911 DAG.getRegister(0, MVT::i32), // Index
30912 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30913 DAG.getRegister(0, MVT::i16), // Segment.
30914 Zero,
30915 Chain
30916 };
30917 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30918 MVT::Other, Ops);
30919 return SDValue(Res, 1);
30920}
30921
30922static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
30923 SelectionDAG &DAG) {
30924 SDLoc dl(Op);
30925 AtomicOrdering FenceOrdering =
30926 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
30927 SyncScope::ID FenceSSID =
30928 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
30929
30930 // The only fence that needs an instruction is a sequentially-consistent
30931 // cross-thread fence.
30932 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
30933 FenceSSID == SyncScope::System) {
30934 if (Subtarget.hasMFence())
30935 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
30936
30937 SDValue Chain = Op.getOperand(0);
30938 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
30939 }
30940
30941 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
30942 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
30943}
30944
30945static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
30946 SelectionDAG &DAG) {
30947 MVT T = Op.getSimpleValueType();
30948 SDLoc DL(Op);
30949 unsigned Reg = 0;
30950 unsigned size = 0;
30951 switch(T.SimpleTy) {
30952 default: llvm_unreachable("Invalid value type!");
30953 case MVT::i8: Reg = X86::AL; size = 1; break;
30954 case MVT::i16: Reg = X86::AX; size = 2; break;
30955 case MVT::i32: Reg = X86::EAX; size = 4; break;
30956 case MVT::i64:
30957 assert(Subtarget.is64Bit() && "Node not type legal!");
30958 Reg = X86::RAX; size = 8;
30959 break;
30960 }
30961 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
30962 Op.getOperand(2), SDValue());
30963 SDValue Ops[] = { cpIn.getValue(0),
30964 Op.getOperand(1),
30965 Op.getOperand(3),
30966 DAG.getTargetConstant(size, DL, MVT::i8),
30967 cpIn.getValue(1) };
30968 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
30969 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
30970 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
30971 Ops, T, MMO);
30972
30973 SDValue cpOut =
30974 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
30975 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
30976 MVT::i32, cpOut.getValue(2));
30977 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
30978
30979 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
30980 cpOut, Success, EFLAGS.getValue(1));
30981}
30982
30983// Create MOVMSKB, taking into account whether we need to split for AVX1.
30984static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
30985 const X86Subtarget &Subtarget) {
30986 MVT InVT = V.getSimpleValueType();
30987
30988 if (InVT == MVT::v64i8) {
30989 SDValue Lo, Hi;
30990 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30991 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
30992 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
30993 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
30994 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
30995 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
30996 DAG.getConstant(32, DL, MVT::i8));
30997 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
30998 }
30999 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
31000 SDValue Lo, Hi;
31001 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
31002 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
31003 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
31004 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
31005 DAG.getConstant(16, DL, MVT::i8));
31006 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
31007 }
31008
31009 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
31010}
31011
31012static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
31013 SelectionDAG &DAG) {
31014 SDValue Src = Op.getOperand(0);
31015 MVT SrcVT = Src.getSimpleValueType();
31016 MVT DstVT = Op.getSimpleValueType();
31017
31018 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
31019 // half to v32i1 and concatenating the result.
31020 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
31021 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31022 assert(Subtarget.hasBWI() && "Expected BWI target");
31023 SDLoc dl(Op);
31024 SDValue Lo, Hi;
31025 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
31026 Lo = DAG.getBitcast(MVT::v32i1, Lo);
31027 Hi = DAG.getBitcast(MVT::v32i1, Hi);
31028 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
31029 }
31030
31031 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
31032 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
31033 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
31034 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
31035 SDLoc DL(Op);
31036 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
31037 V = getPMOVMSKB(DL, V, DAG, Subtarget);
31038 return DAG.getZExtOrTrunc(V, DL, DstVT);
31039 }
31040
31041 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
31042 SrcVT == MVT::i64) && "Unexpected VT!");
31043
31044 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31045 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
31046 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
31047 // This conversion needs to be expanded.
31048 return SDValue();
31049
31050 SDLoc dl(Op);
31051 if (SrcVT.isVector()) {
31052 // Widen the input vector in the case of MVT::v2i32.
31053 // Example: from MVT::v2i32 to MVT::v4i32.
31054 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
31055 SrcVT.getVectorNumElements() * 2);
31056 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
31057 DAG.getUNDEF(SrcVT));
31058 } else {
31059 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
31060 "Unexpected source type in LowerBITCAST");
31061 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
31062 }
31063
31064 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
31065 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
31066
31067 if (DstVT == MVT::x86mmx)
31068 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
31069
31070 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
31071 DAG.getIntPtrConstant(0, dl));
31072}
31073
31074/// Compute the horizontal sum of bytes in V for the elements of VT.
31075///
31076/// Requires V to be a byte vector and VT to be an integer vector type with
31077/// wider elements than V's type. The width of the elements of VT determines
31078/// how many bytes of V are summed horizontally to produce each element of the
31079/// result.
31080static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
31081 const X86Subtarget &Subtarget,
31082 SelectionDAG &DAG) {
31083 SDLoc DL(V);
31084 MVT ByteVecVT = V.getSimpleValueType();
31085 MVT EltVT = VT.getVectorElementType();
31086 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
31087 "Expected value to have byte element type.");
31088 assert(EltVT != MVT::i8 &&
31089 "Horizontal byte sum only makes sense for wider elements!");
31090 unsigned VecSize = VT.getSizeInBits();
31091 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
31092
31093 // The PSADBW instruction horizontally adds all bytes and leaves the result in
31094 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
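// e.g. (illustrative): a PSADBW of the bytes {1,2,3,4,5,6,7,8} against zero
// leaves 36 in the corresponding i64 lane, which is exactly the byte sum.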
31095 if (EltVT == MVT::i64) {
31096 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
31097 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
31098 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
31099 return DAG.getBitcast(VT, V);
31100 }
31101
31102 if (EltVT == MVT::i32) {
31103 // We unpack the low half and high half into i32s interleaved with zeros so
31104 // that we can use PSADBW to horizontally sum them. The most useful part of
31105 // this is that it lines up the results of two PSADBW instructions to be
31106 // two v2i64 vectors which concatenated are the 4 population counts. We can
31107 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
31108 SDValue Zeros = DAG.getConstant(0, DL, VT);
31109 SDValue V32 = DAG.getBitcast(VT, V);
31110 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
31111 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
31112
31113 // Do the horizontal sums into two v2i64s.
31114 Zeros = DAG.getConstant(0, DL, ByteVecVT);
31115 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
31116 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
31117 DAG.getBitcast(ByteVecVT, Low), Zeros);
31118 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
31119 DAG.getBitcast(ByteVecVT, High), Zeros);
31120
31121 // Merge them together.
31122 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
31123 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
31124 DAG.getBitcast(ShortVecVT, Low),
31125 DAG.getBitcast(ShortVecVT, High));
31126
31127 return DAG.getBitcast(VT, V);
31128 }
31129
31130 // The only element type left is i16.
31131 assert(EltVT == MVT::i16 && "Unknown how to handle type");
31132
31133 // To obtain pop count for each i16 element starting from the pop count for
31134 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
31135 // right by 8. It is important to shift as i16s as i8 vector shift isn't
31136 // directly supported.
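// e.g. (illustrative): if an i16 element holds per-byte counts H (high byte)
// and L (low byte), the i16 shift-left by 8 moves L into the high byte, the
// i8 add produces H + L there, and the final i16 shift-right by 8 leaves H + L.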
31137 SDValue ShifterV = DAG.getConstant(8, DL, VT);
31138 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
31139 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
31140 DAG.getBitcast(ByteVecVT, V));
31141 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
31142}
31143
31144static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
31145 const X86Subtarget &Subtarget,
31146 SelectionDAG &DAG) {
31147 MVT VT = Op.getSimpleValueType();
31148 MVT EltVT = VT.getVectorElementType();
31149 int NumElts = VT.getVectorNumElements();
31150 (void)EltVT;
31151 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
31152
31153 // Implement a lookup table in register by using an algorithm based on:
31154 // http://wm.ite.pl/articles/sse-popcount.html
31155 //
31156 // The general idea is that each nibble of every byte in the input vector is an
31157 // index into an in-register pre-computed pop count table. We then split up the
31158 // input vector in two new ones: (1) a vector with only the shifted-right
31159 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
31160 // masked out higher ones) for each byte. PSHUFB is used separately with both
31161 // to index the in-register table. Next, both are added and the result is an
31162 // i8 vector where each element contains the pop count for its input byte.
31163 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
31164 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
31165 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
31166 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
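// e.g. (illustrative): for the byte 0xB7 the high nibble 0xB maps to LUT[0xB] = 3
// and the low nibble 0x7 maps to LUT[0x7] = 3, and 3 + 3 = 6 == popcount(0xB7).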
31167
31168 SmallVector<SDValue, 64> LUTVec;
31169 for (int i = 0; i < NumElts; ++i)
31170 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
31171 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
31172 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
31173
31174 // High nibbles
31175 SDValue FourV = DAG.getConstant(4, DL, VT);
31176 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
31177
31178 // Low nibbles
31179 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
31180
31181 // The input vector is used as the shuffle mask that index elements into the
31182 // LUT. After counting low and high nibbles, add the vector to obtain the
31183 // final pop count per i8 element.
31184 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
31185 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
31186 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
31187}
31188
31189// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
31190// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
31191static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
31192 const X86Subtarget &Subtarget,
31193 SelectionDAG &DAG) {
31194 MVT VT = Op.getSimpleValueType();
31195 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
31196 "Unknown CTPOP type to handle");
31197 SDValue Op0 = Op.getOperand(0);
31198
31199 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
31200 if (Subtarget.hasVPOPCNTDQ()) {
31201 unsigned NumElems = VT.getVectorNumElements();
31202 assert((VT.getVectorElementType() == MVT::i8 ||
31203 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
31204 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
31205 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
31206 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
31207 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
31208 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
31209 }
31210 }
31211
31212 // Decompose 256-bit ops into smaller 128-bit ops.
31213 if (VT.is256BitVector() && !Subtarget.hasInt256())
31214 return splitVectorIntUnary(Op, DAG, DL);
31215
31216 // Decompose 512-bit ops into smaller 256-bit ops.
31217 if (VT.is512BitVector() && !Subtarget.hasBWI())
31218 return splitVectorIntUnary(Op, DAG, DL);
31219
31220 // For element types greater than i8, do vXi8 pop counts and a bytesum.
31221 if (VT.getScalarType() != MVT::i8) {
31222 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
31223 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
31224 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
31225 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
31226 }
31227
31228 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
31229 if (!Subtarget.hasSSSE3())
31230 return SDValue();
31231
31232 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
31233}
31234
31235static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
31236 SelectionDAG &DAG) {
31237 MVT VT = N.getSimpleValueType();
31238 SDValue Op = N.getOperand(0);
31239 SDLoc DL(N);
31240
31241 if (VT.isScalarInteger()) {
31242 // Compute the lower/upper bounds of the active bits of the value,
31243 // allowing us to shift the active bits down if necessary to fit into the
31244 // special cases below.
31245 KnownBits Known = DAG.computeKnownBits(Op);
31246 unsigned LZ = Known.countMinLeadingZeros();
31247 unsigned TZ = Known.countMinTrailingZeros();
31248 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
31249 unsigned ActiveBits = Known.getBitWidth() - LZ;
31250 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
31251
31252 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
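// e.g. x = 0b11: 3 - (3 >> 1) = 2, and x = 0b10: 2 - 1 = 1.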
31253 if (ShiftedActiveBits <= 2) {
31254 if (ActiveBits > 2)
31255 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31256 DAG.getShiftAmountConstant(TZ, VT, DL));
31257 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31258 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
31259 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31260 DAG.getShiftAmountConstant(1, VT, DL)));
31261 return DAG.getZExtOrTrunc(Op, DL, VT);
31262 }
31263
31264 // i3 CTPOP - perform LUT into i32 integer.
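// The constant packs the pop counts of 0..7 as 2-bit entries; e.g. for x = 5
// the LUT is shifted right by 10 and the low two bits give 0b10 = 2.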
31265 if (ShiftedActiveBits <= 3) {
31266 if (ActiveBits > 3)
31267 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31268 DAG.getShiftAmountConstant(TZ, VT, DL));
31269 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31270 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
31271 DAG.getShiftAmountConstant(1, VT, DL));
31272 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
31273 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
31274 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
31275 DAG.getConstant(0x3, DL, MVT::i32));
31276 return DAG.getZExtOrTrunc(Op, DL, VT);
31277 }
31278
31279 // i4 CTPOP - perform LUT into i64 integer.
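// 0x4332322132212110 packs the pop counts of 0..15 as 4-bit entries; e.g. for
// x = 0b1011 the LUT is shifted right by 44 and the low nibble gives 3.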
31280 if (ShiftedActiveBits <= 4 &&
31281 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
31282 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
31283 if (ActiveBits > 4)
31284 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31285 DAG.getShiftAmountConstant(TZ, VT, DL));
31286 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31287 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
31288 DAG.getConstant(4, DL, MVT::i32));
31289 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
31290 DAG.getShiftAmountOperand(MVT::i64, Op));
31291 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
31292 DAG.getConstant(0x7, DL, MVT::i64));
31293 return DAG.getZExtOrTrunc(Op, DL, VT);
31294 }
31295
31296 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
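// e.g. (illustrative) x = 0x81: 0x81 * 0x08040201 = 0x0A050281, >> 3 gives
// 0x0140A050, & 0x11111111 gives 0x01000010, and the final multiply by
// 0x11111111 leaves 2 (the pop count) in the top nibble before the >> 28.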
31297 if (ShiftedActiveBits <= 8) {
31298 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
31299 if (ActiveBits > 8)
31300 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31301 DAG.getShiftAmountConstant(TZ, VT, DL));
31302 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31303 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
31304 DAG.getConstant(0x08040201U, DL, MVT::i32));
31305 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31306 DAG.getShiftAmountConstant(3, MVT::i32, DL));
31307 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
31308 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
31309 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31310 DAG.getShiftAmountConstant(28, MVT::i32, DL));
31311 return DAG.getZExtOrTrunc(Op, DL, VT);
31312 }
31313
31314 return SDValue(); // fallback to generic expansion.
31315 }
31316
31317 assert(VT.isVector() &&
31318 "We only do custom lowering for vector population count.");
31319 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
31320}
31321
31322static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
31323 MVT VT = Op.getSimpleValueType();
31324 SDValue In = Op.getOperand(0);
31325 SDLoc DL(Op);
31326
31327 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
31328 // perform the BITREVERSE.
31329 if (!VT.isVector()) {
31330 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
31331 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
31332 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
31333 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
31334 DAG.getIntPtrConstant(0, DL));
31335 }
31336
31337 int NumElts = VT.getVectorNumElements();
31338 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
31339
31340 // Decompose 256-bit ops into smaller 128-bit ops.
31341 if (VT.is256BitVector())
31342 return splitVectorIntUnary(Op, DAG, DL);
31343
31344 assert(VT.is128BitVector() &&
31345 "Only 128-bit vector bitreverse lowering supported.");
31346
31347 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
31348 // perform the BSWAP in the shuffle.
31349 // It's best to shuffle using the second operand as this will implicitly allow
31350 // memory folding for multiple vectors.
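// Each control byte below is (source byte index) | (2 << 5); selector 2 in the
// top bits asks VPPERM for the bit-reversed source byte, so a single shuffle
// performs both the BSWAP and the per-byte bit reversal.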
31351 SmallVector<SDValue, 16> MaskElts;
31352 for (int i = 0; i != NumElts; ++i) {
31353 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
31354 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
31355 int PermuteByte = SourceByte | (2 << 5);
31356 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
31357 }
31358 }
31359
31360 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
31361 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
31362 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
31363 Res, Mask);
31364 return DAG.getBitcast(VT, Res);
31365}
31366
31367static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
31368 SelectionDAG &DAG) {
31369 MVT VT = Op.getSimpleValueType();
31370
31371 if (Subtarget.hasXOP() && !VT.is512BitVector())
31372 return LowerBITREVERSE_XOP(Op, DAG);
31373
31374 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
31375
31376 SDValue In = Op.getOperand(0);
31377 SDLoc DL(Op);
31378
31379 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
31380 if (VT.is512BitVector() && !Subtarget.hasBWI())
31381 return splitVectorIntUnary(Op, DAG, DL);
31382
31383 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
31384 if (VT.is256BitVector() && !Subtarget.hasInt256())
31385 return splitVectorIntUnary(Op, DAG, DL);
31386
31387 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
31388 if (!VT.isVector()) {
31389 assert(
31390 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
31391 "Only tested for i8/i16/i32/i64");
31392 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
31393 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
31394 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
31395 DAG.getBitcast(MVT::v16i8, Res));
31396 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
31397 DAG.getBitcast(VecVT, Res), DAG.getIntPtrConstant(0, DL));
31398 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
31399 }
31400
31401 assert(VT.isVector() && VT.getSizeInBits() >= 128);
31402
31403 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
31404 if (VT.getScalarType() != MVT::i8) {
31405 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
31406 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
31407 Res = DAG.getBitcast(ByteVT, Res);
31408 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
31409 return DAG.getBitcast(VT, Res);
31410 }
31411 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
31412 "Only byte vector BITREVERSE supported");
31413
31414 unsigned NumElts = VT.getVectorNumElements();
31415
31416 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
31417 if (Subtarget.hasGFNI()) {
31418 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
31419 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
31420 Matrix = DAG.getBitcast(VT, Matrix);
31421 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
31422 DAG.getTargetConstant(0, DL, MVT::i8));
31423 }
31424
31425 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
31426 // two nibbles and a PSHUFB lookup to find the bitreverse of each
31427 // 0-15 value (moved to the other nibble).
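// e.g. (illustrative): for the byte 0x2D, LoLUT[0xD] = 0xB0 and HiLUT[0x2] = 0x04,
// and 0xB0 | 0x04 = 0xB4 == bitreverse(0x2D).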
31428 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
31429 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
31430 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
31431
31432 const int LoLUT[16] = {
31433 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
31434 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
31435 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
31436 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
31437 const int HiLUT[16] = {
31438 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
31439 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
31440 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
31441 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
31442
31443 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
31444 for (unsigned i = 0; i < NumElts; ++i) {
31445 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
31446 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
31447 }
31448
31449 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
31450 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
31451 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
31452 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
31453 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31454}
31455
31456static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
31457 SelectionDAG &DAG) {
31458 SDLoc DL(Op);
31459 SDValue X = Op.getOperand(0);
31460 MVT VT = Op.getSimpleValueType();
31461
31462 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
31463 if (VT == MVT::i8 ||
31464 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
31465 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31466 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
31467 DAG.getConstant(0, DL, MVT::i8));
31468 // Copy the inverse of the parity flag into a register with setcc.
31469 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31470 // Extend to the original type.
31471 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31472 }
31473
31474 // If we have POPCNT, use the default expansion.
31475 if (Subtarget.hasPOPCNT())
31476 return SDValue();
31477
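// Parity is preserved when a value is folded onto itself with XOR, so the
// lowering repeatedly xors the high half onto the low half; e.g. (illustrative)
// 0x12345678 -> 0x1234 ^ 0x5678 = 0x444C -> 0x44 ^ 0x4C = 0x08, which has one
// bit set, matching the odd pop count (13) of the original value.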
31478 if (VT == MVT::i64) {
31479 // Xor the high and low 32-bits together using 32-bit operations.
31480 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
31481 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
31482 DAG.getConstant(32, DL, MVT::i8)));
31483 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
31484 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
31485 }
31486
31487 if (VT != MVT::i16) {
31488 // Xor the high and low 16-bits together using a 32-bit operation.
31489 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
31490 DAG.getConstant(16, DL, MVT::i8));
31491 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
31492 } else {
31493 // If the input is 16-bits, we need to extend to use an i32 shift below.
31494 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
31495 }
31496
31497 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
31498 // This should allow an h-reg to be used to save a shift.
31499 SDValue Hi = DAG.getNode(
31500 ISD::TRUNCATE, DL, MVT::i8,
31501 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
31502 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31503 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
31504 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
31505
31506 // Copy the inverse of the parity flag into a register with setcc.
31507 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31508 // Extend to the original type.
31509 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31510}
31511
31512static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
31513 const X86Subtarget &Subtarget) {
31514 unsigned NewOpc = 0;
31515 switch (N->getOpcode()) {
31516 case ISD::ATOMIC_LOAD_ADD:
31517 NewOpc = X86ISD::LADD;
31518 break;
31519 case ISD::ATOMIC_LOAD_SUB:
31520 NewOpc = X86ISD::LSUB;
31521 break;
31522 case ISD::ATOMIC_LOAD_OR:
31523 NewOpc = X86ISD::LOR;
31524 break;
31525 case ISD::ATOMIC_LOAD_XOR:
31526 NewOpc = X86ISD::LXOR;
31527 break;
31528 case ISD::ATOMIC_LOAD_AND:
31529 NewOpc = X86ISD::LAND;
31530 break;
31531 default:
31532 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
31533 }
31534
31535 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
31536
31537 return DAG.getMemIntrinsicNode(
31538 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
31539 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
31540 /*MemVT=*/N->getSimpleValueType(0), MMO);
31541}
31542
31543/// Lower atomic_load_ops into LOCK-prefixed operations.
31544static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
31545 const X86Subtarget &Subtarget) {
31546 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
31547 SDValue Chain = N->getOperand(0);
31548 SDValue LHS = N->getOperand(1);
31549 SDValue RHS = N->getOperand(2);
31550 unsigned Opc = N->getOpcode();
31551 MVT VT = N->getSimpleValueType(0);
31552 SDLoc DL(N);
31553
31554 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
31555 // can only be lowered when the result is unused. They should have already
31556 // been transformed into a cmpxchg loop in AtomicExpand.
31557 if (N->hasAnyUseOfValue(0)) {
31558 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
31559 // select LXADD if LOCK_SUB can't be selected.
31560 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
31561 // can use LXADD as opposed to cmpxchg.
31562 if (Opc == ISD::ATOMIC_LOAD_SUB ||
31563 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
31564 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
31565 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
31566
31568 "Used AtomicRMW ops other than Add should have been expanded!");
31569 return N;
31570 }
31571
31572 // Specialized lowering for the canonical form of an idempotent atomicrmw.
31573 // The core idea here is that since the memory location isn't actually
31574 // changing, all we need is a lowering for the *ordering* impacts of the
31575 // atomicrmw. As such, we can choose a different operation and memory
31576 // location to minimize impact on other code.
31577 // The above holds unless the node is marked volatile in which
31578 // case it needs to be preserved according to the langref.
31579 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
31580 // On X86, the only ordering which actually requires an instruction is
31581 // seq_cst which isn't SingleThread, everything just needs to be preserved
31582 // during codegen and then dropped. Note that we expect (but don't assume)
31583 // that orderings other than seq_cst and acq_rel have been canonicalized to
31584 // a store or load.
31585 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
31586 AN->getSyncScopeID() == SyncScope::System) {
31587 // Prefer a locked operation against a stack location to minimize cache
31588 // traffic. This assumes that stack locations are very likely to be
31589 // accessed only by the owning thread.
31590 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
31591 assert(!N->hasAnyUseOfValue(0));
31592 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31593 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31594 DAG.getUNDEF(VT), NewChain);
31595 }
31596 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31597 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
31598 assert(!N->hasAnyUseOfValue(0));
31599 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31600 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31601 DAG.getUNDEF(VT), NewChain);
31602 }
31603
31604 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
31605 // RAUW the chain, but don't worry about the result, as it's unused.
31606 assert(!N->hasAnyUseOfValue(0));
31607 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31608 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31609 DAG.getUNDEF(VT), LockOp.getValue(1));
31610}
31611
31612static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
31613 const X86Subtarget &Subtarget) {
31614 auto *Node = cast<AtomicSDNode>(Op.getNode());
31615 SDLoc dl(Node);
31616 EVT VT = Node->getMemoryVT();
31617
31618 bool IsSeqCst =
31619 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
31620 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
31621
31622 // If this store is not sequentially consistent and the type is legal
31623 // we can just keep it.
31624 if (!IsSeqCst && IsTypeLegal)
31625 return Op;
31626
31627 if (VT == MVT::i64 && !IsTypeLegal) {
31628 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
31629 // is enabled.
31630 bool NoImplicitFloatOps =
31631 DAG.getMachineFunction().getFunction().hasFnAttribute(
31632 Attribute::NoImplicitFloat);
31633 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31634 SDValue Chain;
31635 if (Subtarget.hasSSE1()) {
31636 SDValue SclToVec =
31637 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
31638 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31639 SclToVec = DAG.getBitcast(StVT, SclToVec);
31640 SDVTList Tys = DAG.getVTList(MVT::Other);
31641 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
31642 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
31643 MVT::i64, Node->getMemOperand());
31644 } else if (Subtarget.hasX87()) {
31645 // First load this into an 80-bit X87 register using a stack temporary.
31646 // This will put the whole integer into the significand.
31647 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31648 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31649 MachinePointerInfo MPI =
31650 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31651 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
31652 MPI, MaybeAlign(), MachineMemOperand::MOStore);
31653 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31654 SDValue LdOps[] = {Chain, StackPtr};
31655 SDValue Value = DAG.getMemIntrinsicNode(
31656 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
31657 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
31658 Chain = Value.getValue(1);
31659
31660 // Now use an FIST to do the atomic store.
31661 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
31662 Chain =
31663 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
31664 StoreOps, MVT::i64, Node->getMemOperand());
31665 }
31666
31667 if (Chain) {
31668 // If this is a sequentially consistent store, also emit an appropriate
31669 // barrier.
31670 if (IsSeqCst)
31671 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
31672
31673 return Chain;
31674 }
31675 }
31676 }
31677
31678 // Convert seq_cst store -> xchg
31679 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
31680 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
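  // e.g. "store atomic i32 %v, ptr %p seq_cst, align 4" becomes an xchgl; the
  // implicit lock of xchg both stores the value and provides the required
  // fence, and only the chain result of the swap is used below.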
31681 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
31682 Node->getOperand(0), Node->getOperand(2),
31683 Node->getOperand(1), Node->getMemOperand());
31684 return Swap.getValue(1);
31685}
31686
31687static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
31688 SDNode *N = Op.getNode();
31689 MVT VT = N->getSimpleValueType(0);
31690 unsigned Opc = Op.getOpcode();
31691
31692 // Let legalize expand this if it isn't a legal type yet.
31693 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31694 return SDValue();
31695
31696 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
31697 SDLoc DL(N);
31698
31699 // Set the carry flag.
31700 SDValue Carry = Op.getOperand(2);
31701 EVT CarryVT = Carry.getValueType();
31702 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
31703 Carry, DAG.getAllOnesConstant(DL, CarryVT));
31704
31705 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
31706 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
31707 Op.getOperand(0), Op.getOperand(1),
31708 Carry.getValue(1));
31709
31710 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
31711 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
31712 Sum.getValue(1), DL, DAG);
31713 if (N->getValueType(1) == MVT::i1)
31714 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
31715
31716 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
31717}
31718
31719static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
31720 SelectionDAG &DAG) {
31721 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
31722
31723 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
31724 // which returns the values as { float, float } (in XMM0) or
31725 // { double, double } (which is returned in XMM0, XMM1).
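  // Conceptually this builds a call along the lines of
  //   { double, double } @__sincos_stret(double %x)
  // for f64; the f32 flavour instead packs { sin, cos } into the low two
  // lanes of xmm0, which is why the extracts below are needed.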
31726 SDLoc dl(Op);
31727 SDValue Arg = Op.getOperand(0);
31728 EVT ArgVT = Arg.getValueType();
31729 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
31730
31731 TargetLowering::ArgListTy Args;
31732 TargetLowering::ArgListEntry Entry;
31733
31734 Entry.Node = Arg;
31735 Entry.Ty = ArgTy;
31736 Entry.IsSExt = false;
31737 Entry.IsZExt = false;
31738 Args.push_back(Entry);
31739
31740 bool isF64 = ArgVT == MVT::f64;
31741 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
31742 // the small struct {f32, f32} is returned in (eax, edx). For f64,
31743 // the results are returned via SRet in memory.
31744 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31745 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
31746 const char *LibcallName = TLI.getLibcallName(LC);
31747 SDValue Callee =
31748 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
31749
31750 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
31751 : (Type *)FixedVectorType::get(ArgTy, 4);
31752
31753 TargetLowering::CallLoweringInfo CLI(DAG);
31754 CLI.setDebugLoc(dl)
31755 .setChain(DAG.getEntryNode())
31756 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
31757
31758 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
31759
31760 if (isF64)
31761 // Returned in xmm0 and xmm1.
31762 return CallResult.first;
31763
31764 // Returned in bits 0:31 and 32:63 of xmm0.
31765 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31766 CallResult.first, DAG.getIntPtrConstant(0, dl));
31767 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31768 CallResult.first, DAG.getIntPtrConstant(1, dl));
31769 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
31770 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
31771}
31772
31773/// Widen a vector input to a vector of NVT. The
31774/// input vector must have the same element type as NVT.
31775static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
31776 bool FillWithZeroes = false) {
31777 // Check if InOp already has the right width.
31778 MVT InVT = InOp.getSimpleValueType();
31779 if (InVT == NVT)
31780 return InOp;
31781
31782 if (InOp.isUndef())
31783 return DAG.getUNDEF(NVT);
31784
31785 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
31786 "input and widen element type must match");
31787
31788 unsigned InNumElts = InVT.getVectorNumElements();
31789 unsigned WidenNumElts = NVT.getVectorNumElements();
31790 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
31791 "Unexpected request for vector widening");
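  // e.g. widening a v2i32 input to v8i32 keeps lanes 0-1 and fills lanes 2-7
  // with zeroes when FillWithZeroes is set, or leaves them undef otherwise.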
31792
31793 SDLoc dl(InOp);
31794 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
31795 InOp.getNumOperands() == 2) {
31796 SDValue N1 = InOp.getOperand(1);
31797 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
31798 N1.isUndef()) {
31799 InOp = InOp.getOperand(0);
31800 InVT = InOp.getSimpleValueType();
31801 InNumElts = InVT.getVectorNumElements();
31802 }
31803 }
31804 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
31805 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
31806 SmallVector<SDValue, 16> Ops;
31807 for (unsigned i = 0; i < InNumElts; ++i)
31808 Ops.push_back(InOp.getOperand(i));
31809
31810 EVT EltVT = InOp.getOperand(0).getValueType();
31811
31812 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
31813 DAG.getUNDEF(EltVT);
31814 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
31815 Ops.push_back(FillVal);
31816 return DAG.getBuildVector(NVT, dl, Ops);
31817 }
31818 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
31819 DAG.getUNDEF(NVT);
31820 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
31821 InOp, DAG.getIntPtrConstant(0, dl));
31822}
31823
31824static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
31825 SelectionDAG &DAG) {
31826 assert(Subtarget.hasAVX512() &&
31827 "MGATHER/MSCATTER are supported on AVX-512 arch only");
31828
31829 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
31830 SDValue Src = N->getValue();
31831 MVT VT = Src.getSimpleValueType();
31832 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
31833 SDLoc dl(Op);
31834
31835 SDValue Scale = N->getScale();
31836 SDValue Index = N->getIndex();
31837 SDValue Mask = N->getMask();
31838 SDValue Chain = N->getChain();
31839 SDValue BasePtr = N->getBasePtr();
31840
31841 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
31842 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31843 // If the index is v2i64 and we have VLX we can use xmm for data and index.
31844 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
31845 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31846 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
31847 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
31848 SDVTList VTs = DAG.getVTList(MVT::Other);
31849 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31850 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31851 N->getMemoryVT(), N->getMemOperand());
31852 }
31853 return SDValue();
31854 }
31855
31856 MVT IndexVT = Index.getSimpleValueType();
31857
31858 // If the index is v2i32, we're being called by type legalization and we
31859 // should just let the default handling take care of it.
31860 if (IndexVT == MVT::v2i32)
31861 return SDValue();
31862
31863 // If we don't have VLX and neither the data nor the index is 512 bits, we
31864 // need to widen until one is.
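  // e.g. v4f32 data with a v4i32 index is widened to v16f32/v16i32 with a
  // v16i1 mask whose extra lanes are zero, so the widened scatter still stores
  // exactly the original four elements.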
31865 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
31866 !Index.getSimpleValueType().is512BitVector()) {
31867 // Determine how much we need to widen by to get a 512-bit type.
31868 unsigned Factor = std::min(512/VT.getSizeInBits(),
31869 512/IndexVT.getSizeInBits());
31870 unsigned NumElts = VT.getVectorNumElements() * Factor;
31871
31872 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31873 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31874 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31875
31876 Src = ExtendToType(Src, VT, DAG);
31877 Index = ExtendToType(Index, IndexVT, DAG);
31878 Mask = ExtendToType(Mask, MaskVT, DAG, true);
31879 }
31880
31881 SDVTList VTs = DAG.getVTList(MVT::Other);
31882 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31883 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31884 N->getMemoryVT(), N->getMemOperand());
31885}
31886
31887static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
31888 SelectionDAG &DAG) {
31889
31890 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
31891 MVT VT = Op.getSimpleValueType();
31892 MVT ScalarVT = VT.getScalarType();
31893 SDValue Mask = N->getMask();
31894 MVT MaskVT = Mask.getSimpleValueType();
31895 SDValue PassThru = N->getPassThru();
31896 SDLoc dl(Op);
31897
31898 // Handle AVX masked loads which don't support passthru other than 0.
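  // AVX/AVX2 vmaskmov-style loads zero the disabled lanes, so a non-zero
  // passthru has to be blended back in with a VSELECT after the load.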
31899 if (MaskVT.getVectorElementType() != MVT::i1) {
31900 // We also allow undef in the isel pattern.
31901 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
31902 return Op;
31903
31904 SDValue NewLoad = DAG.getMaskedLoad(
31905 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31906 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
31907 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
31908 N->isExpandingLoad());
31909 // Emit a blend.
31910 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
31911 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
31912 }
31913
31914 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
31915 "Expanding masked load is supported on AVX-512 target only!");
31916
31917 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
31918 "Expanding masked load is supported for 32 and 64-bit types only!");
31919
31920 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31921 "Cannot lower masked load op.");
31922
31923 assert((ScalarVT.getSizeInBits() >= 32 ||
31924 (Subtarget.hasBWI() &&
31925 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31926 "Unsupported masked load op.");
31927
31928 // This operation is legal for targets with VLX, but without
31929 // VLX the vector should be widened to 512 bits.
31930 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
31931 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31932 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
31933
31934 // Mask element has to be i1.
31935 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31936 "Unexpected mask type");
31937
31938 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31939
31940 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31941 SDValue NewLoad = DAG.getMaskedLoad(
31942 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31943 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
31944 N->getExtensionType(), N->isExpandingLoad());
31945
31946 SDValue Extract =
31947 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
31948 DAG.getIntPtrConstant(0, dl));
31949 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
31950 return DAG.getMergeValues(RetOps, dl);
31951}
31952
31953static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
31954 SelectionDAG &DAG) {
31955 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
31956 SDValue DataToStore = N->getValue();
31957 MVT VT = DataToStore.getSimpleValueType();
31958 MVT ScalarVT = VT.getScalarType();
31959 SDValue Mask = N->getMask();
31960 SDLoc dl(Op);
31961
31962 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
31963 "Compressing masked store is supported on AVX-512 target only!");
31964
31965 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
31966 "Compressing masked store is supported for 32 and 64-bit types only!");
31967
31968 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31969 "Cannot lower masked store op.");
31970
31971 assert((ScalarVT.getSizeInBits() >= 32 ||
31972 (Subtarget.hasBWI() &&
31973 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31974 "Unsupported masked store op.");
31975
31976 // This operation is legal for targets with VLX, but without
31977 // VLX the vector should be widened to 512 bits.
31978 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
31979 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31980
31981 // Mask element has to be i1.
31982 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31983 "Unexpected mask type");
31984
31985 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31986
31987 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
31988 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31989 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
31990 N->getOffset(), Mask, N->getMemoryVT(),
31991 N->getMemOperand(), N->getAddressingMode(),
31992 N->isTruncatingStore(), N->isCompressingStore());
31993}
31994
31995static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
31996 SelectionDAG &DAG) {
31997 assert(Subtarget.hasAVX2() &&
31998 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
31999
32000 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
32001 SDLoc dl(Op);
32002 MVT VT = Op.getSimpleValueType();
32003 SDValue Index = N->getIndex();
32004 SDValue Mask = N->getMask();
32005 SDValue PassThru = N->getPassThru();
32006 MVT IndexVT = Index.getSimpleValueType();
32007
32008 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
32009
32010 // If the index is v2i32, we're being called by type legalization.
32011 if (IndexVT == MVT::v2i32)
32012 return SDValue();
32013
32014 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
32015 // need to widen until one is.
32016 MVT OrigVT = VT;
32017 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32018 !IndexVT.is512BitVector()) {
32019 // Determine how much we need to widen by to get a 512-bit type.
32020 unsigned Factor = std::min(512/VT.getSizeInBits(),
32021 512/IndexVT.getSizeInBits());
32022
32023 unsigned NumElts = VT.getVectorNumElements() * Factor;
32024
32025 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
32026 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
32027 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
32028
32029 PassThru = ExtendToType(PassThru, VT, DAG);
32030 Index = ExtendToType(Index, IndexVT, DAG);
32031 Mask = ExtendToType(Mask, MaskVT, DAG, true);
32032 }
32033
32034 // Break dependency on the data register.
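  // Hardware gathers merge into the destination register under the mask, so an
  // undef passthru would leave a false dependency on whatever value happened
  // to be there; a cheap zero idiom breaks that dependency.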
32035 if (PassThru.isUndef())
32036 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
32037
32038 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
32039 N->getScale() };
32040 SDValue NewGather = DAG.getMemIntrinsicNode(
32041 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
32042 N->getMemOperand());
32043 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
32044 NewGather, DAG.getIntPtrConstant(0, dl));
32045 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
32046}
32047
32048static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
32049 SDLoc dl(Op);
32050 SDValue Src = Op.getOperand(0);
32051 MVT DstVT = Op.getSimpleValueType();
32052
32053 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
32054 unsigned SrcAS = N->getSrcAddressSpace();
32055
32056 assert(SrcAS != N->getDestAddressSpace() &&
32057 "addrspacecast must be between different address spaces");
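  // The casts handled here are between the flat 64-bit space and the 32-bit
  // ptr32_sptr/ptr32_uptr spaces (e.g. MSVC's __ptr32 __uptr/__sptr): 32->64
  // zero- or sign-extends depending on the source space, 64->32 truncates.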
32058
32059 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
32060 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
32061 } else if (DstVT == MVT::i64) {
32062 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
32063 } else if (DstVT == MVT::i32) {
32064 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
32065 } else {
32066 report_fatal_error("Bad address space in addrspacecast");
32067 }
32068 return Op;
32069}
32070
32071SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
32072 SelectionDAG &DAG) const {
32073 // TODO: Eventually, the lowering of these nodes should be informed by or
32074 // deferred to the GC strategy for the function in which they appear. For
32075 // now, however, they must be lowered to something. Since they are logically
32076 // no-ops in the case of a null GC strategy (or a GC strategy which does not
32077 // require special handling for these nodes), lower them as literal NOOPs for
32078 // the time being.
32079 SmallVector<SDValue, 2> Ops;
32080 Ops.push_back(Op.getOperand(0));
32081 if (Op->getGluedNode())
32082 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
32083
32084 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
32085 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
32086}
32087
32088// Custom split CVTPS2PH with wide types.
32089static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
32090 SDLoc dl(Op);
32091 EVT VT = Op.getValueType();
32092 SDValue Lo, Hi;
32093 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
32094 EVT LoVT, HiVT;
32095 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32096 SDValue RC = Op.getOperand(1);
32097 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
32098 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
32099 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32100}
32101
32102static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
32103 SelectionDAG &DAG) {
32104 unsigned IsData = Op.getConstantOperandVal(4);
32105
32106 // We don't support non-data prefetch without PREFETCHI.
32107 // Just preserve the chain.
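  // Operand 4 is the llvm.prefetch cache-type argument (1 = data, 0 =
  // instruction); without PREFETCHI an instruction prefetch is dropped and
  // only the chain is kept so no ordering information is lost.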
32108 if (!IsData && !Subtarget.hasPREFETCHI())
32109 return Op.getOperand(0);
32110
32111 return Op;
32112}
32113
32114static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
32115 unsigned OpNo) {
32116 const APInt Operand(32, OpNo);
32117 std::string OpNoStr = llvm::toString(Operand, 10, false);
32118 std::string Str(" $");
32119
32120 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
32121 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
32122
32123 auto I = StringRef::npos;
32124 for (auto &AsmStr : AsmStrs) {
32125 // Match the OpNo string. We should match exactly to exclude match
32126 // sub-string, e.g. "$12" contains "$1"
32127 if (AsmStr.ends_with(OpNoStr1))
32128 I = AsmStr.size() - OpNoStr1.size();
32129
32130 // Get the index of operand in AsmStr.
32131 if (I == StringRef::npos)
32132 I = AsmStr.find(OpNoStr1 + ",");
32133 if (I == StringRef::npos)
32134 I = AsmStr.find(OpNoStr2);
32135
32136 if (I == StringRef::npos)
32137 continue;
32138
32139 assert(I > 0 && "Unexpected inline asm string!");
32140 // Remove the operand string and label (if it exists).
32141 // For example:
32142 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
32143 // ==>
32144 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
32145 // ==>
32146 // "call dword ptr "
32147 auto TmpStr = AsmStr.substr(0, I);
32148 I = TmpStr.rfind(':');
32149 if (I != StringRef::npos)
32150 TmpStr = TmpStr.substr(I + 1);
32151 return TmpStr.take_while(llvm::isAlpha);
32152 }
32153
32154 return StringRef();
32155}
32156
32157bool X86TargetLowering::isInlineAsmTargetBranch(
32158 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
32159 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
32160 // changed from indirect TargetLowering::C_Memory to direct
32161 // TargetLowering::C_Address.
32162 // We don't need to special case LOOP* and Jcc, which cannot target a memory
32163 // location.
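  // For example, in "__asm call dword ptr [fnptr]" the operand naming fnptr is
  // then treated as a direct address rather than as a memory operand.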
32164 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
32165 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
32166}
32167
32168/// Provide custom lowering hooks for some operations.
32169SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
32170 switch (Op.getOpcode()) {
32171 // clang-format off
32172 default: llvm_unreachable("Should not custom lower this!");
32173 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
32174 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
32175 return LowerCMP_SWAP(Op, Subtarget, DAG);
32176 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
32177 case ISD::ATOMIC_LOAD_ADD:
32178 case ISD::ATOMIC_LOAD_SUB:
32179 case ISD::ATOMIC_LOAD_OR:
32180 case ISD::ATOMIC_LOAD_XOR:
32181 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
32182 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
32183 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
32184 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
32185 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
32186 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
32187 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
32188 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
32189 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
32190 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
32191 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
32192 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
32193 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
32194 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
32195 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
32196 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
32197 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
32198 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
32199 case ISD::SHL_PARTS:
32200 case ISD::SRA_PARTS:
32201 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
32202 case ISD::FSHL:
32203 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
32204 case ISD::STRICT_SINT_TO_FP:
32205 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
32206 case ISD::STRICT_UINT_TO_FP:
32207 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
32208 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
32209 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
32210 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
32211 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
32212 case ISD::ZERO_EXTEND_VECTOR_INREG:
32213 case ISD::SIGN_EXTEND_VECTOR_INREG:
32214 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
32215 case ISD::FP_TO_SINT:
32216 case ISD::STRICT_FP_TO_SINT:
32217 case ISD::FP_TO_UINT:
32218 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
32219 case ISD::FP_TO_SINT_SAT:
32220 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
32221 case ISD::FP_EXTEND:
32222 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
32223 case ISD::FP_ROUND:
32224 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
32225 case ISD::FP16_TO_FP:
32226 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
32227 case ISD::FP_TO_FP16:
32228 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
32229 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
32230 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
32231 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
32232 case ISD::FADD:
32233 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
32234 case ISD::FROUND: return LowerFROUND(Op, DAG);
32235 case ISD::FABS:
32236 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
32237 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
32238 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
32239 case ISD::LRINT:
32240 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
32241 case ISD::SETCC:
32242 case ISD::STRICT_FSETCC:
32243 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
32244 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
32245 case ISD::SELECT: return LowerSELECT(Op, DAG);
32246 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
32247 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
32248 case ISD::VASTART: return LowerVASTART(Op, DAG);
32249 case ISD::VAARG: return LowerVAARG(Op, DAG);
32250 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
32251 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
32252 case ISD::INTRINSIC_VOID:
32253 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
32254 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
32255 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
32256 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
32257 case ISD::FRAME_TO_ARGS_OFFSET:
32258 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
32259 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
32260 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
32261 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
32262 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
32263 case ISD::EH_SJLJ_SETUP_DISPATCH:
32264 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
32265 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
32266 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
32267 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
32268 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
32269 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
32270 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
32271 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
32272 case ISD::CTLZ:
32273 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
32274 case ISD::CTTZ:
32275 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
32276 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
32277 case ISD::MULHS:
32278 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
32279 case ISD::ROTL:
32280 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
32281 case ISD::SRA:
32282 case ISD::SRL:
32283 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
32284 case ISD::SADDO:
32285 case ISD::UADDO:
32286 case ISD::SSUBO:
32287 case ISD::USUBO: return LowerXALUO(Op, DAG);
32288 case ISD::SMULO:
32289 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
32290 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
32291 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
32292 case ISD::SADDO_CARRY:
32293 case ISD::SSUBO_CARRY:
32294 case ISD::UADDO_CARRY:
32295 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
32296 case ISD::ADD:
32297 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
32298 case ISD::UADDSAT:
32299 case ISD::SADDSAT:
32300 case ISD::USUBSAT:
32301 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
32302 case ISD::SMAX:
32303 case ISD::SMIN:
32304 case ISD::UMAX:
32305 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
32306 case ISD::FMINIMUM:
32307 case ISD::FMAXIMUM:
32308 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
32309 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
32310 case ISD::ABDS:
32311 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
32312 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
32313 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
32314 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
32315 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
32316 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
32317 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
32318 case ISD::GC_TRANSITION_START:
32319 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
32320 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
32321 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
32322 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
32323 // clang-format on
32324 }
32325}
32326
32327/// Replace a node with an illegal result type with a new node built out of
32328/// custom code.
32329void X86TargetLowering::ReplaceNodeResults(SDNode *N,
32330 SmallVectorImpl<SDValue> &Results,
32331 SelectionDAG &DAG) const {
32332 SDLoc dl(N);
32333 switch (N->getOpcode()) {
32334 default:
32335#ifndef NDEBUG
32336 dbgs() << "ReplaceNodeResults: ";
32337 N->dump(&DAG);
32338#endif
32339 llvm_unreachable("Do not know how to custom type legalize this operation!");
32340 case X86ISD::CVTPH2PS: {
32341 EVT VT = N->getValueType(0);
32342 SDValue Lo, Hi;
32343 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
32344 EVT LoVT, HiVT;
32345 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32346 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
32347 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
32348 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32349 Results.push_back(Res);
32350 return;
32351 }
32352 case X86ISD::STRICT_CVTPH2PS: {
32353 EVT VT = N->getValueType(0);
32354 SDValue Lo, Hi;
32355 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
32356 EVT LoVT, HiVT;
32357 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32358 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
32359 {N->getOperand(0), Lo});
32360 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
32361 {N->getOperand(0), Hi});
32362 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32363 Lo.getValue(1), Hi.getValue(1));
32364 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32365 Results.push_back(Res);
32366 Results.push_back(Chain);
32367 return;
32368 }
32369 case X86ISD::CVTPS2PH:
32370 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
32371 return;
32372 case ISD::CTPOP: {
32373 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32374 // If we have at most 32 active bits, then perform as i32 CTPOP.
32375 // TODO: Perform this in generic legalizer?
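    // e.g. for ctpop(zext i16 %x to i64) the known leading zeros already
    // exceed 32, so the value can be shifted down by the known trailing zeros,
    // counted as i32 and zero-extended back without changing the result.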
32376 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
32377 unsigned LZ = Known.countMinLeadingZeros();
32378 unsigned TZ = Known.countMinTrailingZeros();
32379 if ((LZ + TZ) >= 32) {
32380 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
32381 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
32382 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
32383 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
32384 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
32385 Results.push_back(Op);
32386 return;
32387 }
32388 // Use a v2i64 if possible.
32389 bool NoImplicitFloatOps =
32390 DAG.getMachineFunction().getFunction().hasFnAttribute(
32391 Attribute::NoImplicitFloat);
32392 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
32393 SDValue Wide =
32394 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
32395 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
32396 // Bit count should fit in 32-bits, extract it as that and then zero
32397 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
32398 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
32399 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
32400 DAG.getIntPtrConstant(0, dl));
32401 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
32402 Results.push_back(Wide);
32403 }
32404 return;
32405 }
32406 case ISD::MUL: {
32407 EVT VT = N->getValueType(0);
32408 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32409 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
32410 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
32411 // elements are needed.
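    // e.g. a v2i8 multiply is performed as v2i16 and truncated back, and the
    // v2i8 result then fills the low lanes of a v16i8 concat-with-undef to
    // satisfy the widened result type.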
32412 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
32413 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
32414 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
32415 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
32416 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32417 unsigned NumConcats = 16 / VT.getVectorNumElements();
32418 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32419 ConcatOps[0] = Res;
32420 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
32421 Results.push_back(Res);
32422 return;
32423 }
32424 case ISD::SMULO:
32425 case ISD::UMULO: {
32426 EVT VT = N->getValueType(0);
32427 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32428 VT == MVT::v2i32 && "Unexpected VT!");
32429 bool IsSigned = N->getOpcode() == ISD::SMULO;
32430 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
32431 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
32432 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
32433 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
32434 // Extract the high 32 bits from each result using PSHUFD.
32435 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
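    // The v2i64 products viewed as v4i32 are [lo0, hi0, lo1, hi1] on x86, so
    // the {1, 3} shuffle below extracts exactly the two high halves.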
32436 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
32437 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
32438 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
32439 DAG.getIntPtrConstant(0, dl));
32440
32441 // Truncate the low bits of the result. This will become PSHUFD.
32442 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32443
32444 SDValue HiCmp;
32445 if (IsSigned) {
32446 // SMULO overflows if the high bits don't match the sign of the low.
32447 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
32448 } else {
32449 // UMULO overflows if the high bits are non-zero.
32450 HiCmp = DAG.getConstant(0, dl, VT);
32451 }
32452 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
32453
32454 // Widen the result by padding with undef.
32455 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32456 DAG.getUNDEF(VT));
32457 Results.push_back(Res);
32458 Results.push_back(Ovf);
32459 return;
32460 }
32461 case X86ISD::VPMADDWD: {
32462 // Legalize types for X86ISD::VPMADDWD by widening.
32463 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32464
32465 EVT VT = N->getValueType(0);
32466 EVT InVT = N->getOperand(0).getValueType();
32467 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
32468 "Expected a VT that divides into 128 bits.");
32469 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
32470 "Unexpected type action!");
32471 unsigned NumConcat = 128 / InVT.getSizeInBits();
32472
32473 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
32474 InVT.getVectorElementType(),
32475 NumConcat * InVT.getVectorNumElements());
32476 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
32477 VT.getVectorElementType(),
32478 NumConcat * VT.getVectorNumElements());
32479
32480 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
32481 Ops[0] = N->getOperand(0);
32482 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32483 Ops[0] = N->getOperand(1);
32484 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32485
32486 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
32487 Results.push_back(Res);
32488 return;
32489 }
32490 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
32491 case X86ISD::FMINC:
32492 case X86ISD::FMIN:
32493 case X86ISD::FMAXC:
32494 case X86ISD::FMAX: {
32495 EVT VT = N->getValueType(0);
32496 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
32497 SDValue UNDEF = DAG.getUNDEF(VT);
32498 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32499 N->getOperand(0), UNDEF);
32500 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32501 N->getOperand(1), UNDEF);
32502 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
32503 return;
32504 }
32505 case ISD::SDIV:
32506 case ISD::UDIV:
32507 case ISD::SREM:
32508 case ISD::UREM: {
32509 EVT VT = N->getValueType(0);
32510 if (VT.isVector()) {
32511 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32512 "Unexpected type action!");
32513 // If this RHS is a constant splat vector we can widen this and let
32514 // division/remainder by constant optimize it.
32515 // TODO: Can we do something for non-splat?
32516 APInt SplatVal;
32517 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
32518 unsigned NumConcats = 128 / VT.getSizeInBits();
32519 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
32520 Ops0[0] = N->getOperand(0);
32521 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
32522 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
32523 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
32524 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
32525 Results.push_back(Res);
32526 }
32527 return;
32528 }
32529
32530 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
32531 Results.push_back(V);
32532 return;
32533 }
32534 case ISD::TRUNCATE: {
32535 MVT VT = N->getSimpleValueType(0);
32536 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
32537 return;
32538
32539 // The generic legalizer will try to widen the input type to the same
32540 // number of elements as the widened result type. But this isn't always
32541 // the best thing so do some custom legalization to avoid some cases.
32542 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
32543 SDValue In = N->getOperand(0);
32544 EVT InVT = In.getValueType();
32545 EVT InEltVT = InVT.getVectorElementType();
32546 EVT EltVT = VT.getVectorElementType();
32547 unsigned MinElts = VT.getVectorNumElements();
32548 unsigned WidenNumElts = WidenVT.getVectorNumElements();
32549 unsigned InBits = InVT.getSizeInBits();
32550
32551 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
32552 unsigned PackOpcode;
32553 if (SDValue Src =
32554 matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG, Subtarget)) {
32555 if (SDValue Res = truncateVectorWithPACK(PackOpcode, VT, Src,
32556 dl, DAG, Subtarget)) {
32557 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
32558 Results.push_back(Res);
32559 return;
32560 }
32561 }
32562
32563 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
32564 // 128-bit and smaller inputs should avoid the truncate altogether and
32565 // use a shuffle.
32566 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
32567 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
32568 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
32569 for (unsigned I = 0; I < MinElts; ++I)
32570 TruncMask[I] = Scale * I;
32571 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
32572 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
32573 "Illegal vector type in truncation");
32574 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
32575 Results.push_back(
32576 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
32577 return;
32578 }
32579 }
32580
32581 // With AVX512 there are some cases that can use a target specific
32582 // truncate node to go from 256/512 to less than 128 with zeros in the
32583 // upper elements of the 128 bit result.
32584 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
32585 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
32586 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
32587 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32588 return;
32589 }
32590 // There's one case we can widen to 512 bits and use VTRUNC.
32591 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
32592 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
32593 DAG.getUNDEF(MVT::v4i64));
32594 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32595 return;
32596 }
32597 }
32598 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
32599 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
32600 isTypeLegal(MVT::v4i64)) {
32601 // Input needs to be split and output needs to widened. Let's use two
32602 // VTRUNCs, and shuffle their results together into the wider type.
32603 SDValue Lo, Hi;
32604 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
32605
32606 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
32607 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
32608 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
32609 { 0, 1, 2, 3, 16, 17, 18, 19,
32610 -1, -1, -1, -1, -1, -1, -1, -1 });
32611 Results.push_back(Res);
32612 return;
32613 }
32614
32615 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
32616 // this via type legalization.
32617 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
32618 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
32619 (!Subtarget.hasSSSE3() ||
32620 (!isTypeLegal(InVT) &&
32621 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
32622 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
32623 InEltVT.getSizeInBits() * WidenNumElts);
32624 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
32625 return;
32626 }
32627
32628 return;
32629 }
32630 case ISD::ANY_EXTEND:
32631 // Right now, only MVT::v8i8 has Custom action for an illegal type.
32632 // It's intended to custom handle the input type.
32633 assert(N->getValueType(0) == MVT::v8i8 &&
32634 "Do not know how to legalize this Node");
32635 return;
32636 case ISD::SIGN_EXTEND:
32637 case ISD::ZERO_EXTEND: {
32638 EVT VT = N->getValueType(0);
32639 SDValue In = N->getOperand(0);
32640 EVT InVT = In.getValueType();
32641 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
32642 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
32643 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
32644 "Unexpected type action!");
32645 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
32646 // Custom split this so we can extend i8/i16->i32 invec. This is better
32647 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
32648 // sra, then an extend from i32 to i64 using pcmpgt. By custom splitting
32649 // we allow the sra from the extend to i32 to be shared by the split.
32650 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
32651
32652 // Fill a vector with sign bits for each element.
32653 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
32654 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
32655
32656 // Create an unpackl and unpackh to interleave the sign bits then bitcast
32657 // to v2i64.
32658 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32659 {0, 4, 1, 5});
32660 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
32661 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32662 {2, 6, 3, 7});
32663 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
32664
32665 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32666 Results.push_back(Res);
32667 return;
32668 }
32669
32670 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
32671 if (!InVT.is128BitVector()) {
32672 // Not a 128 bit vector, but maybe type legalization will promote
32673 // it to 128 bits.
32674 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
32675 return;
32676 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
32677 if (!InVT.is128BitVector())
32678 return;
32679
32680 // Promote the input to 128 bits. Type legalization will turn this into
32681 // zext_inreg/sext_inreg.
32682 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
32683 }
32684
32685 // Perform custom splitting instead of the two stage extend we would get
32686 // by default.
32687 EVT LoVT, HiVT;
32688 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
32689 assert(isTypeLegal(LoVT) && "Split VT not legal?");
32690
32691 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
32692
32693 // We need to shift the input over by half the number of elements.
32694 unsigned NumElts = InVT.getVectorNumElements();
32695 unsigned HalfNumElts = NumElts / 2;
32696 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
32697 for (unsigned i = 0; i != HalfNumElts; ++i)
32698 ShufMask[i] = i + HalfNumElts;
32699
32700 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
32701 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
32702
32703 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32704 Results.push_back(Res);
32705 }
32706 return;
32707 }
32708 case ISD::FP_TO_SINT:
32709 case ISD::STRICT_FP_TO_SINT:
32710 case ISD::FP_TO_UINT:
32711 case ISD::STRICT_FP_TO_UINT: {
32712 bool IsStrict = N->isStrictFPOpcode();
32713 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
32714 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
32715 EVT VT = N->getValueType(0);
32716 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32717 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
32718 EVT SrcVT = Src.getValueType();
32719
32720 SDValue Res;
32721 if (isSoftF16(SrcVT, Subtarget)) {
32722 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
32723 if (IsStrict) {
32724 Res =
32725 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
32726 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
32727 {NVT, MVT::Other}, {Chain, Src})});
32728 Chain = Res.getValue(1);
32729 } else {
32730 Res = DAG.getNode(N->getOpcode(), dl, VT,
32731 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
32732 }
32733 Results.push_back(Res);
32734 if (IsStrict)
32735 Results.push_back(Chain);
32736
32737 return;
32738 }
32739
32740 if (VT.isVector() && Subtarget.hasFP16() &&
32741 SrcVT.getVectorElementType() == MVT::f16) {
32742 EVT EleVT = VT.getVectorElementType();
32743 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
32744
32745 if (SrcVT != MVT::v8f16) {
32746 SDValue Tmp =
32747 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
32748 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
32749 Ops[0] = Src;
32750 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
32751 }
32752
32753 if (IsStrict) {
32754 unsigned Opc =
32755 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32756 Res =
32757 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
32758 Chain = Res.getValue(1);
32759 } else {
32760 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32761 Res = DAG.getNode(Opc, dl, ResVT, Src);
32762 }
32763
32764 // TODO: Need to add exception check code for strict FP.
32765 if (EleVT.getSizeInBits() < 16) {
32766 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
32767 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
32768
32769 // Now widen to 128 bits.
32770 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
32771 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
32772 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
32773 ConcatOps[0] = Res;
32774 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32775 }
32776
32777 Results.push_back(Res);
32778 if (IsStrict)
32779 Results.push_back(Chain);
32780
32781 return;
32782 }
32783
32784 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
32785 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32786 "Unexpected type action!");
32787
32788 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
32789 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
32790 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
32791 VT.getVectorNumElements());
32792 SDValue Res;
32793 SDValue Chain;
32794 if (IsStrict) {
32795 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
32796 {N->getOperand(0), Src});
32797 Chain = Res.getValue(1);
32798 } else
32799 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
32800
32801 // Preserve what we know about the size of the original result. If the
32802 // result is v2i32, we have to manually widen the assert.
32803 if (PromoteVT == MVT::v2i32)
32804 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32805 DAG.getUNDEF(MVT::v2i32));
32806
32807 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
32808 Res.getValueType(), Res,
32809 DAG.getValueType(VT.getVectorElementType()));
32810
32811 if (PromoteVT == MVT::v2i32)
32812 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
32813 DAG.getIntPtrConstant(0, dl));
32814
32815 // Truncate back to the original width.
32816 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32817
32818 // Now widen to 128 bits.
32819 unsigned NumConcats = 128 / VT.getSizeInBits();
32821 VT.getVectorNumElements() * NumConcats);
32822 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32823 ConcatOps[0] = Res;
32824 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32825 Results.push_back(Res);
32826 if (IsStrict)
32827 Results.push_back(Chain);
32828 return;
32829 }
32830
32831
32832 if (VT == MVT::v2i32) {
32833 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
32834 "Strict unsigned conversion requires AVX512");
32835 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32836 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32837 "Unexpected type action!");
32838 if (Src.getValueType() == MVT::v2f64) {
32839 if (!IsSigned && !Subtarget.hasAVX512()) {
32840 SDValue Res =
32841 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
32842 Results.push_back(Res);
32843 return;
32844 }
32845
32846 unsigned Opc;
32847 if (IsStrict)
32848 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32849 else
32850 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32851
32852 // If we have VLX we can emit a target specific FP_TO_UINT node.
32853 if (!IsSigned && !Subtarget.hasVLX()) {
32854 // Otherwise we can defer to the generic legalizer which will widen
32855 // the input as well. This will be further widened during op
32856 // legalization to v8i32<-v8f64.
32857 // For strict nodes we'll need to widen ourselves.
32858 // FIXME: Fix the type legalizer to safely widen strict nodes?
32859 if (!IsStrict)
32860 return;
32861 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
32862 DAG.getConstantFP(0.0, dl, MVT::v2f64));
32863 Opc = N->getOpcode();
32864 }
32865 SDValue Res;
32866 SDValue Chain;
32867 if (IsStrict) {
32868 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
32869 {N->getOperand(0), Src});
32870 Chain = Res.getValue(1);
32871 } else {
32872 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
32873 }
32874 Results.push_back(Res);
32875 if (IsStrict)
32876 Results.push_back(Chain);
32877 return;
32878 }
32879
32880 // Custom widen strict v2f32->v2i32 by padding with zeros.
32881 // FIXME: Should generic type legalizer do this?
32882 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
32883 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
32884 DAG.getConstantFP(0.0, dl, MVT::v2f32));
32885 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
32886 {N->getOperand(0), Src});
32887 Results.push_back(Res);
32888 Results.push_back(Res.getValue(1));
32889 return;
32890 }
32891
32892 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
32893 // so early out here.
32894 return;
32895 }
32896
32897 assert(!VT.isVector() && "Vectors should have been handled above!");
32898
32899 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
32900 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
32901 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
32902 assert(!Subtarget.is64Bit() && "i64 should be legal");
32903 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
32904 // If we use a 128-bit result we might need to use a target specific node.
32905 unsigned SrcElts =
32906 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
32907 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
32908 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
32909 unsigned Opc = N->getOpcode();
32910 if (NumElts != SrcElts) {
32911 if (IsStrict)
32912 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32913 else
32914 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32915 }
32916
32917 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
32918 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
32919 DAG.getConstantFP(0.0, dl, VecInVT), Src,
32920 ZeroIdx);
32921 SDValue Chain;
32922 if (IsStrict) {
32923 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
32924 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
32925 Chain = Res.getValue(1);
32926 } else
32927 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
32928 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
32929 Results.push_back(Res);
32930 if (IsStrict)
32931 Results.push_back(Chain);
32932 return;
32933 }
32934
32935 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
32936 SDValue Chain;
32937 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
32938 Results.push_back(V);
32939 if (IsStrict)
32940 Results.push_back(Chain);
32941 return;
32942 }
32943
32944 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
32945 Results.push_back(V);
32946 if (IsStrict)
32947 Results.push_back(Chain);
32948 }
32949 return;
32950 }
32951 case ISD::LRINT:
32952 case ISD::LLRINT: {
32953 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
32954 Results.push_back(V);
32955 return;
32956 }
32957
32958 case ISD::SINT_TO_FP:
32959 case ISD::STRICT_SINT_TO_FP:
32960 case ISD::UINT_TO_FP:
32961 case ISD::STRICT_UINT_TO_FP: {
32962 bool IsStrict = N->isStrictFPOpcode();
32963 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
32964 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
32965 EVT VT = N->getValueType(0);
32966 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32967 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
32968 Subtarget.hasVLX()) {
32969 if (Src.getValueType().getVectorElementType() == MVT::i16)
32970 return;
32971
32972 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
32973 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
32974 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
32975 : DAG.getUNDEF(MVT::v2i32));
32976 if (IsStrict) {
32977 unsigned Opc =
32978 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
32979 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
32980 {N->getOperand(0), Src});
32981 Results.push_back(Res);
32982 Results.push_back(Res.getValue(1));
32983 } else {
32984 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32985 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
32986 }
32987 return;
32988 }
32989 if (VT != MVT::v2f32)
32990 return;
32991 EVT SrcVT = Src.getValueType();
32992 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
32993 if (IsStrict) {
32994 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
32995 : X86ISD::STRICT_CVTUI2P;
32996 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
32997 {N->getOperand(0), Src});
32998 Results.push_back(Res);
32999 Results.push_back(Res.getValue(1));
33000 } else {
33001 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
33002 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
33003 }
33004 return;
33005 }
33006 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
33007 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
33008 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
33009 SDValue One = DAG.getConstant(1, dl, SrcVT);
33010 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
33011 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
33012 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
33013 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
33014 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
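      // Lanes with the sign bit set are converted from Src/2 (rounded to odd
      // via the OR of the shifted-out bit) and doubled by the FADD below; the
      // final select keeps the direct conversion for the small lanes.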
33015 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
33016 for (int i = 0; i != 2; ++i) {
33017 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
33018 SignSrc, DAG.getIntPtrConstant(i, dl));
33019 if (IsStrict)
33020 SignCvts[i] =
33021 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
33022 {N->getOperand(0), Elt});
33023 else
33024 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
33025 };
33026 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
33027 SDValue Slow, Chain;
33028 if (IsStrict) {
33029 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33030 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
33031 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
33032 {Chain, SignCvt, SignCvt});
33033 Chain = Slow.getValue(1);
33034 } else {
33035 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
33036 }
33037 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
33038 IsNeg =
33039 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
33040 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
33041 Results.push_back(Cvt);
33042 if (IsStrict)
33043 Results.push_back(Chain);
33044 return;
33045 }
33046
33047 if (SrcVT != MVT::v2i32)
33048 return;
33049
33050 if (IsSigned || Subtarget.hasAVX512()) {
33051 if (!IsStrict)
33052 return;
33053
33054 // Custom widen strict v2i32->v2f32 to avoid scalarization.
33055 // FIXME: Should generic type legalizer do this?
33056 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
33057 DAG.getConstant(0, dl, MVT::v2i32));
33058 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
33059 {N->getOperand(0), Src});
33060 Results.push_back(Res);
33061 Results.push_back(Res.getValue(1));
33062 return;
33063 }
33064
33065 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33066 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
33067 SDValue VBias = DAG.getConstantFP(
33068 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
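    // 0x4330000000000000 is 2^52 as a double. OR-ing the zero-extended i32
    // into the low mantissa bits of that constant yields exactly 2^52 + x, so
    // the FSUB of 2^52 below recovers x as a double without any int->fp
    // instruction.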
33069 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
33070 DAG.getBitcast(MVT::v2i64, VBias));
33071 Or = DAG.getBitcast(MVT::v2f64, Or);
33072 if (IsStrict) {
33073 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
33074 {N->getOperand(0), Or, VBias});
33075 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
33076 {MVT::v4f32, MVT::Other},
33077 {Sub.getValue(1), Sub});
33078 Results.push_back(Res);
33079 Results.push_back(Res.getValue(1));
33080 } else {
33081 // TODO: Are there any fast-math-flags to propagate here?
33082 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
33083 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
33084 }
33085 return;
33086 }
33087 case ISD::STRICT_FP_ROUND:
33088 case ISD::FP_ROUND: {
33089 bool IsStrict = N->isStrictFPOpcode();
33090 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
33091 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33092 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
33093 EVT SrcVT = Src.getValueType();
33094 EVT VT = N->getValueType(0);
33095 SDValue V;
33096 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
33097 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
33098 : DAG.getUNDEF(MVT::v2f32);
33099 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
33100 }
33101 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
33102 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
33103 if (SrcVT.getVectorElementType() != MVT::f32)
33104 return;
33105
33106 if (IsStrict)
33107 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
33108 {Chain, Src, Rnd});
33109 else
33110 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
33111
33112 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
33113 if (IsStrict)
33114 Results.push_back(V.getValue(1));
33115 return;
33116 }
33117 if (!isTypeLegal(Src.getValueType()))
33118 return;
33119 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
33120 if (IsStrict)
33121 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
33122 {Chain, Src});
33123 else
33124 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
33125 Results.push_back(V);
33126 if (IsStrict)
33127 Results.push_back(V.getValue(1));
33128 return;
33129 }
33130 case ISD::FP_EXTEND:
33131 case ISD::STRICT_FP_EXTEND: {
33132 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
33133 // No other ValueType for FP_EXTEND should reach this point.
33134 assert(N->getValueType(0) == MVT::v2f32 &&
33135 "Do not know how to legalize this Node");
33136 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
33137 return;
33138 bool IsStrict = N->isStrictFPOpcode();
33139 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33140 if (Src.getValueType().getVectorElementType() != MVT::f16)
33141 return;
33142 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
33143 : DAG.getUNDEF(MVT::v2f16);
33144 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
33145 if (IsStrict)
33146 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
33147 {N->getOperand(0), V});
33148 else
33149 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
33150 Results.push_back(V);
33151 if (IsStrict)
33152 Results.push_back(V.getValue(1));
33153 return;
33154 }
33155 case ISD::INTRINSIC_W_CHAIN: {
33156 unsigned IntNo = N->getConstantOperandVal(1);
33157 switch (IntNo) {
33158 default : llvm_unreachable("Do not know how to custom type "
33159 "legalize this intrinsic operation!");
33160 case Intrinsic::x86_rdtsc:
33161 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
33162 Results);
33163 case Intrinsic::x86_rdtscp:
33164 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
33165 Results);
33166 case Intrinsic::x86_rdpmc:
33167 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
33168 Results);
33169 return;
33170 case Intrinsic::x86_rdpru:
33171 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
33172 Results);
33173 return;
33174 case Intrinsic::x86_xgetbv:
33175 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
33176 Results);
33177 return;
33178 }
33179 }
33180 case ISD::READCYCLECOUNTER: {
33181 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
33182 }
33183 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
33184 EVT T = N->getValueType(0);
33185 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
33186 bool Regs64bit = T == MVT::i128;
33187 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
33188 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
33189 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
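// CMPXCHG8B/CMPXCHG16B expect the old value in EDX:EAX (RDX:RAX) and the
// replacement in ECX:EBX (RCX:RBX), and report success in ZF, so the halves
// are copied into those registers here and success is read back from EFLAGS.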
33190 SDValue cpInL, cpInH;
33191 std::tie(cpInL, cpInH) =
33192 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
33193 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
33194 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
33195 cpInH =
33196 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
33197 cpInH, cpInL.getValue(1));
33198 SDValue swapInL, swapInH;
33199 std::tie(swapInL, swapInH) =
33200 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
33201 swapInH =
33202 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
33203 swapInH, cpInH.getValue(1));
33204
33205 // In 64-bit mode we might need the base pointer in RBX, but we can't know
33206 // until later. So we keep the RBX input in a vreg and use a custom
33207 // inserter.
33208 // Since RBX will be a reserved register, the register allocator will not
33209 // automatically ensure that its value is saved and restored around this
33210 // live-range.
33211 SDValue Result;
33212 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
33213 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
33214 if (Regs64bit) {
33215 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
33216 swapInH.getValue(1)};
33217 Result =
33218 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
33219 } else {
33220 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
33221 swapInH.getValue(1));
33222 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
33223 swapInL.getValue(1)};
33224 Result =
33225 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
33226 }
33227
33228 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
33229 Regs64bit ? X86::RAX : X86::EAX,
33230 HalfT, Result.getValue(1));
33231 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
33232 Regs64bit ? X86::RDX : X86::EDX,
33233 HalfT, cpOutL.getValue(2));
33234 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
33235
33236 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
33237 MVT::i32, cpOutH.getValue(2));
33238 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
33239 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
33240
33241 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
33242 Results.push_back(Success);
33243 Results.push_back(EFLAGS.getValue(1));
33244 return;
33245 }
33246 case ISD::ATOMIC_LOAD: {
33247 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33248 bool NoImplicitFloatOps =
33249 DAG.getMachineFunction().getFunction().hasFnAttribute(
33250 Attribute::NoImplicitFloat);
33251 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
33252 auto *Node = cast<AtomicSDNode>(N);
33253 if (Subtarget.hasSSE1()) {
33254 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
33255 // Then extract the lower 64-bits.
33256 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33257 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
33258 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
33259 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
33260 MVT::i64, Node->getMemOperand());
33261 if (Subtarget.hasSSE2()) {
33262 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
33263 DAG.getIntPtrConstant(0, dl));
33264 Results.push_back(Res);
33265 Results.push_back(Ld.getValue(1));
33266 return;
33267 }
33268 // We use an alternative sequence for SSE1 that extracts as v2f32 and
33269 // then casts to i64. This avoids a 128-bit stack temporary being
33270 // created by type legalization if we were to cast v4f32->v2i64.
33271 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
33272 DAG.getIntPtrConstant(0, dl));
33273 Res = DAG.getBitcast(MVT::i64, Res);
33274 Results.push_back(Res);
33275 Results.push_back(Ld.getValue(1));
33276 return;
33277 }
33278 if (Subtarget.hasX87()) {
33279 // First load this into an 80-bit X87 register. This will put the whole
33280 // integer into the significand.
33281 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33282 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
33283 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
33284 dl, Tys, Ops, MVT::i64,
33285 Node->getMemOperand());
33286 SDValue Chain = Result.getValue(1);
33287
33288 // Now store the X87 register to a stack temporary and convert to i64.
33289 // This store is not atomic and doesn't need to be.
33290 // FIXME: We don't need a stack temporary if the result of the load
33291 // is already being stored. We could just directly store there.
33292 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33293 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33294 MachinePointerInfo MPI =
33295 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33296 SDValue StoreOps[] = { Chain, Result, StackPtr };
33297 Chain = DAG.getMemIntrinsicNode(
33298 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
33299 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
33300
33301 // Finally load the value back from the stack temporary and return it.
33302 // This load is not atomic and doesn't need to be.
33303 // This load will be further type legalized.
33304 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
33305 Results.push_back(Result);
33306 Results.push_back(Result.getValue(1));
33307 return;
33308 }
33309 }
33310 // TODO: Use MOVLPS when SSE1 is available?
33311 // Delegate to generic TypeLegalization. Situations we can really handle
33312 // should have already been dealt with by AtomicExpandPass.cpp.
33313 break;
33314 }
33315 case ISD::ATOMIC_SWAP:
33316 case ISD::ATOMIC_LOAD_ADD:
33317 case ISD::ATOMIC_LOAD_SUB:
33318 case ISD::ATOMIC_LOAD_AND:
33319 case ISD::ATOMIC_LOAD_OR:
33320 case ISD::ATOMIC_LOAD_XOR:
33321 case ISD::ATOMIC_LOAD_NAND:
33322 case ISD::ATOMIC_LOAD_MIN:
33323 case ISD::ATOMIC_LOAD_MAX:
33324 case ISD::ATOMIC_LOAD_UMIN:
33325 case ISD::ATOMIC_LOAD_UMAX:
33326 // Delegate to generic TypeLegalization. Situations we can really handle
33327 // should have already been dealt with by AtomicExpandPass.cpp.
33328 break;
33329
33330 case ISD::BITCAST: {
33331 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33332 EVT DstVT = N->getValueType(0);
33333 EVT SrcVT = N->getOperand(0).getValueType();
33334
33335 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
33336 // we can split using the k-register rather than memory.
33337 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
33338 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
33339 SDValue Lo, Hi;
33340 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33341 Lo = DAG.getBitcast(MVT::i32, Lo);
33342 Hi = DAG.getBitcast(MVT::i32, Hi);
33343 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
33344 Results.push_back(Res);
33345 return;
33346 }
33347
33348 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
33349 // FIXME: Use v4f32 for SSE1?
33350 assert(Subtarget.hasSSE2() && "Requires SSE2");
33351 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
33352 "Unexpected type action!");
33353 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
33354 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
33355 N->getOperand(0));
33356 Res = DAG.getBitcast(WideVT, Res);
33357 Results.push_back(Res);
33358 return;
33359 }
33360
33361 return;
33362 }
33363 case ISD::MGATHER: {
33364 EVT VT = N->getValueType(0);
33365 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
33366 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
33367 auto *Gather = cast<MaskedGatherSDNode>(N);
33368 SDValue Index = Gather->getIndex();
33369 if (Index.getValueType() != MVT::v2i64)
33370 return;
33372 "Unexpected type action!");
33373 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
33374 SDValue Mask = Gather->getMask();
33375 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33376 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
33377 Gather->getPassThru(),
33378 DAG.getUNDEF(VT));
33379 if (!Subtarget.hasVLX()) {
33380 // We need to widen the mask, but the instruction will only use 2
33381 // of its elements. So we can use undef.
33382 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
33383 DAG.getUNDEF(MVT::v2i1));
33384 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
33385 }
33386 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
33387 Gather->getBasePtr(), Index, Gather->getScale() };
33388 SDValue Res = DAG.getMemIntrinsicNode(
33389 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
33390 Gather->getMemoryVT(), Gather->getMemOperand());
33391 Results.push_back(Res);
33392 Results.push_back(Res.getValue(1));
33393 return;
33394 }
33395 return;
33396 }
33397 case ISD::LOAD: {
33398 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
33399 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
33400 // cast since type legalization will try to use an i64 load.
33401 MVT VT = N->getSimpleValueType(0);
33402 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
33404 "Unexpected type action!");
33405 if (!ISD::isNON_EXTLoad(N))
33406 return;
33407 auto *Ld = cast<LoadSDNode>(N);
33408 if (Subtarget.hasSSE2()) {
33409 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
33410 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
33411 Ld->getPointerInfo(), Ld->getOriginalAlign(),
33412 Ld->getMemOperand()->getFlags());
33413 SDValue Chain = Res.getValue(1);
33414 MVT VecVT = MVT::getVectorVT(LdVT, 2);
33415 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
33416 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
33417 Res = DAG.getBitcast(WideVT, Res);
33418 Results.push_back(Res);
33419 Results.push_back(Chain);
33420 return;
33421 }
33422 assert(Subtarget.hasSSE1() && "Expected SSE");
33423 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
33424 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
33425 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
33426 MVT::i64, Ld->getMemOperand());
33427 Results.push_back(Res);
33428 Results.push_back(Res.getValue(1));
33429 return;
33430 }
33431 case ISD::ADDRSPACECAST: {
33432 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
33433 Results.push_back(V);
33434 return;
33435 }
33436 case ISD::BITREVERSE: {
33437 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33438 assert(Subtarget.hasXOP() && "Expected XOP");
33439 // We can use VPPERM by copying to a vector register and back. We'll need
33440 // to move the scalar in two i32 pieces.
33441 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
33442 return;
33443 }
33444 case ISD::EXTRACT_VECTOR_ELT: {
33445 // f16 = extract vXf16 %vec, i64 %idx
33446 assert(N->getSimpleValueType(0) == MVT::f16 &&
33447 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
33448 assert(Subtarget.hasFP16() && "Expected FP16");
33449 SDValue VecOp = N->getOperand(0);
33450 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
33451 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
33452 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
33453 N->getOperand(1));
33454 Split = DAG.getBitcast(MVT::f16, Split);
33455 Results.push_back(Split);
33456 return;
33457 }
33458 }
33459}
33460
33461const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
33462 switch ((X86ISD::NodeType)Opcode) {
33463 case X86ISD::FIRST_NUMBER: break;
33464#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
33465 NODE_NAME_CASE(BSF)
33466 NODE_NAME_CASE(BSR)
33467 NODE_NAME_CASE(FSHL)
33468 NODE_NAME_CASE(FSHR)
33469 NODE_NAME_CASE(FAND)
33470 NODE_NAME_CASE(FANDN)
33471 NODE_NAME_CASE(FOR)
33472 NODE_NAME_CASE(FXOR)
33473 NODE_NAME_CASE(FILD)
33474 NODE_NAME_CASE(FIST)
33475 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
33476 NODE_NAME_CASE(FLD)
33477 NODE_NAME_CASE(FST)
33478 NODE_NAME_CASE(CALL)
33479 NODE_NAME_CASE(CALL_RVMARKER)
33480 NODE_NAME_CASE(BT)
33481 NODE_NAME_CASE(CMP)
33482 NODE_NAME_CASE(FCMP)
33483 NODE_NAME_CASE(STRICT_FCMP)
33484 NODE_NAME_CASE(STRICT_FCMPS)
33485 NODE_NAME_CASE(COMI)
33486 NODE_NAME_CASE(UCOMI)
33487 NODE_NAME_CASE(CMPM)
33488 NODE_NAME_CASE(CMPMM)
33489 NODE_NAME_CASE(STRICT_CMPM)
33490 NODE_NAME_CASE(CMPMM_SAE)
33491 NODE_NAME_CASE(SETCC)
33492 NODE_NAME_CASE(SETCC_CARRY)
33493 NODE_NAME_CASE(FSETCC)
33494 NODE_NAME_CASE(FSETCCM)
33495 NODE_NAME_CASE(FSETCCM_SAE)
33496 NODE_NAME_CASE(CMOV)
33497 NODE_NAME_CASE(BRCOND)
33498 NODE_NAME_CASE(RET_GLUE)
33499 NODE_NAME_CASE(IRET)
33500 NODE_NAME_CASE(REP_STOS)
33501 NODE_NAME_CASE(REP_MOVS)
33502 NODE_NAME_CASE(GlobalBaseReg)
33503 NODE_NAME_CASE(Wrapper)
33504 NODE_NAME_CASE(WrapperRIP)
33505 NODE_NAME_CASE(MOVQ2DQ)
33506 NODE_NAME_CASE(MOVDQ2Q)
33507 NODE_NAME_CASE(MMX_MOVD2W)
33508 NODE_NAME_CASE(MMX_MOVW2D)
33509 NODE_NAME_CASE(PEXTRB)
33510 NODE_NAME_CASE(PEXTRW)
33511 NODE_NAME_CASE(INSERTPS)
33512 NODE_NAME_CASE(PINSRB)
33513 NODE_NAME_CASE(PINSRW)
33514 NODE_NAME_CASE(PSHUFB)
33515 NODE_NAME_CASE(ANDNP)
33516 NODE_NAME_CASE(BLENDI)
33517 NODE_NAME_CASE(BLENDV)
33518 NODE_NAME_CASE(HADD)
33519 NODE_NAME_CASE(HSUB)
33520 NODE_NAME_CASE(FHADD)
33521 NODE_NAME_CASE(FHSUB)
33522 NODE_NAME_CASE(CONFLICT)
33523 NODE_NAME_CASE(FMAX)
33524 NODE_NAME_CASE(FMAXS)
33525 NODE_NAME_CASE(FMAX_SAE)
33526 NODE_NAME_CASE(FMAXS_SAE)
33527 NODE_NAME_CASE(FMIN)
33528 NODE_NAME_CASE(FMINS)
33529 NODE_NAME_CASE(FMIN_SAE)
33530 NODE_NAME_CASE(FMINS_SAE)
33531 NODE_NAME_CASE(FMAXC)
33532 NODE_NAME_CASE(FMINC)
33533 NODE_NAME_CASE(FRSQRT)
33534 NODE_NAME_CASE(FRCP)
33535 NODE_NAME_CASE(EXTRQI)
33536 NODE_NAME_CASE(INSERTQI)
33537 NODE_NAME_CASE(TLSADDR)
33538 NODE_NAME_CASE(TLSBASEADDR)
33539 NODE_NAME_CASE(TLSCALL)
33540 NODE_NAME_CASE(TLSDESC)
33541 NODE_NAME_CASE(EH_SJLJ_SETJMP)
33542 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
33543 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
33544 NODE_NAME_CASE(EH_RETURN)
33545 NODE_NAME_CASE(TC_RETURN)
33546 NODE_NAME_CASE(FNSTCW16m)
33547 NODE_NAME_CASE(FLDCW16m)
33548 NODE_NAME_CASE(FNSTENVm)
33549 NODE_NAME_CASE(FLDENVm)
33550 NODE_NAME_CASE(LCMPXCHG_DAG)
33551 NODE_NAME_CASE(LCMPXCHG8_DAG)
33552 NODE_NAME_CASE(LCMPXCHG16_DAG)
33553 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
33554 NODE_NAME_CASE(LADD)
33555 NODE_NAME_CASE(LSUB)
33556 NODE_NAME_CASE(LOR)
33557 NODE_NAME_CASE(LXOR)
33558 NODE_NAME_CASE(LAND)
33559 NODE_NAME_CASE(LBTS)
33560 NODE_NAME_CASE(LBTC)
33561 NODE_NAME_CASE(LBTR)
33562 NODE_NAME_CASE(LBTS_RM)
33563 NODE_NAME_CASE(LBTC_RM)
33564 NODE_NAME_CASE(LBTR_RM)
33565 NODE_NAME_CASE(AADD)
33566 NODE_NAME_CASE(AOR)
33567 NODE_NAME_CASE(AXOR)
33568 NODE_NAME_CASE(AAND)
33569 NODE_NAME_CASE(VZEXT_MOVL)
33570 NODE_NAME_CASE(VZEXT_LOAD)
33571 NODE_NAME_CASE(VEXTRACT_STORE)
33572 NODE_NAME_CASE(VTRUNC)
33573 NODE_NAME_CASE(VTRUNCS)
33574 NODE_NAME_CASE(VTRUNCUS)
33575 NODE_NAME_CASE(VMTRUNC)
33576 NODE_NAME_CASE(VMTRUNCS)
33577 NODE_NAME_CASE(VMTRUNCUS)
33578 NODE_NAME_CASE(VTRUNCSTORES)
33579 NODE_NAME_CASE(VTRUNCSTOREUS)
33580 NODE_NAME_CASE(VMTRUNCSTORES)
33581 NODE_NAME_CASE(VMTRUNCSTOREUS)
33582 NODE_NAME_CASE(VFPEXT)
33583 NODE_NAME_CASE(STRICT_VFPEXT)
33584 NODE_NAME_CASE(VFPEXT_SAE)
33585 NODE_NAME_CASE(VFPEXTS)
33586 NODE_NAME_CASE(VFPEXTS_SAE)
33587 NODE_NAME_CASE(VFPROUND)
33588 NODE_NAME_CASE(STRICT_VFPROUND)
33589 NODE_NAME_CASE(VMFPROUND)
33590 NODE_NAME_CASE(VFPROUND_RND)
33591 NODE_NAME_CASE(VFPROUNDS)
33592 NODE_NAME_CASE(VFPROUNDS_RND)
33593 NODE_NAME_CASE(VSHLDQ)
33594 NODE_NAME_CASE(VSRLDQ)
33595 NODE_NAME_CASE(VSHL)
33596 NODE_NAME_CASE(VSRL)
33597 NODE_NAME_CASE(VSRA)
33598 NODE_NAME_CASE(VSHLI)
33599 NODE_NAME_CASE(VSRLI)
33600 NODE_NAME_CASE(VSRAI)
33601 NODE_NAME_CASE(VSHLV)
33602 NODE_NAME_CASE(VSRLV)
33603 NODE_NAME_CASE(VSRAV)
33604 NODE_NAME_CASE(VROTLI)
33605 NODE_NAME_CASE(VROTRI)
33606 NODE_NAME_CASE(VPPERM)
33607 NODE_NAME_CASE(CMPP)
33608 NODE_NAME_CASE(STRICT_CMPP)
33609 NODE_NAME_CASE(PCMPEQ)
33610 NODE_NAME_CASE(PCMPGT)
33611 NODE_NAME_CASE(PHMINPOS)
33612 NODE_NAME_CASE(ADD)
33613 NODE_NAME_CASE(SUB)
33614 NODE_NAME_CASE(ADC)
33615 NODE_NAME_CASE(SBB)
33616 NODE_NAME_CASE(SMUL)
33617 NODE_NAME_CASE(UMUL)
33618 NODE_NAME_CASE(OR)
33619 NODE_NAME_CASE(XOR)
33620 NODE_NAME_CASE(AND)
33621 NODE_NAME_CASE(BEXTR)
33622 NODE_NAME_CASE(BEXTRI)
33623 NODE_NAME_CASE(BZHI)
33624 NODE_NAME_CASE(PDEP)
33625 NODE_NAME_CASE(PEXT)
33626 NODE_NAME_CASE(MUL_IMM)
33627 NODE_NAME_CASE(MOVMSK)
33628 NODE_NAME_CASE(PTEST)
33629 NODE_NAME_CASE(TESTP)
33630 NODE_NAME_CASE(KORTEST)
33631 NODE_NAME_CASE(KTEST)
33632 NODE_NAME_CASE(KADD)
33633 NODE_NAME_CASE(KSHIFTL)
33634 NODE_NAME_CASE(KSHIFTR)
33635 NODE_NAME_CASE(PACKSS)
33636 NODE_NAME_CASE(PACKUS)
33637 NODE_NAME_CASE(PALIGNR)
33638 NODE_NAME_CASE(VALIGN)
33639 NODE_NAME_CASE(VSHLD)
33640 NODE_NAME_CASE(VSHRD)
33641 NODE_NAME_CASE(VSHLDV)
33642 NODE_NAME_CASE(VSHRDV)
33643 NODE_NAME_CASE(PSHUFD)
33644 NODE_NAME_CASE(PSHUFHW)
33645 NODE_NAME_CASE(PSHUFLW)
33646 NODE_NAME_CASE(SHUFP)
33647 NODE_NAME_CASE(SHUF128)
33648 NODE_NAME_CASE(MOVLHPS)
33649 NODE_NAME_CASE(MOVHLPS)
33650 NODE_NAME_CASE(MOVDDUP)
33651 NODE_NAME_CASE(MOVSHDUP)
33652 NODE_NAME_CASE(MOVSLDUP)
33653 NODE_NAME_CASE(MOVSD)
33654 NODE_NAME_CASE(MOVSS)
33655 NODE_NAME_CASE(MOVSH)
33656 NODE_NAME_CASE(UNPCKL)
33657 NODE_NAME_CASE(UNPCKH)
33658 NODE_NAME_CASE(VBROADCAST)
33659 NODE_NAME_CASE(VBROADCAST_LOAD)
33660 NODE_NAME_CASE(VBROADCASTM)
33661 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
33662 NODE_NAME_CASE(VPERMILPV)
33663 NODE_NAME_CASE(VPERMILPI)
33664 NODE_NAME_CASE(VPERM2X128)
33665 NODE_NAME_CASE(VPERMV)
33666 NODE_NAME_CASE(VPERMV3)
33667 NODE_NAME_CASE(VPERMI)
33668 NODE_NAME_CASE(VPTERNLOG)
33669 NODE_NAME_CASE(VFIXUPIMM)
33670 NODE_NAME_CASE(VFIXUPIMM_SAE)
33671 NODE_NAME_CASE(VFIXUPIMMS)
33672 NODE_NAME_CASE(VFIXUPIMMS_SAE)
33673 NODE_NAME_CASE(VRANGE)
33674 NODE_NAME_CASE(VRANGE_SAE)
33675 NODE_NAME_CASE(VRANGES)
33676 NODE_NAME_CASE(VRANGES_SAE)
33677 NODE_NAME_CASE(PMULUDQ)
33678 NODE_NAME_CASE(PMULDQ)
33679 NODE_NAME_CASE(PSADBW)
33680 NODE_NAME_CASE(DBPSADBW)
33681 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
33682 NODE_NAME_CASE(VAARG_64)
33683 NODE_NAME_CASE(VAARG_X32)
33684 NODE_NAME_CASE(DYN_ALLOCA)
33685 NODE_NAME_CASE(MFENCE)
33686 NODE_NAME_CASE(SEG_ALLOCA)
33687 NODE_NAME_CASE(PROBED_ALLOCA)
33688 NODE_NAME_CASE(RDRAND)
33689 NODE_NAME_CASE(RDSEED)
33690 NODE_NAME_CASE(RDPKRU)
33691 NODE_NAME_CASE(WRPKRU)
33692 NODE_NAME_CASE(VPMADDUBSW)
33693 NODE_NAME_CASE(VPMADDWD)
33694 NODE_NAME_CASE(VPSHA)
33695 NODE_NAME_CASE(VPSHL)
33696 NODE_NAME_CASE(VPCOM)
33697 NODE_NAME_CASE(VPCOMU)
33698 NODE_NAME_CASE(VPERMIL2)
33699 NODE_NAME_CASE(FMSUB)
33700 NODE_NAME_CASE(STRICT_FMSUB)
33701 NODE_NAME_CASE(FNMADD)
33702 NODE_NAME_CASE(STRICT_FNMADD)
33703 NODE_NAME_CASE(FNMSUB)
33704 NODE_NAME_CASE(STRICT_FNMSUB)
33705 NODE_NAME_CASE(FMADDSUB)
33706 NODE_NAME_CASE(FMSUBADD)
33707 NODE_NAME_CASE(FMADD_RND)
33708 NODE_NAME_CASE(FNMADD_RND)
33709 NODE_NAME_CASE(FMSUB_RND)
33710 NODE_NAME_CASE(FNMSUB_RND)
33711 NODE_NAME_CASE(FMADDSUB_RND)
33712 NODE_NAME_CASE(FMSUBADD_RND)
33713 NODE_NAME_CASE(VFMADDC)
33714 NODE_NAME_CASE(VFMADDC_RND)
33715 NODE_NAME_CASE(VFCMADDC)
33716 NODE_NAME_CASE(VFCMADDC_RND)
33717 NODE_NAME_CASE(VFMULC)
33718 NODE_NAME_CASE(VFMULC_RND)
33719 NODE_NAME_CASE(VFCMULC)
33720 NODE_NAME_CASE(VFCMULC_RND)
33721 NODE_NAME_CASE(VFMULCSH)
33722 NODE_NAME_CASE(VFMULCSH_RND)
33723 NODE_NAME_CASE(VFCMULCSH)
33724 NODE_NAME_CASE(VFCMULCSH_RND)
33725 NODE_NAME_CASE(VFMADDCSH)
33726 NODE_NAME_CASE(VFMADDCSH_RND)
33727 NODE_NAME_CASE(VFCMADDCSH)
33728 NODE_NAME_CASE(VFCMADDCSH_RND)
33729 NODE_NAME_CASE(VPMADD52H)
33730 NODE_NAME_CASE(VPMADD52L)
33731 NODE_NAME_CASE(VRNDSCALE)
33732 NODE_NAME_CASE(STRICT_VRNDSCALE)
33733 NODE_NAME_CASE(VRNDSCALE_SAE)
33734 NODE_NAME_CASE(VRNDSCALES)
33735 NODE_NAME_CASE(VRNDSCALES_SAE)
33736 NODE_NAME_CASE(VREDUCE)
33737 NODE_NAME_CASE(VREDUCE_SAE)
33738 NODE_NAME_CASE(VREDUCES)
33739 NODE_NAME_CASE(VREDUCES_SAE)
33740 NODE_NAME_CASE(VGETMANT)
33741 NODE_NAME_CASE(VGETMANT_SAE)
33742 NODE_NAME_CASE(VGETMANTS)
33743 NODE_NAME_CASE(VGETMANTS_SAE)
33744 NODE_NAME_CASE(PCMPESTR)
33745 NODE_NAME_CASE(PCMPISTR)
33746 NODE_NAME_CASE(XTEST)
33747 NODE_NAME_CASE(COMPRESS)
33748 NODE_NAME_CASE(EXPAND)
33749 NODE_NAME_CASE(SELECTS)
33750 NODE_NAME_CASE(ADDSUB)
33751 NODE_NAME_CASE(RCP14)
33752 NODE_NAME_CASE(RCP14S)
33753 NODE_NAME_CASE(RCP28)
33754 NODE_NAME_CASE(RCP28_SAE)
33755 NODE_NAME_CASE(RCP28S)
33756 NODE_NAME_CASE(RCP28S_SAE)
33757 NODE_NAME_CASE(EXP2)
33758 NODE_NAME_CASE(EXP2_SAE)
33759 NODE_NAME_CASE(RSQRT14)
33760 NODE_NAME_CASE(RSQRT14S)
33761 NODE_NAME_CASE(RSQRT28)
33762 NODE_NAME_CASE(RSQRT28_SAE)
33763 NODE_NAME_CASE(RSQRT28S)
33764 NODE_NAME_CASE(RSQRT28S_SAE)
33765 NODE_NAME_CASE(FADD_RND)
33766 NODE_NAME_CASE(FADDS)
33767 NODE_NAME_CASE(FADDS_RND)
33768 NODE_NAME_CASE(FSUB_RND)
33769 NODE_NAME_CASE(FSUBS)
33770 NODE_NAME_CASE(FSUBS_RND)
33771 NODE_NAME_CASE(FMUL_RND)
33772 NODE_NAME_CASE(FMULS)
33773 NODE_NAME_CASE(FMULS_RND)
33774 NODE_NAME_CASE(FDIV_RND)
33775 NODE_NAME_CASE(FDIVS)
33776 NODE_NAME_CASE(FDIVS_RND)
33777 NODE_NAME_CASE(FSQRT_RND)
33778 NODE_NAME_CASE(FSQRTS)
33779 NODE_NAME_CASE(FSQRTS_RND)
33780 NODE_NAME_CASE(FGETEXP)
33781 NODE_NAME_CASE(FGETEXP_SAE)
33782 NODE_NAME_CASE(FGETEXPS)
33783 NODE_NAME_CASE(FGETEXPS_SAE)
33784 NODE_NAME_CASE(SCALEF)
33785 NODE_NAME_CASE(SCALEF_RND)
33786 NODE_NAME_CASE(SCALEFS)
33787 NODE_NAME_CASE(SCALEFS_RND)
33788 NODE_NAME_CASE(MULHRS)
33789 NODE_NAME_CASE(SINT_TO_FP_RND)
33790 NODE_NAME_CASE(UINT_TO_FP_RND)
33791 NODE_NAME_CASE(CVTTP2SI)
33792 NODE_NAME_CASE(CVTTP2UI)
33793 NODE_NAME_CASE(STRICT_CVTTP2SI)
33794 NODE_NAME_CASE(STRICT_CVTTP2UI)
33795 NODE_NAME_CASE(MCVTTP2SI)
33796 NODE_NAME_CASE(MCVTTP2UI)
33797 NODE_NAME_CASE(CVTTP2SI_SAE)
33798 NODE_NAME_CASE(CVTTP2UI_SAE)
33799 NODE_NAME_CASE(CVTTS2SI)
33800 NODE_NAME_CASE(CVTTS2UI)
33801 NODE_NAME_CASE(CVTTS2SI_SAE)
33802 NODE_NAME_CASE(CVTTS2UI_SAE)
33803 NODE_NAME_CASE(CVTSI2P)
33804 NODE_NAME_CASE(CVTUI2P)
33805 NODE_NAME_CASE(STRICT_CVTSI2P)
33806 NODE_NAME_CASE(STRICT_CVTUI2P)
33807 NODE_NAME_CASE(MCVTSI2P)
33808 NODE_NAME_CASE(MCVTUI2P)
33809 NODE_NAME_CASE(VFPCLASS)
33810 NODE_NAME_CASE(VFPCLASSS)
33811 NODE_NAME_CASE(MULTISHIFT)
33812 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
33813 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
33814 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
33815 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
33816 NODE_NAME_CASE(CVTPS2PH)
33817 NODE_NAME_CASE(STRICT_CVTPS2PH)
33818 NODE_NAME_CASE(CVTPS2PH_SAE)
33819 NODE_NAME_CASE(MCVTPS2PH)
33820 NODE_NAME_CASE(MCVTPS2PH_SAE)
33821 NODE_NAME_CASE(CVTPH2PS)
33822 NODE_NAME_CASE(STRICT_CVTPH2PS)
33823 NODE_NAME_CASE(CVTPH2PS_SAE)
33824 NODE_NAME_CASE(CVTP2SI)
33825 NODE_NAME_CASE(CVTP2UI)
33826 NODE_NAME_CASE(MCVTP2SI)
33827 NODE_NAME_CASE(MCVTP2UI)
33828 NODE_NAME_CASE(CVTP2SI_RND)
33829 NODE_NAME_CASE(CVTP2UI_RND)
33830 NODE_NAME_CASE(CVTS2SI)
33831 NODE_NAME_CASE(CVTS2UI)
33832 NODE_NAME_CASE(CVTS2SI_RND)
33833 NODE_NAME_CASE(CVTS2UI_RND)
33834 NODE_NAME_CASE(CVTNE2PS2BF16)
33835 NODE_NAME_CASE(CVTNEPS2BF16)
33836 NODE_NAME_CASE(MCVTNEPS2BF16)
33837 NODE_NAME_CASE(DPBF16PS)
33838 NODE_NAME_CASE(LWPINS)
33839 NODE_NAME_CASE(MGATHER)
33840 NODE_NAME_CASE(MSCATTER)
33841 NODE_NAME_CASE(VPDPBUSD)
33842 NODE_NAME_CASE(VPDPBUSDS)
33843 NODE_NAME_CASE(VPDPWSSD)
33844 NODE_NAME_CASE(VPDPWSSDS)
33845 NODE_NAME_CASE(VPSHUFBITQMB)
33846 NODE_NAME_CASE(GF2P8MULB)
33847 NODE_NAME_CASE(GF2P8AFFINEQB)
33848 NODE_NAME_CASE(GF2P8AFFINEINVQB)
33849 NODE_NAME_CASE(NT_CALL)
33850 NODE_NAME_CASE(NT_BRIND)
33851 NODE_NAME_CASE(UMWAIT)
33852 NODE_NAME_CASE(TPAUSE)
33853 NODE_NAME_CASE(ENQCMD)
33854 NODE_NAME_CASE(ENQCMDS)
33855 NODE_NAME_CASE(VP2INTERSECT)
33856 NODE_NAME_CASE(VPDPBSUD)
33857 NODE_NAME_CASE(VPDPBSUDS)
33858 NODE_NAME_CASE(VPDPBUUD)
33859 NODE_NAME_CASE(VPDPBUUDS)
33860 NODE_NAME_CASE(VPDPBSSD)
33861 NODE_NAME_CASE(VPDPBSSDS)
33862 NODE_NAME_CASE(AESENC128KL)
33863 NODE_NAME_CASE(AESDEC128KL)
33864 NODE_NAME_CASE(AESENC256KL)
33865 NODE_NAME_CASE(AESDEC256KL)
33866 NODE_NAME_CASE(AESENCWIDE128KL)
33867 NODE_NAME_CASE(AESDECWIDE128KL)
33868 NODE_NAME_CASE(AESENCWIDE256KL)
33869 NODE_NAME_CASE(AESDECWIDE256KL)
33870 NODE_NAME_CASE(CMPCCXADD)
33871 NODE_NAME_CASE(TESTUI)
33872 NODE_NAME_CASE(FP80_ADD)
33873 NODE_NAME_CASE(STRICT_FP80_ADD)
33874 }
33875 return nullptr;
33876#undef NODE_NAME_CASE
33877}
33878
33879/// Return true if the addressing mode represented by AM is legal for this
33880/// target, for a load/store of the specified type.
33881bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
33882 const AddrMode &AM, Type *Ty,
33883 unsigned AS,
33884 Instruction *I) const {
33885 // X86 supports extremely general addressing modes.
33886 CodeModel::Model M = TM.getCodeModel();
33887
33888 // X86 allows a sign-extended 32-bit immediate field as a displacement.
33889 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
33890 return false;
33891
33892 if (AM.BaseGV) {
33893 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
33894
33895 // If a reference to this global requires an extra load, we can't fold it.
33896 if (isGlobalStubReference(GVFlags))
33897 return false;
33898
33899 // If BaseGV requires a register for the PIC base, we cannot also have a
33900 // BaseReg specified.
33901 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
33902 return false;
33903
33904 // If lower 4G is not available, then we must use rip-relative addressing.
33905 if ((M != CodeModel::Small || isPositionIndependent()) &&
33906 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
33907 return false;
33908 }
33909
33910 switch (AM.Scale) {
33911 case 0:
33912 case 1:
33913 case 2:
33914 case 4:
33915 case 8:
33916 // These scales always work.
33917 break;
33918 case 3:
33919 case 5:
33920 case 9:
33921 // These scales are formed with basereg+scalereg. Only accept if there is
33922 // no basereg yet.
33923 if (AM.HasBaseReg)
33924 return false;
33925 break;
33926 default: // Other stuff never works.
33927 return false;
33928 }
33929
33930 return true;
33931}
33932
33933bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
33934 unsigned Bits = Ty->getScalarSizeInBits();
33935
33936 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
33937 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
33938 if (Subtarget.hasXOP() &&
33939 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
33940 return false;
33941
33942 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
33943 // shifts just as cheap as scalar ones.
33944 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
33945 return false;
33946
33947 // AVX512BW has shifts such as vpsllvw.
33948 if (Subtarget.hasBWI() && Bits == 16)
33949 return false;
33950
33951 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
33952 // fully general vector.
33953 return true;
33954}
33955
33956bool X86TargetLowering::isBinOp(unsigned Opcode) const {
33957 switch (Opcode) {
33958 // These are non-commutative binops.
33959 // TODO: Add more X86ISD opcodes once we have test coverage.
33960 case X86ISD::ANDNP:
33961 case X86ISD::PCMPGT:
33962 case X86ISD::FMAX:
33963 case X86ISD::FMIN:
33964 case X86ISD::FANDN:
33965 case X86ISD::VPSHA:
33966 case X86ISD::VPSHL:
33967 case X86ISD::VSHLV:
33968 case X86ISD::VSRLV:
33969 case X86ISD::VSRAV:
33970 return true;
33971 }
33972
33973 return TargetLoweringBase::isBinOp(Opcode);
33974}
33975
33976bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
33977 switch (Opcode) {
33978 // TODO: Add more X86ISD opcodes once we have test coverage.
33979 case X86ISD::PCMPEQ:
33980 case X86ISD::PMULDQ:
33981 case X86ISD::PMULUDQ:
33982 case X86ISD::FMAXC:
33983 case X86ISD::FMINC:
33984 case X86ISD::FAND:
33985 case X86ISD::FOR:
33986 case X86ISD::FXOR:
33987 return true;
33988 }
33989
33990 return TargetLoweringBase::isCommutativeBinOp(Opcode);
33991}
33992
33993bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
33994 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33995 return false;
33996 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
33997 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
33998 return NumBits1 > NumBits2;
33999}
34000
34001bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
34002 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
34003 return false;
34004
34005 if (!isTypeLegal(EVT::getEVT(Ty1)))
34006 return false;
34007
34008 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
34009
34010 // Assuming the caller doesn't have a zeroext or signext return parameter,
34011 // truncation all the way down to i1 is valid.
34012 return true;
34013}
34014
34015bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
34016 return isInt<32>(Imm);
34017}
34018
34019bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
34020 // Can also use sub to handle negated immediates.
34021 return isInt<32>(Imm);
34022}
34023
34024bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
34025 return isInt<32>(Imm);
34026}
34027
34028bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
34029 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
34030 return false;
34031 unsigned NumBits1 = VT1.getSizeInBits();
34032 unsigned NumBits2 = VT2.getSizeInBits();
34033 return NumBits1 > NumBits2;
34034}
34035
34036bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
34037 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34038 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
34039}
34040
34041bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
34042 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34043 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
34044}
34045
34046bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
34047 EVT VT1 = Val.getValueType();
34048 if (isZExtFree(VT1, VT2))
34049 return true;
34050
34051 if (Val.getOpcode() != ISD::LOAD)
34052 return false;
34053
34054 if (!VT1.isSimple() || !VT1.isInteger() ||
34055 !VT2.isSimple() || !VT2.isInteger())
34056 return false;
34057
34058 switch (VT1.getSimpleVT().SimpleTy) {
34059 default: break;
34060 case MVT::i8:
34061 case MVT::i16:
34062 case MVT::i32:
34063 // X86 has 8, 16, and 32-bit zero-extending loads.
34064 return true;
34065 }
34066
34067 return false;
34068}
34069
34070bool X86TargetLowering::shouldSinkOperands(Instruction *I,
34071 SmallVectorImpl<Use *> &Ops) const {
34072 using namespace llvm::PatternMatch;
34073
34074 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
34075 if (!VTy)
34076 return false;
34077
34078 if (I->getOpcode() == Instruction::Mul &&
34079 VTy->getElementType()->isIntegerTy(64)) {
34080 for (auto &Op : I->operands()) {
34081 // Make sure we are not already sinking this operand
34082 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
34083 continue;
34084
34085 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
34086 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
34087 if (Subtarget.hasSSE41() &&
34088 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
34089 m_SpecificInt(32)))) {
34090 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
34091 Ops.push_back(&Op);
34092 } else if (Subtarget.hasSSE2() &&
34093 match(Op.get(),
34094 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
34095 Ops.push_back(&Op);
34096 }
34097 }
34098
34099 return !Ops.empty();
34100 }
34101
34102 // A uniform shift amount in a vector shift or funnel shift may be much
34103 // cheaper than a generic variable vector shift, so make that pattern visible
34104 // to SDAG by sinking the shuffle instruction next to the shift.
34105 int ShiftAmountOpNum = -1;
34106 if (I->isShift())
34107 ShiftAmountOpNum = 1;
34108 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
34109 if (II->getIntrinsicID() == Intrinsic::fshl ||
34110 II->getIntrinsicID() == Intrinsic::fshr)
34111 ShiftAmountOpNum = 2;
34112 }
34113
34114 if (ShiftAmountOpNum == -1)
34115 return false;
34116
34117 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
34118 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
34119 isVectorShiftByScalarCheap(I->getType())) {
34120 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
34121 return true;
34122 }
34123
34124 return false;
34125}
34126
34127bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
34128 if (!Subtarget.is64Bit())
34129 return false;
34130 return TargetLoweringBase::shouldConvertPhiType(From, To);
34131}
34132
34133bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
34134 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
34135 return false;
34136
34137 EVT SrcVT = ExtVal.getOperand(0).getValueType();
34138
34139 // There is no extending load for vXi1.
34140 if (SrcVT.getScalarType() == MVT::i1)
34141 return false;
34142
34143 return true;
34144}
34145
34146bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
34147 EVT VT) const {
34148 if (!Subtarget.hasAnyFMA())
34149 return false;
34150
34151 VT = VT.getScalarType();
34152
34153 if (!VT.isSimple())
34154 return false;
34155
34156 switch (VT.getSimpleVT().SimpleTy) {
34157 case MVT::f16:
34158 return Subtarget.hasFP16();
34159 case MVT::f32:
34160 case MVT::f64:
34161 return true;
34162 default:
34163 break;
34164 }
34165
34166 return false;
34167}
34168
34169bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
34170 // i16 instructions are longer (0x66 prefix) and potentially slower.
34171 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
34172}
34173
34175 EVT VT) const {
34176 // TODO: This is too general. There are cases where pre-AVX512 codegen would
34177 // benefit. The transform may also be profitable for scalar code.
34178 if (!Subtarget.hasAVX512())
34179 return false;
34180 if (!Subtarget.hasVLX() && !VT.is512BitVector())
34181 return false;
34182 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
34183 return false;
34184
34185 return true;
34186}
34187
34188/// Targets can use this to indicate that they only support *some*
34189/// VECTOR_SHUFFLE operations, those with specific masks.
34190/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
34191/// are assumed to be legal.
34192bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
34193 if (!VT.isSimple())
34194 return false;
34195
34196 // Not for i1 vectors
34197 if (VT.getSimpleVT().getScalarType() == MVT::i1)
34198 return false;
34199
34200 // Very little shuffling can be done for 64-bit vectors right now.
34201 if (VT.getSimpleVT().getSizeInBits() == 64)
34202 return false;
34203
34204 // We only care that the types being shuffled are legal. The lowering can
34205 // handle any possible shuffle mask that results.
34206 return isTypeLegal(VT.getSimpleVT());
34207}
34208
34209bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
34210 EVT VT) const {
34211 // Don't convert an 'and' into a shuffle that we don't directly support.
34212 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
34213 if (!Subtarget.hasAVX2())
34214 if (VT == MVT::v32i8 || VT == MVT::v16i16)
34215 return false;
34216
34217 // Just delegate to the generic legality, clear masks aren't special.
34218 return isShuffleMaskLegal(Mask, VT);
34219}
34220
34221bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
34222 // If the subtarget is using thunks, we need to not generate jump tables.
34223 if (Subtarget.useIndirectThunkBranches())
34224 return false;
34225
34226 // Otherwise, fallback on the generic logic.
34227 return TargetLowering::areJTsAllowed(Fn);
34228}
34229
34230MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
34231 EVT ConditionVT) const {
34232 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
34233 // zero-extensions.
34234 if (ConditionVT.getSizeInBits() < 32)
34235 return MVT::i32;
34236 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
34237 ConditionVT);
34238}
34239
34240//===----------------------------------------------------------------------===//
34241// X86 Scheduler Hooks
34242//===----------------------------------------------------------------------===//
34243
34244 // Returns true if EFLAGS is consumed after this iterator in the rest of the
34245// basic block or any successors of the basic block.
34246static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
34247 MachineBasicBlock *BB) {
34248 // Scan forward through BB for a use/def of EFLAGS.
34249 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
34250 if (mi.readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
34251 return true;
34252 // If we found a def, we can stop searching.
34253 if (mi.definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
34254 return false;
34255 }
34256
34257 // If we hit the end of the block, check whether EFLAGS is live into a
34258 // successor.
34259 for (MachineBasicBlock *Succ : BB->successors())
34260 if (Succ->isLiveIn(X86::EFLAGS))
34261 return true;
34262
34263 return false;
34264}
34265
34266/// Utility function to emit xbegin specifying the start of an RTM region.
34267static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
34268 const TargetInstrInfo *TII) {
34269 const MIMetadata MIMD(MI);
34270
34271 const BasicBlock *BB = MBB->getBasicBlock();
34272 MachineFunction::iterator I = ++MBB->getIterator();
34273
34274 // For the v = xbegin(), we generate
34275 //
34276 // thisMBB:
34277 // xbegin sinkMBB
34278 //
34279 // mainMBB:
34280 // s0 = -1
34281 //
34282 // fallBB:
34283 // eax = # XABORT_DEF
34284 // s1 = eax
34285 //
34286 // sinkMBB:
34287 // v = phi(s0/mainBB, s1/fallBB)
34288
34289 MachineBasicBlock *thisMBB = MBB;
34290 MachineFunction *MF = MBB->getParent();
34291 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
34292 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
34293 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
34294 MF->insert(I, mainMBB);
34295 MF->insert(I, fallMBB);
34296 MF->insert(I, sinkMBB);
34297
34298 if (isEFLAGSLiveAfter(MI, MBB)) {
34299 mainMBB->addLiveIn(X86::EFLAGS);
34300 fallMBB->addLiveIn(X86::EFLAGS);
34301 sinkMBB->addLiveIn(X86::EFLAGS);
34302 }
34303
34304 // Transfer the remainder of BB and its successor edges to sinkMBB.
34305 sinkMBB->splice(sinkMBB->begin(), MBB,
34306 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34307 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
34308
34309 MachineRegisterInfo &MRI = MF->getRegInfo();
34310 Register DstReg = MI.getOperand(0).getReg();
34311 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
34312 Register mainDstReg = MRI.createVirtualRegister(RC);
34313 Register fallDstReg = MRI.createVirtualRegister(RC);
34314
34315 // thisMBB:
34316 // xbegin fallMBB
34317 // # fallthrough to mainMBB
34318 // # abort to fallMBB
34319 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
34320 thisMBB->addSuccessor(mainMBB);
34321 thisMBB->addSuccessor(fallMBB);
34322
34323 // mainMBB:
34324 // mainDstReg := -1
34325 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
34326 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
34327 mainMBB->addSuccessor(sinkMBB);
34328
34329 // fallMBB:
34330 // ; pseudo instruction to model hardware's definition from XABORT
34331 // EAX := XABORT_DEF
34332 // fallDstReg := EAX
34333 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
34334 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
34335 .addReg(X86::EAX);
34336 fallMBB->addSuccessor(sinkMBB);
34337
34338 // sinkMBB:
34339 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
34340 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
34341 .addReg(mainDstReg).addMBB(mainMBB)
34342 .addReg(fallDstReg).addMBB(fallMBB);
34343
34344 MI.eraseFromParent();
34345 return sinkMBB;
34346}
34347
34348MachineBasicBlock *
34349X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
34350 MachineBasicBlock *MBB) const {
34351 // Emit va_arg instruction on X86-64.
34352
34353 // Operands to this pseudo-instruction:
34354 // 0 ) Output : destination address (reg)
34355 // 1-5) Input : va_list address (addr, i64mem)
34356 // 6 ) ArgSize : Size (in bytes) of vararg type
34357 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
34358 // 8 ) Align : Alignment of type
34359 // 9 ) EFLAGS (implicit-def)
34360
34361 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
34362 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
34363
34364 Register DestReg = MI.getOperand(0).getReg();
34365 MachineOperand &Base = MI.getOperand(1);
34366 MachineOperand &Scale = MI.getOperand(2);
34367 MachineOperand &Index = MI.getOperand(3);
34368 MachineOperand &Disp = MI.getOperand(4);
34369 MachineOperand &Segment = MI.getOperand(5);
34370 unsigned ArgSize = MI.getOperand(6).getImm();
34371 unsigned ArgMode = MI.getOperand(7).getImm();
34372 Align Alignment = Align(MI.getOperand(8).getImm());
34373
34374 MachineFunction *MF = MBB->getParent();
34375
34376 // Memory Reference
34377 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
34378
34379 MachineMemOperand *OldMMO = MI.memoperands().front();
34380
34381 // Clone the MMO into two separate MMOs for loading and storing
34382 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
34383 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
34384 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
34385 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
34386
34387 // Machine Information
34388 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34389 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
34390 const TargetRegisterClass *AddrRegClass =
34391 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
34392 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
34393 const MIMetadata MIMD(MI);
34394
34395 // struct va_list {
34396 // i32 gp_offset
34397 // i32 fp_offset
34398 // i64 overflow_area (address)
34399 // i64 reg_save_area (address)
34400 // }
34401 // sizeof(va_list) = 24
34402 // alignment(va_list) = 8
34403
34404 unsigned TotalNumIntRegs = 6;
34405 unsigned TotalNumXMMRegs = 8;
34406 bool UseGPOffset = (ArgMode == 1);
34407 bool UseFPOffset = (ArgMode == 2);
34408 unsigned MaxOffset = TotalNumIntRegs * 8 +
34409 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
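// The x86-64 SysV register save area holds the six integer argument registers
// (8 bytes each) followed by the eight XMM registers (16 bytes each), so
// gp_offset can reach 48 and fp_offset 176 before arguments have to come
// from the overflow area.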
34410
34411 /* Align ArgSize to a multiple of 8 */
34412 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
34413 bool NeedsAlign = (Alignment > 8);
34414
34415 MachineBasicBlock *thisMBB = MBB;
34416 MachineBasicBlock *overflowMBB;
34417 MachineBasicBlock *offsetMBB;
34418 MachineBasicBlock *endMBB;
34419
34420 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
34421 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
34422 unsigned OffsetReg = 0;
34423
34424 if (!UseGPOffset && !UseFPOffset) {
34425 // If we only pull from the overflow region, we don't create a branch.
34426 // We don't need to alter control flow.
34427 OffsetDestReg = 0; // unused
34428 OverflowDestReg = DestReg;
34429
34430 offsetMBB = nullptr;
34431 overflowMBB = thisMBB;
34432 endMBB = thisMBB;
34433 } else {
34434 // First emit code to check if gp_offset (or fp_offset) is below the bound.
34435 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
34436 // If not, pull from overflow_area. (branch to overflowMBB)
34437 //
34438 // thisMBB
34439 // | .
34440 // | .
34441 // offsetMBB overflowMBB
34442 // | .
34443 // | .
34444 // endMBB
34445
34446 // Registers for the PHI in endMBB
34447 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
34448 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
34449
34450 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
34451 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34452 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34453 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34454
34455 MachineFunction::iterator MBBIter = ++MBB->getIterator();
34456
34457 // Insert the new basic blocks
34458 MF->insert(MBBIter, offsetMBB);
34459 MF->insert(MBBIter, overflowMBB);
34460 MF->insert(MBBIter, endMBB);
34461
34462 // Transfer the remainder of MBB and its successor edges to endMBB.
34463 endMBB->splice(endMBB->begin(), thisMBB,
34464 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
34465 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
34466
34467 // Make offsetMBB and overflowMBB successors of thisMBB
34468 thisMBB->addSuccessor(offsetMBB);
34469 thisMBB->addSuccessor(overflowMBB);
34470
34471 // endMBB is a successor of both offsetMBB and overflowMBB
34472 offsetMBB->addSuccessor(endMBB);
34473 overflowMBB->addSuccessor(endMBB);
34474
34475 // Load the offset value into a register
34476 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34477 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
34478 .add(Base)
34479 .add(Scale)
34480 .add(Index)
34481 .addDisp(Disp, UseFPOffset ? 4 : 0)
34482 .add(Segment)
34483 .setMemRefs(LoadOnlyMMO);
34484
34485 // Check if there is enough room left to pull this argument.
34486 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
34487 .addReg(OffsetReg)
34488 .addImm(MaxOffset + 8 - ArgSizeA8);
34489
34490 // Branch to "overflowMBB" if offset >= max
34491 // Fall through to "offsetMBB" otherwise
34492 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
34493 .addMBB(overflowMBB).addImm(X86::COND_AE);
34494 }
34495
34496 // In offsetMBB, emit code to use the reg_save_area.
34497 if (offsetMBB) {
34498 assert(OffsetReg != 0);
34499
34500 // Read the reg_save_area address.
34501 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
34502 BuildMI(
34503 offsetMBB, MIMD,
34504 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34505 RegSaveReg)
34506 .add(Base)
34507 .add(Scale)
34508 .add(Index)
34509 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
34510 .add(Segment)
34511 .setMemRefs(LoadOnlyMMO);
34512
34513 if (Subtarget.isTarget64BitLP64()) {
34514 // Zero-extend the offset
34515 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
34516 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
34517 .addImm(0)
34518 .addReg(OffsetReg)
34519 .addImm(X86::sub_32bit);
34520
34521 // Add the offset to the reg_save_area to get the final address.
34522 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
34523 .addReg(OffsetReg64)
34524 .addReg(RegSaveReg);
34525 } else {
34526 // Add the offset to the reg_save_area to get the final address.
34527 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
34528 .addReg(OffsetReg)
34529 .addReg(RegSaveReg);
34530 }
34531
34532 // Compute the offset for the next argument
34533 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34534 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
34535 .addReg(OffsetReg)
34536 .addImm(UseFPOffset ? 16 : 8);
34537
34538 // Store it back into the va_list.
34539 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
34540 .add(Base)
34541 .add(Scale)
34542 .add(Index)
34543 .addDisp(Disp, UseFPOffset ? 4 : 0)
34544 .add(Segment)
34545 .addReg(NextOffsetReg)
34546 .setMemRefs(StoreOnlyMMO);
34547
34548 // Jump to endMBB
34549 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
34550 .addMBB(endMBB);
34551 }
34552
34553 //
34554 // Emit code to use overflow area
34555 //
34556
34557 // Load the overflow_area address into a register.
34558 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
34559 BuildMI(overflowMBB, MIMD,
34560 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34561 OverflowAddrReg)
34562 .add(Base)
34563 .add(Scale)
34564 .add(Index)
34565 .addDisp(Disp, 8)
34566 .add(Segment)
34567 .setMemRefs(LoadOnlyMMO);
34568
34569 // If we need to align it, do so. Otherwise, just copy the address
34570 // to OverflowDestReg.
34571 if (NeedsAlign) {
34572 // Align the overflow address
34573 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
34574
34575 // aligned_addr = (addr + (align-1)) & ~(align-1)
34576 BuildMI(
34577 overflowMBB, MIMD,
34578 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34579 TmpReg)
34580 .addReg(OverflowAddrReg)
34581 .addImm(Alignment.value() - 1);
34582
34583 BuildMI(
34584 overflowMBB, MIMD,
34585 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
34586 OverflowDestReg)
34587 .addReg(TmpReg)
34588 .addImm(~(uint64_t)(Alignment.value() - 1));
34589 } else {
34590 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
34591 .addReg(OverflowAddrReg);
34592 }
34593
34594 // Compute the next overflow address after this argument.
34595 // (the overflow address should be kept 8-byte aligned)
34596 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
34597 BuildMI(
34598 overflowMBB, MIMD,
34599 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34600 NextAddrReg)
34601 .addReg(OverflowDestReg)
34602 .addImm(ArgSizeA8);
34603
34604 // Store the new overflow address.
34605 BuildMI(overflowMBB, MIMD,
34606 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
34607 .add(Base)
34608 .add(Scale)
34609 .add(Index)
34610 .addDisp(Disp, 8)
34611 .add(Segment)
34612 .addReg(NextAddrReg)
34613 .setMemRefs(StoreOnlyMMO);
34614
34615 // If we branched, emit the PHI to the front of endMBB.
34616 if (offsetMBB) {
34617 BuildMI(*endMBB, endMBB->begin(), MIMD,
34618 TII->get(X86::PHI), DestReg)
34619 .addReg(OffsetDestReg).addMBB(offsetMBB)
34620 .addReg(OverflowDestReg).addMBB(overflowMBB);
34621 }
34622
34623 // Erase the pseudo instruction
34624 MI.eraseFromParent();
34625
34626 return endMBB;
34627}
34628
34629// The EFLAGS operand of SelectItr might be missing a kill marker
34630// because there were multiple uses of EFLAGS, and ISel didn't know
34631// which to mark. Figure out whether SelectItr should have had a
34632// kill marker, and set it if it should. Returns the correct kill
34633// marker value.
34634static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
34635 MachineBasicBlock* BB,
34636 const TargetRegisterInfo* TRI) {
34637 if (isEFLAGSLiveAfter(SelectItr, BB))
34638 return false;
34639
34640 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
34641 // out. SelectMI should have a kill flag on EFLAGS.
34642 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
34643 return true;
34644}
34645
34646// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
34647// together with other CMOV pseudo-opcodes into a single basic-block with
34648// conditional jump around it.
34649static bool isCMOVPseudo(MachineInstr &MI) {
34650 switch (MI.getOpcode()) {
34651 case X86::CMOV_FR16:
34652 case X86::CMOV_FR16X:
34653 case X86::CMOV_FR32:
34654 case X86::CMOV_FR32X:
34655 case X86::CMOV_FR64:
34656 case X86::CMOV_FR64X:
34657 case X86::CMOV_GR8:
34658 case X86::CMOV_GR16:
34659 case X86::CMOV_GR32:
34660 case X86::CMOV_RFP32:
34661 case X86::CMOV_RFP64:
34662 case X86::CMOV_RFP80:
34663 case X86::CMOV_VR64:
34664 case X86::CMOV_VR128:
34665 case X86::CMOV_VR128X:
34666 case X86::CMOV_VR256:
34667 case X86::CMOV_VR256X:
34668 case X86::CMOV_VR512:
34669 case X86::CMOV_VK1:
34670 case X86::CMOV_VK2:
34671 case X86::CMOV_VK4:
34672 case X86::CMOV_VK8:
34673 case X86::CMOV_VK16:
34674 case X86::CMOV_VK32:
34675 case X86::CMOV_VK64:
34676 return true;
34677
34678 default:
34679 return false;
34680 }
34681}
34682
34683// Helper function, which inserts PHI functions into SinkMBB:
34684// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
34685// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
34686// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
34687// the last PHI function inserted.
34688static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
34689 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
34690 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
34691 MachineBasicBlock *SinkMBB) {
34692 MachineFunction *MF = TrueMBB->getParent();
34693 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
34694 const MIMetadata MIMD(*MIItBegin);
34695
34696 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
34697 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34698
34699 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
34700
34701 // As we are creating the PHIs, we have to be careful if there is more than
34702 // one. Later CMOVs may reference the results of earlier CMOVs, but later
34703 // PHIs have to reference the individual true/false inputs from earlier PHIs.
34704 // That also means that PHI construction must work forward from earlier to
34705 // later, and that the code must maintain a mapping from earlier PHI's
34706 // destination registers, and the registers that went into the PHI.
34707 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
34708 MachineInstrBuilder MIB;
34709
34710 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
34711 Register DestReg = MIIt->getOperand(0).getReg();
34712 Register Op1Reg = MIIt->getOperand(1).getReg();
34713 Register Op2Reg = MIIt->getOperand(2).getReg();
34714
34715 // If this CMOV we are generating is the opposite condition from
34716 // the jump we generated, then we have to swap the operands for the
34717 // PHI that is going to be generated.
34718 if (MIIt->getOperand(3).getImm() == OppCC)
34719 std::swap(Op1Reg, Op2Reg);
34720
34721 if (RegRewriteTable.contains(Op1Reg))
34722 Op1Reg = RegRewriteTable[Op1Reg].first;
34723
34724 if (RegRewriteTable.contains(Op2Reg))
34725 Op2Reg = RegRewriteTable[Op2Reg].second;
34726
34727 MIB =
34728 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
34729 .addReg(Op1Reg)
34730 .addMBB(FalseMBB)
34731 .addReg(Op2Reg)
34732 .addMBB(TrueMBB);
34733
34734 // Add this PHI to the rewrite table.
34735 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
34736 }
34737
34738 return MIB;
34739}
34740
34741// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
34742MachineBasicBlock *
34743X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
34744 MachineInstr &SecondCascadedCMOV,
34745 MachineBasicBlock *ThisMBB) const {
34746 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34747 const MIMetadata MIMD(FirstCMOV);
34748
34749 // We lower cascaded CMOVs such as
34750 //
34751 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
34752 //
34753 // to two successive branches.
34754 //
34755 // Without this, we would add a PHI between the two jumps, which ends up
34756 // creating a few copies all around. For instance, for
34757 //
34758 // (sitofp (zext (fcmp une)))
34759 //
34760 // we would generate:
34761 //
34762 // ucomiss %xmm1, %xmm0
34763 // movss <1.0f>, %xmm0
34764 // movaps %xmm0, %xmm1
34765 // jne .LBB5_2
34766 // xorps %xmm1, %xmm1
34767 // .LBB5_2:
34768 // jp .LBB5_4
34769 // movaps %xmm1, %xmm0
34770 // .LBB5_4:
34771 // retq
34772 //
34773 // because this custom-inserter would have generated:
34774 //
34775 // A
34776 // | \
34777 // | B
34778 // | /
34779 // C
34780 // | \
34781 // | D
34782 // | /
34783 // E
34784 //
34785 // A: X = ...; Y = ...
34786 // B: empty
34787 // C: Z = PHI [X, A], [Y, B]
34788 // D: empty
34789 // E: PHI [X, C], [Z, D]
34790 //
34791 // If we lower both CMOVs in a single step, we can instead generate:
34792 //
34793 // A
34794 // | \
34795 // | C
34796 // | /|
34797 // |/ |
34798 // | |
34799 // | D
34800 // | /
34801 // E
34802 //
34803 // A: X = ...; Y = ...
34804 // D: empty
34805 // E: PHI [X, A], [X, C], [Y, D]
34806 //
34807 // Which, in our sitofp/fcmp example, gives us something like:
34808 //
34809 // ucomiss %xmm1, %xmm0
34810 // movss <1.0f>, %xmm0
34811 // jne .LBB5_4
34812 // jp .LBB5_4
34813 // xorps %xmm0, %xmm0
34814 // .LBB5_4:
34815 // retq
34816 //
34817
34818 // We lower cascaded CMOV into two successive branches to the same block.
34819 // EFLAGS is used by both, so mark it as live in the second.
34820 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34821 MachineFunction *F = ThisMBB->getParent();
34822 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34823 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34824 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34825
34826 MachineFunction::iterator It = ++ThisMBB->getIterator();
34827 F->insert(It, FirstInsertedMBB);
34828 F->insert(It, SecondInsertedMBB);
34829 F->insert(It, SinkMBB);
34830
34831 // For a cascaded CMOV, we lower it to two successive branches to
34832 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
34833 // the FirstInsertedMBB.
34834 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
34835
34836 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34837 // live into the sink and copy blocks.
34838 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34839 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
34840 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
34841 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
34842 SinkMBB->addLiveIn(X86::EFLAGS);
34843 }
34844
34845 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34846 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
34847 std::next(MachineBasicBlock::iterator(FirstCMOV)),
34848 ThisMBB->end());
34849 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34850
34851 // Fallthrough block for ThisMBB.
34852 ThisMBB->addSuccessor(FirstInsertedMBB);
34853 // The true block target of the first branch is always SinkMBB.
34854 ThisMBB->addSuccessor(SinkMBB);
34855 // Fallthrough block for FirstInsertedMBB.
34856 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
34857 // The true block for the branch of FirstInsertedMBB.
34858 FirstInsertedMBB->addSuccessor(SinkMBB);
34859 // This is fallthrough.
34860 SecondInsertedMBB->addSuccessor(SinkMBB);
34861
34862 // Create the conditional branch instructions.
34863 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
34864 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
34865
34866 X86::CondCode SecondCC =
34867 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
34868 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
34869 .addMBB(SinkMBB)
34870 .addImm(SecondCC);
34871
34872 // SinkMBB:
34873 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
34874 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
34875 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
34876 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
  MachineInstrBuilder MIB =
34878      BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
34879 .addReg(Op1Reg)
34880 .addMBB(SecondInsertedMBB)
34881 .addReg(Op2Reg)
34882 .addMBB(ThisMBB);
34883
34884  // The edge from FirstInsertedMBB provides the same incoming value as the
34885  // edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
34886 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
34887
34888 // Now remove the CMOVs.
34889 FirstCMOV.eraseFromParent();
34890 SecondCascadedCMOV.eraseFromParent();
34891
34892 return SinkMBB;
34893}
34894
MachineBasicBlock *
34896 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
34897 MachineBasicBlock *ThisMBB) const {
34898 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34899 const MIMetadata MIMD(MI);
34900
34901 // To "insert" a SELECT_CC instruction, we actually have to insert the
34902 // diamond control-flow pattern. The incoming instruction knows the
34903 // destination vreg to set, the condition code register to branch on, the
34904 // true/false values to select between and a branch opcode to use.
34905
34906 // ThisMBB:
34907 // ...
34908 // TrueVal = ...
34909 // cmpTY ccX, r1, r2
34910 // bCC copy1MBB
34911 // fallthrough --> FalseMBB
34912
34913 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
34914 // as described above, by inserting a BB, and then making a PHI at the join
34915 // point to select the true and false operands of the CMOV in the PHI.
34916 //
34917 // The code also handles two different cases of multiple CMOV opcodes
34918 // in a row.
34919 //
34920 // Case 1:
34921 // In this case, there are multiple CMOVs in a row, all of which are based on
34922 // the same condition setting (or the exact opposite condition setting).
34923 // In this case we can lower all the CMOVs using a single inserted BB, and
34924 // then make a number of PHIs at the join point to model the CMOVs. The only
34925 // trickiness here is that in a case like:
34926 //
34927 // t2 = CMOV cond1 t1, f1
34928 // t3 = CMOV cond1 t2, f2
34929 //
34930 // when rewriting this into PHIs, we have to perform some renaming on the
34931 // temps since you cannot have a PHI operand refer to a PHI result earlier
34932 // in the same block. The "simple" but wrong lowering would be:
34933 //
34934 // t2 = PHI t1(BB1), f1(BB2)
34935 // t3 = PHI t2(BB1), f2(BB2)
34936 //
34937 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
34938 // renaming is to note that on the path through BB1, t2 is really just a
34939 // copy of t1, and do that renaming, properly generating:
34940 //
34941 // t2 = PHI t1(BB1), f1(BB2)
34942 // t3 = PHI t1(BB1), f2(BB2)
34943 //
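  // For illustration, this renaming is what RegRewriteTable in
  // createPHIsForCMOVsInSinkBB implements: after the first PHI is emitted it
  // records RegRewriteTable[t2] = {t1, f1}, and any later PHI input of t2 is
  // replaced by the value t2 had on that incoming edge, giving the corrected
  // t3 above.
  //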
34944 // Case 2:
34945 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
34946 // function - EmitLoweredCascadedSelect.
34947
34948   X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34950   MachineInstr *LastCMOV = &MI;
  MachineBasicBlock::iterator NextMIIt =
      next_nodbg(MachineBasicBlock::iterator(MI), ThisMBB->end());
34952
34953 // Check for case 1, where there are multiple CMOVs with the same condition
34954 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
34955 // number of jumps the most.
34956
34957 if (isCMOVPseudo(MI)) {
34958 // See if we have a string of CMOVS with the same condition. Skip over
34959 // intervening debug insts.
34960 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
34961 (NextMIIt->getOperand(3).getImm() == CC ||
34962 NextMIIt->getOperand(3).getImm() == OppCC)) {
34963 LastCMOV = &*NextMIIt;
34964 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
34965 }
34966 }
34967
34968  // Check for case 2, but only if we didn't already find case 1, as
34969  // indicated by LastCMOV still pointing at MI.
34970 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
34971 NextMIIt->getOpcode() == MI.getOpcode() &&
34972 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
34973 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
34974 NextMIIt->getOperand(1).isKill()) {
34975 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
34976 }
34977
34978 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34979 MachineFunction *F = ThisMBB->getParent();
34980 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
34981 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34982
34983 MachineFunction::iterator It = ++ThisMBB->getIterator();
34984 F->insert(It, FalseMBB);
34985 F->insert(It, SinkMBB);
34986
34987 // Set the call frame size on entry to the new basic blocks.
34988 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
34989 FalseMBB->setCallFrameSize(CallFrameSize);
34990 SinkMBB->setCallFrameSize(CallFrameSize);
34991
34992 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34993 // live into the sink and copy blocks.
34994 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34995 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
34996 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
34997 FalseMBB->addLiveIn(X86::EFLAGS);
34998 SinkMBB->addLiveIn(X86::EFLAGS);
34999 }
35000
35001 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
  auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
35003                                    MachineBasicBlock::iterator(LastCMOV));
35004 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
35005 if (MI.isDebugInstr())
35006 SinkMBB->push_back(MI.removeFromParent());
35007
35008 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
35009 SinkMBB->splice(SinkMBB->end(), ThisMBB,
35010 std::next(MachineBasicBlock::iterator(LastCMOV)),
35011 ThisMBB->end());
35012 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35013
35014 // Fallthrough block for ThisMBB.
35015 ThisMBB->addSuccessor(FalseMBB);
35016  // The true block target of the first (or only) branch is always SinkMBB.
35017 ThisMBB->addSuccessor(SinkMBB);
35018 // Fallthrough block for FalseMBB.
35019 FalseMBB->addSuccessor(SinkMBB);
35020
35021 // Create the conditional branch instruction.
35022 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
35023
35024 // SinkMBB:
35025 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
35026 // ...
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
35029       std::next(MachineBasicBlock::iterator(LastCMOV));
35030 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
35031
35032 // Now remove the CMOV(s).
35033 ThisMBB->erase(MIItBegin, MIItEnd);
35034
35035 return SinkMBB;
35036}
35037
35038static unsigned getSUBriOpcode(bool IsLP64) {
35039 if (IsLP64)
35040 return X86::SUB64ri32;
35041 else
35042 return X86::SUB32ri;
35043}
35044
MachineBasicBlock *
35046 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
35047 MachineBasicBlock *MBB) const {
35048 MachineFunction *MF = MBB->getParent();
35049 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35050 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
35051 const MIMetadata MIMD(MI);
35052 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35053
35054 const unsigned ProbeSize = getStackProbeSize(*MF);
35055
  MachineRegisterInfo &MRI = MF->getRegInfo();
35057  MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35058 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35059 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35060
  MachineFunction::iterator MBBIter = ++MBB->getIterator();
35062  MF->insert(MBBIter, testMBB);
35063 MF->insert(MBBIter, blockMBB);
35064 MF->insert(MBBIter, tailMBB);
35065
35066 Register sizeVReg = MI.getOperand(1).getReg();
35067
35068 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
35069
35070 Register TmpStackPtr = MRI.createVirtualRegister(
35071 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
35072 Register FinalStackPtr = MRI.createVirtualRegister(
35073 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
35074
35075 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
35076 .addReg(physSPReg);
35077 {
35078 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
35079 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
35080 .addReg(TmpStackPtr)
35081 .addReg(sizeVReg);
35082 }
35083
35084 // test rsp size
35085
35086 BuildMI(testMBB, MIMD,
35087 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
35088 .addReg(FinalStackPtr)
35089 .addReg(physSPReg);
35090
35091 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
35092       .addMBB(tailMBB)
      .addImm(X86::COND_GE);
35094 testMBB->addSuccessor(blockMBB);
35095 testMBB->addSuccessor(tailMBB);
35096
35097  // Touch the block, then extend it. This is the opposite order from a static
35098  // probe, where we allocate and then touch; doing it this way avoids having to
35099  // probe the tail of the static alloca. Possible scenarios are:
35100 //
35101 // + ---- <- ------------ <- ------------- <- ------------ +
35102 // | |
35103 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
35104 // | |
35105 // + <- ----------- <- ------------ <- ----------- <- ------------ +
35106 //
35107 // The property we want to enforce is to never have more than [page alloc] between two probes.
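  //
  // As a rough sketch (register names illustrative, assuming a 4096-byte probe
  // interval), the blocks built below amount to:
  //
  //     sub    final, sp, size        # in the original block
  //   testMBB:
  //     cmp    final, sp
  //     jcc    tailMBB                # exit once sp has reached final
  //   blockMBB:
  //     xor    $0, (sp)               # touch the current page
  //     sub    $4096, sp
  //     jmp    testMBB
  //   tailMBB:
  //     copy   result, final          # the expected stack pointer value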
35108
35109 const unsigned XORMIOpc =
35110 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
35111 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
35112 .addImm(0);
35113
35114 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
35115 physSPReg)
35116 .addReg(physSPReg)
35117 .addImm(ProbeSize);
35118
35119 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
35120 blockMBB->addSuccessor(testMBB);
35121
35122 // Replace original instruction by the expected stack ptr
35123 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
35124 MI.getOperand(0).getReg())
35125 .addReg(FinalStackPtr);
35126
35127 tailMBB->splice(tailMBB->end(), MBB,
35128                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
35130 MBB->addSuccessor(testMBB);
35131
35132 // Delete the original pseudo instruction.
35133 MI.eraseFromParent();
35134
35135 // And we're done.
35136 return tailMBB;
35137}
35138
35140X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
35141 MachineBasicBlock *BB) const {
35142 MachineFunction *MF = BB->getParent();
35143 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35144 const MIMetadata MIMD(MI);
35145 const BasicBlock *LLVM_BB = BB->getBasicBlock();
35146
35147 assert(MF->shouldSplitStack());
35148
35149 const bool Is64Bit = Subtarget.is64Bit();
35150 const bool IsLP64 = Subtarget.isTarget64BitLP64();
35151
35152 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
35153 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
35154
35155 // BB:
35156 // ... [Till the alloca]
35157 // If stacklet is not large enough, jump to mallocMBB
35158 //
35159 // bumpMBB:
35160 // Allocate by subtracting from RSP
35161 // Jump to continueMBB
35162 //
35163 // mallocMBB:
35164 // Allocate by call to runtime
35165 //
35166 // continueMBB:
35167 // ...
35168 // [rest of original BB]
35169 //
35170
35171 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35172 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35173 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35174
  MachineRegisterInfo &MRI = MF->getRegInfo();
35176  const TargetRegisterClass *AddrRegClass =
      getRegClassFor(getPointerTy(MF->getDataLayout()));
35178
35179 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
35180 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
35181 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
35182 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
35183 sizeVReg = MI.getOperand(1).getReg(),
35184 physSPReg =
35185 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
35186
35187 MachineFunction::iterator MBBIter = ++BB->getIterator();
35188
35189 MF->insert(MBBIter, bumpMBB);
35190 MF->insert(MBBIter, mallocMBB);
35191 MF->insert(MBBIter, continueMBB);
35192
35193 continueMBB->splice(continueMBB->begin(), BB,
35194 std::next(MachineBasicBlock::iterator(MI)), BB->end());
35195 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
35196
35197 // Add code to the main basic block to check if the stack limit has been hit,
35198 // and if so, jump to mallocMBB otherwise to bumpMBB.
35199 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
35200 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
35201 .addReg(tmpSPVReg).addReg(sizeVReg);
35202 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
35203 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
35204 .addReg(SPLimitVReg);
35205 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
35206
35207 // bumpMBB simply decreases the stack pointer, since we know the current
35208 // stacklet has enough space.
35209 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
35210 .addReg(SPLimitVReg);
35211 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
35212 .addReg(SPLimitVReg);
35213 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
35214
35215 // Calls into a routine in libgcc to allocate more space from the heap.
35216  const uint32_t *RegMask =
      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
35218 if (IsLP64) {
35219 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
35220 .addReg(sizeVReg);
35221 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
35222 .addExternalSymbol("__morestack_allocate_stack_space")
35223 .addRegMask(RegMask)
35224 .addReg(X86::RDI, RegState::Implicit)
35225 .addReg(X86::RAX, RegState::ImplicitDefine);
35226 } else if (Is64Bit) {
35227 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
35228 .addReg(sizeVReg);
35229 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
35230 .addExternalSymbol("__morestack_allocate_stack_space")
35231 .addRegMask(RegMask)
35232 .addReg(X86::EDI, RegState::Implicit)
35233 .addReg(X86::EAX, RegState::ImplicitDefine);
35234 } else {
35235 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
35236 .addImm(12);
35237 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
35238 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
35239 .addExternalSymbol("__morestack_allocate_stack_space")
35240 .addRegMask(RegMask)
35241 .addReg(X86::EAX, RegState::ImplicitDefine);
35242 }
35243
35244 if (!Is64Bit)
35245 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
35246 .addImm(16);
35247
35248 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
35249 .addReg(IsLP64 ? X86::RAX : X86::EAX);
35250 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
35251
35252 // Set up the CFG correctly.
35253 BB->addSuccessor(bumpMBB);
35254 BB->addSuccessor(mallocMBB);
35255 mallocMBB->addSuccessor(continueMBB);
35256 bumpMBB->addSuccessor(continueMBB);
35257
35258 // Take care of the PHI nodes.
35259 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
35260 MI.getOperand(0).getReg())
35261 .addReg(mallocPtrVReg)
35262 .addMBB(mallocMBB)
35263 .addReg(bumpSPPtrVReg)
35264 .addMBB(bumpMBB);
35265
35266 // Delete the original pseudo instruction.
35267 MI.eraseFromParent();
35268
35269 // And we're done.
35270 return continueMBB;
35271}
35272
35274X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
35275 MachineBasicBlock *BB) const {
35276 MachineFunction *MF = BB->getParent();
35277 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
35278 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
35279 const MIMetadata MIMD(MI);
35280
  assert(!isAsynchronousEHPersonality(
             classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
35283          "SEH does not use catchret!");
35284
35285 // Only 32-bit EH needs to worry about manually restoring stack pointers.
35286 if (!Subtarget.is32Bit())
35287 return BB;
35288
35289 // C++ EH creates a new target block to hold the restore code, and wires up
35290 // the new block to the return destination with a normal JMP_4.
35291  MachineBasicBlock *RestoreMBB =
      MF->CreateMachineBasicBlock(BB->getBasicBlock());
35293 assert(BB->succ_size() == 1);
35294 MF->insert(std::next(BB->getIterator()), RestoreMBB);
35295 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
35296 BB->addSuccessor(RestoreMBB);
35297 MI.getOperand(0).setMBB(RestoreMBB);
35298
35299 // Marking this as an EH pad but not a funclet entry block causes PEI to
35300 // restore stack pointers in the block.
35301 RestoreMBB->setIsEHPad(true);
35302
35303 auto RestoreMBBI = RestoreMBB->begin();
35304 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
35305 return BB;
35306}
35307
MachineBasicBlock *
35309 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
35310 MachineBasicBlock *BB) const {
35311 // So, here we replace TLSADDR with the sequence:
35312 // adjust_stackdown -> TLSADDR -> adjust_stackup.
35313 // We need this because TLSADDR is lowered into calls
35314 // inside MC, therefore without the two markers shrink-wrapping
35315  // may push the prologue/epilogue past them.
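  // Schematically (64-bit shown, zero-sized adjustments), the block then holds:
  //   ADJCALLSTACKDOWN64 0, 0, 0
  //   TLS_addr64 ...               ; typically becomes a __tls_get_addr call in MC
  //   ADJCALLSTACKUP64 0, 0
  // so the call-frame markers bracket the eventual call.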
35316 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
35317 const MIMetadata MIMD(MI);
35318 MachineFunction &MF = *BB->getParent();
35319
35320 // Emit CALLSEQ_START right before the instruction.
35321 BB->getParent()->getFrameInfo().setAdjustsStack(true);
35322 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
35323 MachineInstrBuilder CallseqStart =
35324 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
35325 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
35326
35327 // Emit CALLSEQ_END right after the instruction.
35328 // We don't call erase from parent because we want to keep the
35329 // original instruction around.
35330 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
35331 MachineInstrBuilder CallseqEnd =
35332 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
35333 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
35334
35335 return BB;
35336}
35337
MachineBasicBlock *
35339 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
35340 MachineBasicBlock *BB) const {
35341 // This is pretty easy. We're taking the value that we received from
35342 // our load from the relocation, sticking it in either RDI (x86-64)
35343 // or EAX and doing an indirect call. The return value will then
35344 // be in the normal return register.
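  // On x86-64 Darwin, for instance, the result is roughly:
  //   movq  _var@TLVP(%rip), %rdi
  //   callq *(%rdi)            ; TLV getter, thread-local address lands in %rax
  // and the 32-bit cases below follow the same shape through %eax.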
35345 MachineFunction *F = BB->getParent();
35346 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35347 const MIMetadata MIMD(MI);
35348
35349 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
35350 assert(MI.getOperand(3).isGlobal() && "This should be a global");
35351
35352 // Get a register mask for the lowered call.
35353 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
35354 // proper register mask.
35355 const uint32_t *RegMask =
35356      Subtarget.is64Bit() ?
      Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
      Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
35359 if (Subtarget.is64Bit()) {
    MachineInstrBuilder MIB =
35361         BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
35362 .addReg(X86::RIP)
35363 .addImm(0)
35364 .addReg(0)
35365 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35366 MI.getOperand(3).getTargetFlags())
35367 .addReg(0);
35368 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
35369 addDirectMem(MIB, X86::RDI);
35370 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
35371 } else if (!isPositionIndependent()) {
    MachineInstrBuilder MIB =
35373         BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
35374 .addReg(0)
35375 .addImm(0)
35376 .addReg(0)
35377 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35378 MI.getOperand(3).getTargetFlags())
35379 .addReg(0);
35380 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
35381 addDirectMem(MIB, X86::EAX);
35382 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
35383 } else {
    MachineInstrBuilder MIB =
35385         BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
35386 .addReg(TII->getGlobalBaseReg(F))
35387 .addImm(0)
35388 .addReg(0)
35389 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35390 MI.getOperand(3).getTargetFlags())
35391 .addReg(0);
35392 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
35393 addDirectMem(MIB, X86::EAX);
35394 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
35395 }
35396
35397 MI.eraseFromParent(); // The pseudo instruction is gone now.
35398 return BB;
35399}
35400
35401static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
35402 switch (RPOpc) {
35403 case X86::INDIRECT_THUNK_CALL32:
35404 return X86::CALLpcrel32;
35405 case X86::INDIRECT_THUNK_CALL64:
35406 return X86::CALL64pcrel32;
35407 case X86::INDIRECT_THUNK_TCRETURN32:
35408 return X86::TCRETURNdi;
35409 case X86::INDIRECT_THUNK_TCRETURN64:
35410 return X86::TCRETURNdi64;
35411 }
35412 llvm_unreachable("not indirect thunk opcode");
35413}
35414
35415static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
35416 unsigned Reg) {
35417 if (Subtarget.useRetpolineExternalThunk()) {
35418 // When using an external thunk for retpolines, we pick names that match the
35419 // names GCC happens to use as well. This helps simplify the implementation
35420 // of the thunks for kernels where they have no easy ability to create
35421 // aliases and are doing non-trivial configuration of the thunk's body. For
35422 // example, the Linux kernel will do boot-time hot patching of the thunk
35423 // bodies and cannot easily export aliases of these to loaded modules.
35424 //
35425 // Note that at any point in the future, we may need to change the semantics
35426 // of how we implement retpolines and at that time will likely change the
35427 // name of the called thunk. Essentially, there is no hard guarantee that
35428 // LLVM will generate calls to specific thunks, we merely make a best-effort
35429 // attempt to help out kernels and other systems where duplicating the
35430 // thunks is costly.
35431 switch (Reg) {
35432 case X86::EAX:
35433 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35434 return "__x86_indirect_thunk_eax";
35435 case X86::ECX:
35436 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35437 return "__x86_indirect_thunk_ecx";
35438 case X86::EDX:
35439 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35440 return "__x86_indirect_thunk_edx";
35441 case X86::EDI:
35442 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35443 return "__x86_indirect_thunk_edi";
35444 case X86::R11:
35445 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35446 return "__x86_indirect_thunk_r11";
35447 }
35448 llvm_unreachable("unexpected reg for external indirect thunk");
35449 }
35450
35451 if (Subtarget.useRetpolineIndirectCalls() ||
35452 Subtarget.useRetpolineIndirectBranches()) {
35453 // When targeting an internal COMDAT thunk use an LLVM-specific name.
35454 switch (Reg) {
35455 case X86::EAX:
35456 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35457 return "__llvm_retpoline_eax";
35458 case X86::ECX:
35459 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35460 return "__llvm_retpoline_ecx";
35461 case X86::EDX:
35462 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35463 return "__llvm_retpoline_edx";
35464 case X86::EDI:
35465 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35466 return "__llvm_retpoline_edi";
35467 case X86::R11:
35468 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35469 return "__llvm_retpoline_r11";
35470 }
35471 llvm_unreachable("unexpected reg for retpoline");
35472 }
35473
35474 if (Subtarget.useLVIControlFlowIntegrity()) {
35475 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35476 return "__llvm_lvi_thunk_r11";
35477 }
35478 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
35479}
35480
MachineBasicBlock *
35482 X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
35483 MachineBasicBlock *BB) const {
35484 // Copy the virtual register into the R11 physical register and
35485 // call the retpoline thunk.
35486 const MIMetadata MIMD(MI);
35487 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35488 Register CalleeVReg = MI.getOperand(0).getReg();
35489 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
35490
35491 // Find an available scratch register to hold the callee. On 64-bit, we can
35492 // just use R11, but we scan for uses anyway to ensure we don't generate
35493 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
35494 // already a register use operand to the call to hold the callee. If none
35495 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
35496 // register and ESI is the base pointer to realigned stack frames with VLAs.
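  // As an example of the end result (assuming the retpoline feature on a
  // 64-bit target): an indirect call through %rax becomes
  //   movq  %rax, %r11
  //   callq __llvm_retpoline_r11
  // with the thunk performing the actual indirect transfer.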
35497 SmallVector<unsigned, 3> AvailableRegs;
35498 if (Subtarget.is64Bit())
35499 AvailableRegs.push_back(X86::R11);
35500 else
35501 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
35502
35503 // Zero out any registers that are already used.
35504 for (const auto &MO : MI.operands()) {
35505 if (MO.isReg() && MO.isUse())
35506 for (unsigned &Reg : AvailableRegs)
35507 if (Reg == MO.getReg())
35508 Reg = 0;
35509 }
35510
35511 // Choose the first remaining non-zero available register.
35512 unsigned AvailableReg = 0;
35513 for (unsigned MaybeReg : AvailableRegs) {
35514 if (MaybeReg) {
35515 AvailableReg = MaybeReg;
35516 break;
35517 }
35518 }
35519 if (!AvailableReg)
35520 report_fatal_error("calling convention incompatible with retpoline, no "
35521 "available registers");
35522
35523 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
35524
35525 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
35526 .addReg(CalleeVReg);
35527 MI.getOperand(0).ChangeToES(Symbol);
35528   MI.setDesc(TII->get(Opc));
  MachineInstrBuilder(*BB->getParent(), &MI)
35530 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
35531 return BB;
35532}
35533
35534/// SetJmp implies future control flow change upon calling the corresponding
35535/// LongJmp.
35536/// Instead of using the 'return' instruction, the long jump fixes the stack and
35537/// performs an indirect branch. To do so it uses the registers that were stored
35538/// in the jump buffer (when calling SetJmp).
35539/// In case the shadow stack is enabled we need to fix it as well, because some
35540/// return addresses will be skipped.
35541/// The function will save the SSP for future fixing in the function
35542/// emitLongJmpShadowStackFix.
35543/// \sa emitLongJmpShadowStackFix
35544/// \param [in] MI The temporary Machine Instruction for the builtin.
35545/// \param [in] MBB The Machine Basic Block that will be modified.
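/// Roughly, on a 64-bit target this emits:
///   xorq   %r, %r
///   rdsspq %r              ; current shadow stack pointer (0 if CET is off)
///   movq   %r, 24(buf)     ; slot 3 of the jump buffer (3 * pointer size)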
35546void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
35547 MachineBasicBlock *MBB) const {
35548 const MIMetadata MIMD(MI);
35549 MachineFunction *MF = MBB->getParent();
35550   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineInstrBuilder MIB;
35553
35554 // Memory Reference.
35555 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35556 MI.memoperands_end());
35557
35558 // Initialize a register with zero.
35559 MVT PVT = getPointerTy(MF->getDataLayout());
35560 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35561 Register ZReg = MRI.createVirtualRegister(PtrRC);
35562 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
35563 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
35564 .addDef(ZReg)
35565 .addReg(ZReg, RegState::Undef)
35566 .addReg(ZReg, RegState::Undef);
35567
35568 // Read the current SSP Register value to the zeroed register.
35569 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35570 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35571 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35572
35573 // Write the SSP register value to offset 3 in input memory buffer.
35574 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35575 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
35576 const int64_t SSPOffset = 3 * PVT.getStoreSize();
35577 const unsigned MemOpndSlot = 1;
35578 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35579 if (i == X86::AddrDisp)
35580 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
35581 else
35582 MIB.add(MI.getOperand(MemOpndSlot + i));
35583 }
35584 MIB.addReg(SSPCopyReg);
35585 MIB.setMemRefs(MMOs);
35586}
35587
35589X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
35590 MachineBasicBlock *MBB) const {
35591 const MIMetadata MIMD(MI);
35592 MachineFunction *MF = MBB->getParent();
35593 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35594   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
35596
35597   const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = ++MBB->getIterator();
35599
35600 // Memory Reference
35601 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35602 MI.memoperands_end());
35603
35604 unsigned DstReg;
35605 unsigned MemOpndSlot = 0;
35606
35607 unsigned CurOp = 0;
35608
35609 DstReg = MI.getOperand(CurOp++).getReg();
35610 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35611 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
35612 (void)TRI;
35613 Register mainDstReg = MRI.createVirtualRegister(RC);
35614 Register restoreDstReg = MRI.createVirtualRegister(RC);
35615
35616 MemOpndSlot = CurOp;
35617
35618 MVT PVT = getPointerTy(MF->getDataLayout());
35619 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35620 "Invalid Pointer Size!");
35621
35622 // For v = setjmp(buf), we generate
35623 //
35624 // thisMBB:
35625 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
35626 // SjLjSetup restoreMBB
35627 //
35628 // mainMBB:
35629 // v_main = 0
35630 //
35631 // sinkMBB:
35632 // v = phi(main, restore)
35633 //
35634 // restoreMBB:
35635 // if base pointer being used, load it from frame
35636 // v_restore = 1
35637
35638 MachineBasicBlock *thisMBB = MBB;
35639 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35640 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35641 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
35642 MF->insert(I, mainMBB);
35643 MF->insert(I, sinkMBB);
35644 MF->push_back(restoreMBB);
35645   restoreMBB->setMachineBlockAddressTaken();

  MachineInstrBuilder MIB;
35646
35648
35649 // Transfer the remainder of BB and its successor edges to sinkMBB.
35650 sinkMBB->splice(sinkMBB->begin(), MBB,
35651                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35653
35654 // thisMBB:
35655 unsigned PtrStoreOpc = 0;
35656 unsigned LabelReg = 0;
35657 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35658   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
                     !isPositionIndependent();
35660
35661 // Prepare IP either in reg or imm.
35662 if (!UseImmLabel) {
35663 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35664 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35665 LabelReg = MRI.createVirtualRegister(PtrRC);
35666 if (Subtarget.is64Bit()) {
35667 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
35668 .addReg(X86::RIP)
35669 .addImm(0)
35670 .addReg(0)
35671 .addMBB(restoreMBB)
35672 .addReg(0);
35673 } else {
35674 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
35675 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
35676 .addReg(XII->getGlobalBaseReg(MF))
35677 .addImm(0)
35678 .addReg(0)
35679 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
35680 .addReg(0);
35681 }
35682 } else
35683 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35684 // Store IP
35685 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
35686 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35687 if (i == X86::AddrDisp)
35688 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
35689 else
35690 MIB.add(MI.getOperand(MemOpndSlot + i));
35691 }
35692 if (!UseImmLabel)
35693 MIB.addReg(LabelReg);
35694 else
35695 MIB.addMBB(restoreMBB);
35696 MIB.setMemRefs(MMOs);
35697
35698 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35699 emitSetJmpShadowStackFix(MI, thisMBB);
35700 }
35701
35702 // Setup
35703 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
35704 .addMBB(restoreMBB);
35705
35706 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35707 MIB.addRegMask(RegInfo->getNoPreservedMask());
35708 thisMBB->addSuccessor(mainMBB);
35709 thisMBB->addSuccessor(restoreMBB);
35710
35711 // mainMBB:
35712 // EAX = 0
35713 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
35714 mainMBB->addSuccessor(sinkMBB);
35715
35716 // sinkMBB:
35717 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35718 .addReg(mainDstReg)
35719 .addMBB(mainMBB)
35720 .addReg(restoreDstReg)
35721 .addMBB(restoreMBB);
35722
35723 // restoreMBB:
35724 if (RegInfo->hasBasePointer(*MF)) {
35725 const bool Uses64BitFramePtr =
35726 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
35728     X86FI->setRestoreBasePointer(MF);
35729 Register FramePtr = RegInfo->getFrameRegister(*MF);
35730 Register BasePtr = RegInfo->getBaseRegister();
35731 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
35732 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
35733                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
        .setMIFlag(MachineInstr::FrameSetup);
35735 }
35736 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
35737 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35738 restoreMBB->addSuccessor(sinkMBB);
35739
35740 MI.eraseFromParent();
35741 return sinkMBB;
35742}
35743
35744/// Fix the shadow stack using the previously saved SSP pointer.
35745/// \sa emitSetJmpShadowStackFix
35746/// \param [in] MI The temporary Machine Instruction for the builtin.
35747/// \param [in] MBB The Machine Basic Block that will be modified.
35748/// \return The sink MBB that will perform the future indirect branch.
MachineBasicBlock *
35750 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
35751 MachineBasicBlock *MBB) const {
35752 const MIMetadata MIMD(MI);
35753 MachineFunction *MF = MBB->getParent();
35754   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
35756
35757 // Memory Reference
35758 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35759 MI.memoperands_end());
35760
35761 MVT PVT = getPointerTy(MF->getDataLayout());
35762 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35763
35764 // checkSspMBB:
35765 // xor vreg1, vreg1
35766 // rdssp vreg1
35767 // test vreg1, vreg1
35768 // je sinkMBB # Jump if Shadow Stack is not supported
35769 // fallMBB:
35770 // mov buf+24/12(%rip), vreg2
35771 // sub vreg1, vreg2
35772 // jbe sinkMBB # No need to fix the Shadow Stack
35773 // fixShadowMBB:
35774 // shr 3/2, vreg2
35775 // incssp vreg2 # fix the SSP according to the lower 8 bits
35776 // shr 8, vreg2
35777 // je sinkMBB
35778 // fixShadowLoopPrepareMBB:
35779 // shl vreg2
35780 // mov 128, vreg3
35781 // fixShadowLoopMBB:
35782 // incssp vreg3
35783 // dec vreg2
35784 // jne fixShadowLoopMBB # Iterate until you finish fixing
35785 // # the Shadow Stack
35786 // sinkMBB:
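  //
  // Worked example of the arithmetic below: if the saved SSP is 773 slots
  // (0x305) above the current one, the first incssp consumes the low 8 bits
  // (5 slots), the remaining 0x3 blocks of 256 become a counter of 6 after the
  // shift left, and the loop then runs incssp with 128 six times (768 slots).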
35787
  MachineFunction::iterator I = ++MBB->getIterator();
35789   const BasicBlock *BB = MBB->getBasicBlock();
35790
35791 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
35792 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35793 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
35794 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
35795 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
35796 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35797 MF->insert(I, checkSspMBB);
35798 MF->insert(I, fallMBB);
35799 MF->insert(I, fixShadowMBB);
35800 MF->insert(I, fixShadowLoopPrepareMBB);
35801 MF->insert(I, fixShadowLoopMBB);
35802 MF->insert(I, sinkMBB);
35803
35804 // Transfer the remainder of BB and its successor edges to sinkMBB.
35805 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
35806                   MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35808
35809 MBB->addSuccessor(checkSspMBB);
35810
35811 // Initialize a register with zero.
35812 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
35813 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
35814
35815 if (PVT == MVT::i64) {
35816 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
35817 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
35818 .addImm(0)
35819 .addReg(ZReg)
35820 .addImm(X86::sub_32bit);
35821 ZReg = TmpZReg;
35822 }
35823
35824 // Read the current SSP Register value to the zeroed register.
35825 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35826 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35827 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35828
35829 // Check whether the result of the SSP register is zero and jump directly
35830 // to the sink.
35831 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
35832 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
35833 .addReg(SSPCopyReg)
35834 .addReg(SSPCopyReg);
35835 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
35836       .addMBB(sinkMBB)
      .addImm(X86::COND_E);
35838 checkSspMBB->addSuccessor(sinkMBB);
35839 checkSspMBB->addSuccessor(fallMBB);
35840
35841 // Reload the previously saved SSP register value.
35842 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
35843 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35844 const int64_t SPPOffset = 3 * PVT.getStoreSize();
  MachineInstrBuilder MIB =
35846       BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
35847 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35848 const MachineOperand &MO = MI.getOperand(i);
35849 if (i == X86::AddrDisp)
35850 MIB.addDisp(MO, SPPOffset);
35851 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35852 // preserve kill flags.
35853 MIB.addReg(MO.getReg());
35854 else
35855 MIB.add(MO);
35856 }
35857 MIB.setMemRefs(MMOs);
35858
35859 // Subtract the current SSP from the previous SSP.
35860 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
35861 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
35862 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
35863 .addReg(PrevSSPReg)
35864 .addReg(SSPCopyReg);
35865
35866 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
35867 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
35868       .addMBB(sinkMBB)
      .addImm(X86::COND_BE);
35870 fallMBB->addSuccessor(sinkMBB);
35871 fallMBB->addSuccessor(fixShadowMBB);
35872
35873 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
35874 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
35875 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
35876 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
35877 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
35878 .addReg(SspSubReg)
35879 .addImm(Offset);
35880
35881   // Increase the SSP, looking only at the lower 8 bits of the delta.
35882 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
35883 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
35884
35885 // Reset the lower 8 bits.
35886 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
35887 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
35888 .addReg(SspFirstShrReg)
35889 .addImm(8);
35890
35891 // Jump if the result of the shift is zero.
35892 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
35893       .addMBB(sinkMBB)
      .addImm(X86::COND_E);
35895 fixShadowMBB->addSuccessor(sinkMBB);
35896 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
35897
35898 // Do a single shift left.
35899 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
35900 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
35901 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
35902 .addReg(SspSecondShrReg)
35903 .addImm(1);
35904
35905 // Save the value 128 to a register (will be used next with incssp).
35906 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
35907 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
35908 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
35909 .addImm(128);
35910 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
35911
35912 // Since incssp only looks at the lower 8 bits, we might need to do several
35913 // iterations of incssp until we finish fixing the shadow stack.
35914 Register DecReg = MRI.createVirtualRegister(PtrRC);
35915 Register CounterReg = MRI.createVirtualRegister(PtrRC);
35916 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
35917 .addReg(SspAfterShlReg)
35918 .addMBB(fixShadowLoopPrepareMBB)
35919 .addReg(DecReg)
35920 .addMBB(fixShadowLoopMBB);
35921
35922 // Every iteration we increase the SSP by 128.
35923 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
35924
35925 // Every iteration we decrement the counter by 1.
35926 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
35927 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
35928
35929 // Jump if the counter is not zero yet.
35930 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
35931       .addMBB(fixShadowLoopMBB)
      .addImm(X86::COND_NE);
35933 fixShadowLoopMBB->addSuccessor(sinkMBB);
35934 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
35935
35936 return sinkMBB;
35937}
35938
MachineBasicBlock *
35940 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
35941 MachineBasicBlock *MBB) const {
35942 const MIMetadata MIMD(MI);
35943 MachineFunction *MF = MBB->getParent();
35944   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
35946
35947 // Memory Reference
35948 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35949 MI.memoperands_end());
35950
35951 MVT PVT = getPointerTy(MF->getDataLayout());
35952 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35953 "Invalid Pointer Size!");
35954
35955 const TargetRegisterClass *RC =
35956 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
35957 Register Tmp = MRI.createVirtualRegister(RC);
35958 // Since FP is only updated here but NOT referenced, it's treated as GPR.
35959 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35960 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
35961   Register SP = RegInfo->getStackRegister();

  MachineInstrBuilder MIB;
35962
35964
35965 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35966 const int64_t SPOffset = 2 * PVT.getStoreSize();
35967
35968 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35969 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
35970
35971 MachineBasicBlock *thisMBB = MBB;
35972
35973 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
35974 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35975 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
35976 }
35977
35978 // Reload FP
35979 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
35980 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35981 const MachineOperand &MO = MI.getOperand(i);
35982 if (MO.isReg()) // Don't add the whole operand, we don't want to
35983 // preserve kill flags.
35984 MIB.addReg(MO.getReg());
35985 else
35986 MIB.add(MO);
35987 }
35988 MIB.setMemRefs(MMOs);
35989
35990 // Reload IP
35991 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
35992 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35993 const MachineOperand &MO = MI.getOperand(i);
35994 if (i == X86::AddrDisp)
35995 MIB.addDisp(MO, LabelOffset);
35996 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35997 // preserve kill flags.
35998 MIB.addReg(MO.getReg());
35999 else
36000 MIB.add(MO);
36001 }
36002 MIB.setMemRefs(MMOs);
36003
36004 // Reload SP
36005 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
36006 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36007 if (i == X86::AddrDisp)
36008 MIB.addDisp(MI.getOperand(i), SPOffset);
36009 else
36010 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
36011 // the last instruction of the expansion.
36012 }
36013 MIB.setMemRefs(MMOs);
36014
36015 // Jump
36016 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
36017
36018 MI.eraseFromParent();
36019 return thisMBB;
36020}
36021
36022void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
                                               MachineBasicBlock *MBB,
36024                                                MachineBasicBlock *DispatchBB,
36025 int FI) const {
36026 const MIMetadata MIMD(MI);
36027 MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
36029   const X86InstrInfo *TII = Subtarget.getInstrInfo();
36030
36031 MVT PVT = getPointerTy(MF->getDataLayout());
36032 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
36033
36034 unsigned Op = 0;
36035 unsigned VR = 0;
36036
36037   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
                     !isPositionIndependent();
36039
36040 if (UseImmLabel) {
36041 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
36042 } else {
36043 const TargetRegisterClass *TRC =
36044 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
36045 VR = MRI->createVirtualRegister(TRC);
36046 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36047
36048 if (Subtarget.is64Bit())
36049 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
36050 .addReg(X86::RIP)
36051 .addImm(1)
36052 .addReg(0)
36053 .addMBB(DispatchBB)
36054 .addReg(0);
36055 else
36056 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
36057 .addReg(0) /* TII->getGlobalBaseReg(MF) */
36058 .addImm(1)
36059 .addReg(0)
36060 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
36061 .addReg(0);
36062 }
36063
36064 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
36065 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
36066 if (UseImmLabel)
36067 MIB.addMBB(DispatchBB);
36068 else
36069 MIB.addReg(VR);
36070}
36071
MachineBasicBlock *
36073 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
36074 MachineBasicBlock *BB) const {
36075 const MIMetadata MIMD(MI);
36076 MachineFunction *MF = BB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
36078   const X86InstrInfo *TII = Subtarget.getInstrInfo();
36079 int FI = MF->getFrameInfo().getFunctionContextIndex();
36080
36081 // Get a mapping of the call site numbers to all of the landing pads they're
36082 // associated with.
  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
36084   unsigned MaxCSNum = 0;
36085 for (auto &MBB : *MF) {
36086 if (!MBB.isEHPad())
36087 continue;
36088
36089 MCSymbol *Sym = nullptr;
36090 for (const auto &MI : MBB) {
36091 if (MI.isDebugInstr())
36092 continue;
36093
36094 assert(MI.isEHLabel() && "expected EH_LABEL");
36095 Sym = MI.getOperand(0).getMCSymbol();
36096 break;
36097 }
36098
36099 if (!MF->hasCallSiteLandingPad(Sym))
36100 continue;
36101
36102 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
36103 CallSiteNumToLPad[CSI].push_back(&MBB);
36104 MaxCSNum = std::max(MaxCSNum, CSI);
36105 }
36106 }
36107
36108 // Get an ordered list of the machine basic blocks for the jump table.
36109 std::vector<MachineBasicBlock *> LPadList;
  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
36111   LPadList.reserve(CallSiteNumToLPad.size());
36112
36113 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
36114 for (auto &LP : CallSiteNumToLPad[CSI]) {
36115 LPadList.push_back(LP);
36116 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
36117 }
36118 }
36119
36120 assert(!LPadList.empty() &&
36121 "No landing pad destinations for the dispatch jump table!");
36122
36123 // Create the MBBs for the dispatch code.
36124
36125 // Shove the dispatch's address into the return slot in the function context.
36126 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
36127 DispatchBB->setIsEHPad(true);
36128
36129 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
36130 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
36131 DispatchBB->addSuccessor(TrapBB);
36132
36133 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
36134 DispatchBB->addSuccessor(DispContBB);
36135
36136 // Insert MBBs.
36137 MF->push_back(DispatchBB);
36138 MF->push_back(DispContBB);
36139 MF->push_back(TrapBB);
36140
36141 // Insert code into the entry block that creates and registers the function
36142 // context.
36143 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
36144
36145 // Create the jump table and associated information
36146 unsigned JTE = getJumpTableEncoding();
36147 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
36148 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
36149
36150 const X86RegisterInfo &RI = TII->getRegisterInfo();
36151 // Add a register mask with no preserved registers. This results in all
36152 // registers being marked as clobbered.
36153 if (RI.hasBasePointer(*MF)) {
36154 const bool FPIs64Bit =
36155 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
36156 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
36157 MFI->setRestoreBasePointer(MF);
36158
36159 Register FP = RI.getFrameRegister(*MF);
36160 Register BP = RI.getBaseRegister();
36161 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
36162     addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
                 MFI->getRestoreBasePointerOffset())
        .addRegMask(RI.getNoPreservedMask());
36165 } else {
36166     BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
        .addRegMask(RI.getNoPreservedMask());
36168 }
36169
36170 // IReg is used as an index in a memory operand and therefore can't be SP
36171 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
36172 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
36173 Subtarget.is64Bit() ? 8 : 4);
36174 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
36175 .addReg(IReg)
36176 .addImm(LPadList.size());
36177 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
36178       .addMBB(TrapBB)
      .addImm(X86::COND_AE);
36180
36181 if (Subtarget.is64Bit()) {
36182 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
36183 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
36184
36185 // leaq .LJTI0_0(%rip), BReg
36186 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
36187 .addReg(X86::RIP)
36188 .addImm(1)
36189 .addReg(0)
36190 .addJumpTableIndex(MJTI)
36191 .addReg(0);
36192 // movzx IReg64, IReg
36193 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
36194 .addImm(0)
36195 .addReg(IReg)
36196 .addImm(X86::sub_32bit);
36197
36198 switch (JTE) {
    case MachineJumpTableInfo::EK_BlockAddress:
36200       // jmpq *(BReg,IReg64,8)
36201 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
36202 .addReg(BReg)
36203 .addImm(8)
36204 .addReg(IReg64)
36205 .addImm(0)
36206 .addReg(0);
36207 break;
    case MachineJumpTableInfo::EK_Custom32: {
36209       Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
36210 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
36211 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
36212
36213 // movl (BReg,IReg64,4), OReg
36214 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
36215 .addReg(BReg)
36216 .addImm(4)
36217 .addReg(IReg64)
36218 .addImm(0)
36219 .addReg(0);
36220 // movsx OReg64, OReg
36221 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
36222 .addReg(OReg);
36223 // addq BReg, OReg64, TReg
36224 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
36225 .addReg(OReg64)
36226 .addReg(BReg);
36227 // jmpq *TReg
36228 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
36229 break;
36230 }
36231 default:
36232 llvm_unreachable("Unexpected jump table encoding");
36233 }
36234 } else {
36235 // jmpl *.LJTI0_0(,IReg,4)
36236 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
36237 .addReg(0)
36238 .addImm(4)
36239 .addReg(IReg)
36240 .addJumpTableIndex(MJTI)
36241 .addReg(0);
36242 }
36243
36244 // Add the jump table entries as successors to the MBB.
  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
36246   for (auto &LP : LPadList)
36247 if (SeenMBBs.insert(LP).second)
36248 DispContBB->addSuccessor(LP);
36249
36250 // N.B. the order the invoke BBs are processed in doesn't matter here.
  SmallVector<MachineBasicBlock *, 64> MBBLPads;
36252   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
36253 for (MachineBasicBlock *MBB : InvokeBBs) {
36254 // Remove the landing pad successor from the invoke block and replace it
36255 // with the new dispatch block.
36256 // Keep a copy of Successors since it's modified inside the loop.
    SmallVector<MachineBasicBlock *, 4> Successors(MBB->succ_rbegin(),
36258                                                    MBB->succ_rend());
36259 // FIXME: Avoid quadratic complexity.
36260 for (auto *MBBS : Successors) {
36261 if (MBBS->isEHPad()) {
36262 MBB->removeSuccessor(MBBS);
36263 MBBLPads.push_back(MBBS);
36264 }
36265 }
36266
36267 MBB->addSuccessor(DispatchBB);
36268
36269 // Find the invoke call and mark all of the callee-saved registers as
36270 // 'implicit defined' so that they're spilled. This prevents code from
36271 // moving instructions to before the EH block, where they will never be
36272 // executed.
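    // For example, on x86-64 SysV this typically appends implicit-def dead
    // operands for rbx, rbp and r12-r15 to the invoking call unless the call
    // already lists them, so the register allocator keeps them spilled around
    // the invoke.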
36273 for (auto &II : reverse(*MBB)) {
36274 if (!II.isCall())
36275 continue;
36276
      DenseMap<unsigned, bool> DefRegs;
36278       for (auto &MOp : II.operands())
36279 if (MOp.isReg())
36280 DefRegs[MOp.getReg()] = true;
36281
36282 MachineInstrBuilder MIB(*MF, &II);
36283 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
36284 unsigned Reg = SavedRegs[RegIdx];
36285         if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
36287 }
36288
36289 break;
36290 }
36291 }
36292
36293 // Mark all former landing pads as non-landing pads. The dispatch is the only
36294 // landing pad now.
36295 for (auto &LP : MBBLPads)
36296 LP->setIsEHPad(false);
36297
36298 // The instruction is gone now.
36299 MI.eraseFromParent();
36300 return BB;
36301}
36302
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
36305                                                MachineBasicBlock *BB) const {
36306 MachineFunction *MF = BB->getParent();
36307 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36308 const MIMetadata MIMD(MI);
36309
36310 auto TMMImmToTMMReg = [](unsigned Imm) {
36311 assert (Imm < 8 && "Illegal tmm index");
36312 return X86::TMM0 + Imm;
36313 };
36314 switch (MI.getOpcode()) {
36315 default: llvm_unreachable("Unexpected instr type to insert");
36316 case X86::TLS_addr32:
36317 case X86::TLS_addr64:
36318 case X86::TLS_addrX32:
36319 case X86::TLS_base_addr32:
36320 case X86::TLS_base_addr64:
36321 case X86::TLS_base_addrX32:
36322 case X86::TLS_desc32:
36323 case X86::TLS_desc64:
36324 return EmitLoweredTLSAddr(MI, BB);
36325 case X86::INDIRECT_THUNK_CALL32:
36326 case X86::INDIRECT_THUNK_CALL64:
36327 case X86::INDIRECT_THUNK_TCRETURN32:
36328 case X86::INDIRECT_THUNK_TCRETURN64:
36329 return EmitLoweredIndirectThunk(MI, BB);
36330 case X86::CATCHRET:
36331 return EmitLoweredCatchRet(MI, BB);
36332 case X86::SEG_ALLOCA_32:
36333 case X86::SEG_ALLOCA_64:
36334 return EmitLoweredSegAlloca(MI, BB);
36335 case X86::PROBED_ALLOCA_32:
36336 case X86::PROBED_ALLOCA_64:
36337 return EmitLoweredProbedAlloca(MI, BB);
36338 case X86::TLSCall_32:
36339 case X86::TLSCall_64:
36340 return EmitLoweredTLSCall(MI, BB);
36341 case X86::CMOV_FR16:
36342 case X86::CMOV_FR16X:
36343 case X86::CMOV_FR32:
36344 case X86::CMOV_FR32X:
36345 case X86::CMOV_FR64:
36346 case X86::CMOV_FR64X:
36347 case X86::CMOV_GR8:
36348 case X86::CMOV_GR16:
36349 case X86::CMOV_GR32:
36350 case X86::CMOV_RFP32:
36351 case X86::CMOV_RFP64:
36352 case X86::CMOV_RFP80:
36353 case X86::CMOV_VR64:
36354 case X86::CMOV_VR128:
36355 case X86::CMOV_VR128X:
36356 case X86::CMOV_VR256:
36357 case X86::CMOV_VR256X:
36358 case X86::CMOV_VR512:
36359 case X86::CMOV_VK1:
36360 case X86::CMOV_VK2:
36361 case X86::CMOV_VK4:
36362 case X86::CMOV_VK8:
36363 case X86::CMOV_VK16:
36364 case X86::CMOV_VK32:
36365 case X86::CMOV_VK64:
36366 return EmitLoweredSelect(MI, BB);
36367
36368 case X86::FP80_ADDr:
36369 case X86::FP80_ADDm32: {
36370 // Change the floating point control register to use double extended
36371 // precision when performing the addition.
36372 int OrigCWFrameIdx =
36373 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36374 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
36375 OrigCWFrameIdx);
36376
36377 // Load the old value of the control word...
36378 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36379 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
36380 OrigCWFrameIdx);
36381
36382 // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
36383 // precision.
36384 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36385 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
36386 .addReg(OldCW, RegState::Kill)
36387 .addImm(0x300);
36388
36389 // Extract to 16 bits.
36390 Register NewCW16 =
36391 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
36392 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36393 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
36394
36395 // Prepare memory for FLDCW.
36396 int NewCWFrameIdx =
36397 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36398 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36399 NewCWFrameIdx)
36400 .addReg(NewCW16, RegState::Kill);
36401
36402 // Reload the modified control word now...
36403 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36404 NewCWFrameIdx);
36405
36406 // Do the addition.
36407 if (MI.getOpcode() == X86::FP80_ADDr) {
36408 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
36409 .add(MI.getOperand(0))
36410 .add(MI.getOperand(1))
36411 .add(MI.getOperand(2));
36412 } else {
36413 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
36414 .add(MI.getOperand(0))
36415 .add(MI.getOperand(1))
36416 .add(MI.getOperand(2))
36417 .add(MI.getOperand(3))
36418 .add(MI.getOperand(4))
36419 .add(MI.getOperand(5))
36420 .add(MI.getOperand(6));
36421 }
36422
36423 // Reload the original control word now.
36424 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36425 OrigCWFrameIdx);
36426
36427 MI.eraseFromParent(); // The pseudo instruction is gone now.
36428 return BB;
36429 }
36430
36431 case X86::FP32_TO_INT16_IN_MEM:
36432 case X86::FP32_TO_INT32_IN_MEM:
36433 case X86::FP32_TO_INT64_IN_MEM:
36434 case X86::FP64_TO_INT16_IN_MEM:
36435 case X86::FP64_TO_INT32_IN_MEM:
36436 case X86::FP64_TO_INT64_IN_MEM:
36437 case X86::FP80_TO_INT16_IN_MEM:
36438 case X86::FP80_TO_INT32_IN_MEM:
36439 case X86::FP80_TO_INT64_IN_MEM: {
36440 // Change the floating point control register to use "round towards zero"
36441 // mode when truncating to an integer value.
36442 int OrigCWFrameIdx =
36443 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36444 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
36445 OrigCWFrameIdx);
36446
36447 // Load the old value of the control word...
36448 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36449 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
36450 OrigCWFrameIdx);
36451
36452 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
36453 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36454 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
36455 .addReg(OldCW, RegState::Kill).addImm(0xC00);
36456
36457 // Extract to 16 bits.
36458 Register NewCW16 =
36459 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
36460 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36461 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
36462
36463 // Prepare memory for FLDCW.
36464 int NewCWFrameIdx =
36465 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36466 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36467 NewCWFrameIdx)
36468 .addReg(NewCW16, RegState::Kill);
36469
36470 // Reload the modified control word now...
36471 addFrameReference(BuildMI(*BB, MI, MIMD,
36472 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
36473
36474 // Get the X86 opcode to use.
36475 unsigned Opc;
36476 switch (MI.getOpcode()) {
36477 // clang-format off
36478 default: llvm_unreachable("illegal opcode!");
36479 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
36480 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
36481 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
36482 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
36483 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
36484 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
36485 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
36486 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
36487 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
36488 // clang-format on
36489 }
36490
36491     X86AddressMode AM = getAddressFromInstr(&MI, 0);
36492 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
36493 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
36494
36495 // Reload the original control word now.
36496 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36497 OrigCWFrameIdx);
36498
36499 MI.eraseFromParent(); // The pseudo instruction is gone now.
36500 return BB;
36501 }
36502
36503 // xbegin
36504 case X86::XBEGIN:
36505 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
36506
36507 case X86::VAARG_64:
36508 case X86::VAARG_X32:
36509 return EmitVAARGWithCustomInserter(MI, BB);
36510
36511 case X86::EH_SjLj_SetJmp32:
36512 case X86::EH_SjLj_SetJmp64:
36513 return emitEHSjLjSetJmp(MI, BB);
36514
36515 case X86::EH_SjLj_LongJmp32:
36516 case X86::EH_SjLj_LongJmp64:
36517 return emitEHSjLjLongJmp(MI, BB);
36518
36519 case X86::Int_eh_sjlj_setup_dispatch:
36520 return EmitSjLjDispatchBlock(MI, BB);
36521
36522 case TargetOpcode::STATEPOINT:
36523 // As an implementation detail, STATEPOINT shares the STACKMAP format at
36524 // this point in the process. We diverge later.
36525 return emitPatchPoint(MI, BB);
36526
36527 case TargetOpcode::STACKMAP:
36528 case TargetOpcode::PATCHPOINT:
36529 return emitPatchPoint(MI, BB);
36530
36531 case TargetOpcode::PATCHABLE_EVENT_CALL:
36532 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
36533 return BB;
36534
36535 case X86::LCMPXCHG8B: {
36536 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36537     // In addition to the four E[ABCD] registers implied by the encoding,
36538     // CMPXCHG8B requires a memory operand. If the current architecture is
36539     // i686 and the current function needs a base pointer
36540     // - which is ESI on i686 - the register allocator would not be able to
36541     // allocate registers for an address of the form X(%reg, %reg, Y):
36542     // there would never be enough unreserved registers during regalloc
36543     // (without the base pointer the only option would be X(%edi, %esi, Y)).
36544     // We give the register allocator a hand by precomputing the address in
36545     // a new vreg using LEA.
36546
36547 // If it is not i686 or there is no base pointer - nothing to do here.
36548 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
36549 return BB;
36550
36551     // Even though this code does not necessarily need the base pointer to
36552     // be ESI, we check for that. The reason: if this assert fails, something
36553     // has changed in the compiler's base pointer handling, which most
36554     // probably has to be addressed somehow here.
36555 assert(TRI->getBaseRegister() == X86::ESI &&
36556 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
36557 "base pointer in mind");
36558
36559     MachineRegisterInfo &MRI = MF->getRegInfo();
36560 MVT SPTy = getPointerTy(MF->getDataLayout());
36561 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
36562 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
36563
36564     X86AddressMode AM = getAddressFromInstr(&MI, 0);
36565 // Regalloc does not need any help when the memory operand of CMPXCHG8B
36566     // does not use an index register.
36567 if (AM.IndexReg == X86::NoRegister)
36568 return BB;
36569
36570 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
36571 // four operand definitions that are E[ABCD] registers. We skip them and
36572 // then insert the LEA.
36573 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
36574 while (RMBBI != BB->rend() &&
36575 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
36576 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
36577 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
36578 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
36579 ++RMBBI;
36580 }
36581     MachineBasicBlock::iterator MBBI(RMBBI);
36582     addFullAddress(
36583 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
36584
36585 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
36586
36587 return BB;
36588 }
36589 case X86::LCMPXCHG16B_NO_RBX: {
36590 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36591 Register BasePtr = TRI->getBaseRegister();
36592 if (TRI->hasBasePointer(*MF) &&
36593 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
36594 if (!BB->isLiveIn(BasePtr))
36595 BB->addLiveIn(BasePtr);
36596 // Save RBX into a virtual register.
36597 Register SaveRBX =
36598 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36599 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36600 .addReg(X86::RBX);
36601 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36602       MachineInstrBuilder MIB =
36603 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
36604 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36605 MIB.add(MI.getOperand(Idx));
36606 MIB.add(MI.getOperand(X86::AddrNumOperands));
36607 MIB.addReg(SaveRBX);
36608 } else {
36609 // Simple case, just copy the virtual register to RBX.
36610 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
36611 .add(MI.getOperand(X86::AddrNumOperands));
36612       MachineInstrBuilder MIB =
36613 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
36614 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36615 MIB.add(MI.getOperand(Idx));
36616 }
36617 MI.eraseFromParent();
36618 return BB;
36619 }
36620 case X86::MWAITX: {
36621 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36622 Register BasePtr = TRI->getBaseRegister();
36623 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
36624     // If there is no need to save the base pointer, we generate MWAITXrrr;
36625     // otherwise we generate the pseudo MWAITX_SAVE_RBX.
36626 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
36627 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36628 .addReg(MI.getOperand(0).getReg());
36629 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36630 .addReg(MI.getOperand(1).getReg());
36631 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
36632 .addReg(MI.getOperand(2).getReg());
36633 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
36634 MI.eraseFromParent();
36635 } else {
36636 if (!BB->isLiveIn(BasePtr)) {
36637 BB->addLiveIn(BasePtr);
36638 }
36639 // Parameters can be copied into ECX and EAX but not EBX yet.
36640 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36641 .addReg(MI.getOperand(0).getReg());
36642 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36643 .addReg(MI.getOperand(1).getReg());
36644 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
36645 // Save RBX into a virtual register.
36646 Register SaveRBX =
36647 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36648 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36649 .addReg(X86::RBX);
36650 // Generate mwaitx pseudo.
36651 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36652 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
36653 .addDef(Dst) // Destination tied in with SaveRBX.
36654 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
36655 .addUse(SaveRBX); // Save of base pointer.
36656 MI.eraseFromParent();
36657 }
36658 return BB;
36659 }
36660 case TargetOpcode::PREALLOCATED_SETUP: {
36661 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
36662 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36663 MFI->setHasPreallocatedCall(true);
36664 int64_t PreallocatedId = MI.getOperand(0).getImm();
36665 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
36666 assert(StackAdjustment != 0 && "0 stack adjustment");
36667 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
36668 << StackAdjustment << "\n");
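    // Carve out the preallocated call's argument area by subtracting its size
    // from ESP.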
36669 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
36670 .addReg(X86::ESP)
36671 .addImm(StackAdjustment);
36672 MI.eraseFromParent();
36673 return BB;
36674 }
36675 case TargetOpcode::PREALLOCATED_ARG: {
36676 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
36677 int64_t PreallocatedId = MI.getOperand(1).getImm();
36678 int64_t ArgIdx = MI.getOperand(2).getImm();
36679 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36680 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
36681 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
36682 << ", arg offset " << ArgOffset << "\n");
36683 // stack pointer + offset
36684 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
36685 MI.getOperand(0).getReg()),
36686 X86::ESP, false, ArgOffset);
36687 MI.eraseFromParent();
36688 return BB;
36689 }
36690 case X86::PTDPBSSD:
36691 case X86::PTDPBSUD:
36692 case X86::PTDPBUSD:
36693 case X86::PTDPBUUD:
36694 case X86::PTDPBF16PS:
36695 case X86::PTDPFP16PS: {
36696 unsigned Opc;
36697 switch (MI.getOpcode()) {
36698 // clang-format off
36699 default: llvm_unreachable("illegal opcode!");
36700 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
36701 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
36702 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
36703 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
36704 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
36705 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
36706 // clang-format on
36707 }
36708
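    // Expand the tile pseudo: operand 0 selects the destination/accumulator
    // tile, operands 1 and 2 select the source tiles.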
36709 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36710 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36711 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36712 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36713 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36714
36715 MI.eraseFromParent(); // The pseudo is gone now.
36716 return BB;
36717 }
36718 case X86::PTILEZERO: {
36719 unsigned Imm = MI.getOperand(0).getImm();
36720 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
36721 MI.eraseFromParent(); // The pseudo is gone now.
36722 return BB;
36723 }
36724 case X86::PTILELOADD:
36725 case X86::PTILELOADDT1:
36726 case X86::PTILESTORED: {
36727 unsigned Opc;
36728 switch (MI.getOpcode()) {
36729 default: llvm_unreachable("illegal opcode!");
36730#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
36731 case X86::PTILELOADD:
36732 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
36733 break;
36734 case X86::PTILELOADDT1:
36735 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
36736 break;
36737 case X86::PTILESTORED:
36738 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
36739 break;
36740#undef GET_EGPR_IF_ENABLED
36741 }
36742
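    // Tile loads take the TMM destination first, then the five memory-address
    // operands; TILESTORED takes the memory operands first and the TMM source
    // register last.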
36743 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36744 unsigned CurOp = 0;
36745 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
36746 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36747                  RegState::Define);
36748
36749 MIB.add(MI.getOperand(CurOp++)); // base
36750 MIB.add(MI.getOperand(CurOp++)); // scale
36751 MIB.add(MI.getOperand(CurOp++)); // index -- stride
36752 MIB.add(MI.getOperand(CurOp++)); // displacement
36753 MIB.add(MI.getOperand(CurOp++)); // segment
36754
36755 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
36756 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36757                  RegState::Undef);
36758
36759 MI.eraseFromParent(); // The pseudo is gone now.
36760 return BB;
36761 }
36762 case X86::PTCMMIMFP16PS:
36763 case X86::PTCMMRLFP16PS: {
36764 const MIMetadata MIMD(MI);
36765 unsigned Opc;
36766 switch (MI.getOpcode()) {
36767 // clang-format off
36768 default: llvm_unreachable("Unexpected instruction!");
36769 case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
36770 case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
36771 // clang-format on
36772 }
36773 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36774 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36775 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36776 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36777 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36778 MI.eraseFromParent(); // The pseudo is gone now.
36779 return BB;
36780 }
36781 }
36782}
36783
36784//===----------------------------------------------------------------------===//
36785// X86 Optimization Hooks
36786//===----------------------------------------------------------------------===//
36787
36788bool
36789 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
36790 const APInt &DemandedBits,
36791 const APInt &DemandedElts,
36792 TargetLoweringOpt &TLO) const {
36793 EVT VT = Op.getValueType();
36794 unsigned Opcode = Op.getOpcode();
36795 unsigned EltSize = VT.getScalarSizeInBits();
36796
36797 if (VT.isVector()) {
36798 // If the constant is only all signbits in the active bits, then we should
36799     // extend it to the entire constant to allow it to act as a boolean constant
36800 // vector.
36801 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
36802 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
36803 return false;
36804 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
36805 if (!DemandedElts[i] || V.getOperand(i).isUndef())
36806 continue;
36807 const APInt &Val = V.getConstantOperandAPInt(i);
36808 if (Val.getBitWidth() > Val.getNumSignBits() &&
36809 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
36810 return true;
36811 }
36812 return false;
36813 };
36814 // For vectors - if we have a constant, then try to sign extend.
36815 // TODO: Handle AND cases.
36816 unsigned ActiveBits = DemandedBits.getActiveBits();
36817 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
36818 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
36819 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
36820 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
36821 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
36822                                    VT.getVectorNumElements());
36823 SDValue NewC =
36824           TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
36825 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
36826 SDValue NewOp =
36827 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
36828 return TLO.CombineTo(Op, NewOp);
36829 }
36830 return false;
36831 }
36832
36833 // Only optimize Ands to prevent shrinking a constant that could be
36834 // matched by movzx.
36835 if (Opcode != ISD::AND)
36836 return false;
36837
36838 // Make sure the RHS really is a constant.
36839 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
36840 if (!C)
36841 return false;
36842
36843 const APInt &Mask = C->getAPIntValue();
36844
36845 // Clear all non-demanded bits initially.
36846 APInt ShrunkMask = Mask & DemandedBits;
36847
36848 // Find the width of the shrunk mask.
36849 unsigned Width = ShrunkMask.getActiveBits();
36850
36851 // If the mask is all 0s there's nothing to do here.
36852 if (Width == 0)
36853 return false;
36854
36855 // Find the next power of 2 width, rounding up to a byte.
36856 Width = llvm::bit_ceil(std::max(Width, 8U));
36857 // Truncate the width to size to handle illegal types.
36858 Width = std::min(Width, EltSize);
36859
36860 // Calculate a possible zero extend mask for this constant.
36861 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
36862
36863 // If we aren't changing the mask, just return true to keep it and prevent
36864 // the caller from optimizing.
36865 if (ZeroExtendMask == Mask)
36866 return true;
36867
36868 // Make sure the new mask can be represented by a combination of mask bits
36869 // and non-demanded bits.
36870 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
36871 return false;
36872
36873 // Replace the constant with the zero extend mask.
36874 SDLoc DL(Op);
36875 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
36876 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
36877 return TLO.CombineTo(Op, NewOp);
36878}
36879
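// PSADBW sums the absolute differences of eight unsigned byte pairs into each
// 64-bit result element, so model it as an unsigned ABD followed by a tree of
// pairwise additions.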
36880 static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
36881                                       KnownBits &Known,
36882 const APInt &DemandedElts,
36883 const SelectionDAG &DAG, unsigned Depth) {
36884 KnownBits Known2;
36885 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
36886 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
36887 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
36888 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
36889 Known = KnownBits::abdu(Known, Known2).zext(16);
36890 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
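  // Three rounds of pairwise additions fold the eight byte differences into a
  // single sum.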
36891 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
36892 Known, Known);
36893 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
36894 Known, Known);
36895 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
36896 Known, Known);
36897 Known = Known.zext(64);
36898}
36899
36900 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
36901 KnownBits &Known,
36902 const APInt &DemandedElts,
36903 const SelectionDAG &DAG,
36904 unsigned Depth) const {
36905 unsigned BitWidth = Known.getBitWidth();
36906 unsigned NumElts = DemandedElts.getBitWidth();
36907 unsigned Opc = Op.getOpcode();
36908 EVT VT = Op.getValueType();
36909 assert((Opc >= ISD::BUILTIN_OP_END ||
36910 Opc == ISD::INTRINSIC_WO_CHAIN ||
36911 Opc == ISD::INTRINSIC_W_CHAIN ||
36912 Opc == ISD::INTRINSIC_VOID) &&
36913 "Should use MaskedValueIsZero if you don't know whether Op"
36914 " is a target node!");
36915
36916 Known.resetAll();
36917 switch (Opc) {
36918 default: break;
36919 case X86ISD::MUL_IMM: {
36920 KnownBits Known2;
36921 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36922 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36923 Known = KnownBits::mul(Known, Known2);
36924 break;
36925 }
36926 case X86ISD::SETCC:
36927 Known.Zero.setBitsFrom(1);
36928 break;
36929 case X86ISD::MOVMSK: {
36930 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
36931 Known.Zero.setBitsFrom(NumLoBits);
36932 break;
36933 }
36934 case X86ISD::PEXTRB:
36935 case X86ISD::PEXTRW: {
36936 SDValue Src = Op.getOperand(0);
36937 EVT SrcVT = Src.getValueType();
36938 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
36939 Op.getConstantOperandVal(1));
36940 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
36941 Known = Known.anyextOrTrunc(BitWidth);
36942 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
36943 break;
36944 }
36945 case X86ISD::VSRAI:
36946 case X86ISD::VSHLI:
36947 case X86ISD::VSRLI: {
36948 unsigned ShAmt = Op.getConstantOperandVal(1);
36949 if (ShAmt >= VT.getScalarSizeInBits()) {
36950 // Out of range logical bit shifts are guaranteed to be zero.
36951 // Out of range arithmetic bit shifts splat the sign bit.
36952 if (Opc != X86ISD::VSRAI) {
36953 Known.setAllZero();
36954 break;
36955 }
36956
36957 ShAmt = VT.getScalarSizeInBits() - 1;
36958 }
36959
36960 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36961 if (Opc == X86ISD::VSHLI) {
36962 Known.Zero <<= ShAmt;
36963 Known.One <<= ShAmt;
36964 // Low bits are known zero.
36965 Known.Zero.setLowBits(ShAmt);
36966 } else if (Opc == X86ISD::VSRLI) {
36967 Known.Zero.lshrInPlace(ShAmt);
36968 Known.One.lshrInPlace(ShAmt);
36969 // High bits are known zero.
36970 Known.Zero.setHighBits(ShAmt);
36971 } else {
36972 Known.Zero.ashrInPlace(ShAmt);
36973 Known.One.ashrInPlace(ShAmt);
36974 }
36975 break;
36976 }
36977 case X86ISD::PACKUS: {
36978 // PACKUS is just a truncation if the upper half is zero.
36979 APInt DemandedLHS, DemandedRHS;
36980 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
36981
36982 Known.One = APInt::getAllOnes(BitWidth * 2);
36983 Known.Zero = APInt::getAllOnes(BitWidth * 2);
36984
36985 KnownBits Known2;
36986 if (!!DemandedLHS) {
36987 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
36988 Known = Known.intersectWith(Known2);
36989 }
36990 if (!!DemandedRHS) {
36991 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
36992 Known = Known.intersectWith(Known2);
36993 }
36994
36995 if (Known.countMinLeadingZeros() < BitWidth)
36996 Known.resetAll();
36997 Known = Known.trunc(BitWidth);
36998 break;
36999 }
37000 case X86ISD::PSHUFB: {
37001 SDValue Src = Op.getOperand(0);
37002 SDValue Idx = Op.getOperand(1);
37003
37004 // If the index vector is never negative (MSB is zero), then all elements
37005 // come from the source vector. This is useful for cases where
37006 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
37007 // below will handle the more common constant shuffle mask case.
37008 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
37009 if (KnownIdx.isNonNegative())
37010 Known = DAG.computeKnownBits(Src, Depth + 1);
37011 break;
37012 }
37013 case X86ISD::VBROADCAST: {
37014 SDValue Src = Op.getOperand(0);
37015 if (!Src.getSimpleValueType().isVector()) {
37016 Known = DAG.computeKnownBits(Src, Depth + 1);
37017 return;
37018 }
37019 break;
37020 }
37021 case X86ISD::AND: {
37022 if (Op.getResNo() == 0) {
37023 KnownBits Known2;
37024 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37025 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37026 Known &= Known2;
37027 }
37028 break;
37029 }
37030 case X86ISD::ANDNP: {
37031 KnownBits Known2;
37032 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37033 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37034
37035 // ANDNP = (~X & Y);
37036 Known.One &= Known2.Zero;
37037 Known.Zero |= Known2.One;
37038 break;
37039 }
37040 case X86ISD::FOR: {
37041 KnownBits Known2;
37042 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37043 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37044
37045 Known |= Known2;
37046 break;
37047 }
37048 case X86ISD::PSADBW: {
37049 SDValue LHS = Op.getOperand(0);
37050 SDValue RHS = Op.getOperand(1);
37051 assert(VT.getScalarType() == MVT::i64 &&
37052 LHS.getValueType() == RHS.getValueType() &&
37053 LHS.getValueType().getScalarType() == MVT::i8 &&
37054 "Unexpected PSADBW types");
37055 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37056 break;
37057 }
37058 case X86ISD::PCMPGT:
37059 case X86ISD::PCMPEQ: {
37060 KnownBits KnownLhs =
37061 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37062 KnownBits KnownRhs =
37063 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37064 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
37065 ? KnownBits::eq(KnownLhs, KnownRhs)
37066 : KnownBits::sgt(KnownLhs, KnownRhs);
37067 if (Res) {
37068 if (*Res)
37069 Known.setAllOnes();
37070 else
37071 Known.setAllZero();
37072 }
37073 break;
37074 }
37075 case X86ISD::PMULUDQ: {
37076 KnownBits Known2;
37077 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37078 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37079
37080 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
37081 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
37082 Known = KnownBits::mul(Known, Known2);
37083 break;
37084 }
37085 case X86ISD::CMOV: {
37086 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
37087 // If we don't know any bits, early out.
37088 if (Known.isUnknown())
37089 break;
37090 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
37091
37092 // Only known if known in both the LHS and RHS.
37093 Known = Known.intersectWith(Known2);
37094 break;
37095 }
37096 case X86ISD::BEXTR:
37097 case X86ISD::BEXTRI: {
37098 SDValue Op0 = Op.getOperand(0);
37099 SDValue Op1 = Op.getOperand(1);
37100
37101 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
37102 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
37103 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
37104
37105 // If the length is 0, the result is 0.
37106 if (Length == 0) {
37107 Known.setAllZero();
37108 break;
37109 }
37110
37111 if ((Shift + Length) <= BitWidth) {
37112 Known = DAG.computeKnownBits(Op0, Depth + 1);
37113 Known = Known.extractBits(Length, Shift);
37114 Known = Known.zextOrTrunc(BitWidth);
37115 }
37116 }
37117 break;
37118 }
37119 case X86ISD::PDEP: {
37120 KnownBits Known2;
37121 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37122 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37123 // Zeros are retained from the mask operand. But not ones.
37124 Known.One.clearAllBits();
37125 // The result will have at least as many trailing zeros as the non-mask
37126 // operand since bits can only map to the same or higher bit position.
37127 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
37128 break;
37129 }
37130 case X86ISD::PEXT: {
37131 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37132 // The result has as many leading zeros as the number of zeroes in the mask.
37133 unsigned Count = Known.Zero.popcount();
37134 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
37135 Known.One.clearAllBits();
37136 break;
37137 }
37138 case X86ISD::VTRUNC:
37139 case X86ISD::VTRUNCS:
37140 case X86ISD::VTRUNCUS:
37141 case X86ISD::CVTSI2P:
37142 case X86ISD::CVTUI2P:
37143 case X86ISD::CVTP2SI:
37144 case X86ISD::CVTP2UI:
37145 case X86ISD::MCVTP2SI:
37146 case X86ISD::MCVTP2UI:
37147 case X86ISD::CVTTP2SI:
37148 case X86ISD::CVTTP2UI:
37149 case X86ISD::MCVTTP2SI:
37150 case X86ISD::MCVTTP2UI:
37151 case X86ISD::MCVTSI2P:
37152 case X86ISD::MCVTUI2P:
37153 case X86ISD::VFPROUND:
37154 case X86ISD::VMFPROUND:
37155 case X86ISD::CVTPS2PH:
37156 case X86ISD::MCVTPS2PH: {
37157 // Truncations/Conversions - upper elements are known zero.
37158 EVT SrcVT = Op.getOperand(0).getValueType();
37159 if (SrcVT.isVector()) {
37160 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37161 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
37162 Known.setAllZero();
37163 }
37164 break;
37165 }
37166   case X86ISD::STRICT_CVTTP2SI:
37167   case X86ISD::STRICT_CVTTP2UI:
37168   case X86ISD::STRICT_CVTSI2P:
37169   case X86ISD::STRICT_CVTUI2P:
37170   case X86ISD::STRICT_VFPROUND:
37171   case X86ISD::STRICT_CVTPS2PH: {
37172 // Strict Conversions - upper elements are known zero.
37173 EVT SrcVT = Op.getOperand(1).getValueType();
37174 if (SrcVT.isVector()) {
37175 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37176 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
37177 Known.setAllZero();
37178 }
37179 break;
37180 }
37181 case X86ISD::MOVQ2DQ: {
37182 // Move from MMX to XMM. Upper half of XMM should be 0.
37183 if (DemandedElts.countr_zero() >= (NumElts / 2))
37184 Known.setAllZero();
37185 break;
37186 }
37187   case X86ISD::VBROADCAST_LOAD: {
37188 APInt UndefElts;
37189 SmallVector<APInt, 16> EltBits;
37190 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
37191 /*AllowWholeUndefs*/ false,
37192 /*AllowPartialUndefs*/ false)) {
37193 Known.Zero.setAllBits();
37194 Known.One.setAllBits();
37195 for (unsigned I = 0; I != NumElts; ++I) {
37196 if (!DemandedElts[I])
37197 continue;
37198 if (UndefElts[I]) {
37199 Known.resetAll();
37200 break;
37201 }
37202 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
37203 Known = Known.intersectWith(Known2);
37204 }
37205 return;
37206 }
37207 break;
37208 }
37209   case ISD::INTRINSIC_WO_CHAIN: {
37210 switch (Op->getConstantOperandVal(0)) {
37211 case Intrinsic::x86_sse2_psad_bw:
37212 case Intrinsic::x86_avx2_psad_bw:
37213 case Intrinsic::x86_avx512_psad_bw_512: {
37214 SDValue LHS = Op.getOperand(1);
37215 SDValue RHS = Op.getOperand(2);
37216 assert(VT.getScalarType() == MVT::i64 &&
37217 LHS.getValueType() == RHS.getValueType() &&
37218 LHS.getValueType().getScalarType() == MVT::i8 &&
37219 "Unexpected PSADBW types");
37220 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37221 break;
37222 }
37223 }
37224 break;
37225 }
37226 }
37227
37228 // Handle target shuffles.
37229 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
37230 if (isTargetShuffle(Opc)) {
37231     SmallVector<int, 64> Mask;
37232     SmallVector<SDValue, 2> Ops;
37233 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
37234 unsigned NumOps = Ops.size();
37235 unsigned NumElts = VT.getVectorNumElements();
37236 if (Mask.size() == NumElts) {
37237 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
37238 Known.Zero.setAllBits(); Known.One.setAllBits();
37239 for (unsigned i = 0; i != NumElts; ++i) {
37240 if (!DemandedElts[i])
37241 continue;
37242 int M = Mask[i];
37243 if (M == SM_SentinelUndef) {
37244 // For UNDEF elements, we don't know anything about the common state
37245 // of the shuffle result.
37246 Known.resetAll();
37247 break;
37248 }
37249 if (M == SM_SentinelZero) {
37250 Known.One.clearAllBits();
37251 continue;
37252 }
37253 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
37254 "Shuffle index out of range");
37255
37256 unsigned OpIdx = (unsigned)M / NumElts;
37257 unsigned EltIdx = (unsigned)M % NumElts;
37258 if (Ops[OpIdx].getValueType() != VT) {
37259 // TODO - handle target shuffle ops with different value types.
37260 Known.resetAll();
37261 break;
37262 }
37263 DemandedOps[OpIdx].setBit(EltIdx);
37264 }
37265 // Known bits are the values that are shared by every demanded element.
37266 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
37267 if (!DemandedOps[i])
37268 continue;
37269 KnownBits Known2 =
37270 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
37271 Known = Known.intersectWith(Known2);
37272 }
37273 }
37274 }
37275 }
37276}
37277
37278 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
37279 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
37280 unsigned Depth) const {
37281 EVT VT = Op.getValueType();
37282 unsigned VTBits = VT.getScalarSizeInBits();
37283 unsigned Opcode = Op.getOpcode();
37284 switch (Opcode) {
37285   case X86ISD::SETCC_CARRY:
37286 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
37287 return VTBits;
37288
37289 case X86ISD::VTRUNC: {
37290 SDValue Src = Op.getOperand(0);
37291 MVT SrcVT = Src.getSimpleValueType();
37292 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
37293 assert(VTBits < NumSrcBits && "Illegal truncation input type");
37294 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
37295 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
37296 if (Tmp > (NumSrcBits - VTBits))
37297 return Tmp - (NumSrcBits - VTBits);
37298 return 1;
37299 }
37300
37301 case X86ISD::PACKSS: {
37302 // PACKSS is just a truncation if the sign bits extend to the packed size.
37303 APInt DemandedLHS, DemandedRHS;
37304 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
37305 DemandedRHS);
37306
37307 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
37308 // patterns often used to compact vXi64 allsignbit patterns.
37309 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
37310       SDValue BC = peekThroughBitcasts(V);
37311 if (BC.getOpcode() == X86ISD::PACKSS &&
37312 BC.getScalarValueSizeInBits() == 16 &&
37313 V.getScalarValueSizeInBits() == 32) {
37314         SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
37315         SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
37316 if (BC0.getScalarValueSizeInBits() == 64 &&
37317 BC1.getScalarValueSizeInBits() == 64 &&
37318 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
37319 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
37320 return 32;
37321 }
37322 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
37323 };
37324
37325 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
37326 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
37327 if (!!DemandedLHS)
37328 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
37329 if (!!DemandedRHS)
37330 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
37331 unsigned Tmp = std::min(Tmp0, Tmp1);
37332 if (Tmp > (SrcBits - VTBits))
37333 return Tmp - (SrcBits - VTBits);
37334 return 1;
37335 }
37336
37337 case X86ISD::VBROADCAST: {
37338 SDValue Src = Op.getOperand(0);
37339 if (!Src.getSimpleValueType().isVector())
37340 return DAG.ComputeNumSignBits(Src, Depth + 1);
37341 break;
37342 }
37343
37344 case X86ISD::VSHLI: {
37345 SDValue Src = Op.getOperand(0);
37346 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
37347 if (ShiftVal.uge(VTBits))
37348 return VTBits; // Shifted all bits out --> zero.
37349 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
37350 if (ShiftVal.uge(Tmp))
37351 return 1; // Shifted all sign bits out --> unknown.
37352 return Tmp - ShiftVal.getZExtValue();
37353 }
37354
37355 case X86ISD::VSRAI: {
37356 SDValue Src = Op.getOperand(0);
37357 APInt ShiftVal = Op.getConstantOperandAPInt(1);
37358 if (ShiftVal.uge(VTBits - 1))
37359 return VTBits; // Sign splat.
37360 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
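    // Shifting right arithmetically adds ShiftVal copies of the sign bit,
    // capped at the element width below.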
37361 ShiftVal += Tmp;
37362 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
37363 }
37364
37365 case X86ISD::FSETCC:
37366 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
37367 if (VT == MVT::f32 || VT == MVT::f64 ||
37368 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
37369 return VTBits;
37370 break;
37371
37372 case X86ISD::PCMPGT:
37373 case X86ISD::PCMPEQ:
37374 case X86ISD::CMPP:
37375 case X86ISD::VPCOM:
37376 case X86ISD::VPCOMU:
37377 // Vector compares return zero/all-bits result values.
37378 return VTBits;
37379
37380 case X86ISD::ANDNP: {
37381 unsigned Tmp0 =
37382 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
37383 if (Tmp0 == 1) return 1; // Early out.
37384 unsigned Tmp1 =
37385 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
37386 return std::min(Tmp0, Tmp1);
37387 }
37388
37389 case X86ISD::CMOV: {
37390 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
37391 if (Tmp0 == 1) return 1; // Early out.
37392 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
37393 return std::min(Tmp0, Tmp1);
37394 }
37395 }
37396
37397 // Handle target shuffles.
37398 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
37399 if (isTargetShuffle(Opcode)) {
37400     SmallVector<int, 64> Mask;
37401     SmallVector<SDValue, 2> Ops;
37402 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
37403 unsigned NumOps = Ops.size();
37404 unsigned NumElts = VT.getVectorNumElements();
37405 if (Mask.size() == NumElts) {
37406 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
37407 for (unsigned i = 0; i != NumElts; ++i) {
37408 if (!DemandedElts[i])
37409 continue;
37410 int M = Mask[i];
37411 if (M == SM_SentinelUndef) {
37412 // For UNDEF elements, we don't know anything about the common state
37413 // of the shuffle result.
37414 return 1;
37415 } else if (M == SM_SentinelZero) {
37416 // Zero = all sign bits.
37417 continue;
37418 }
37419 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
37420 "Shuffle index out of range");
37421
37422 unsigned OpIdx = (unsigned)M / NumElts;
37423 unsigned EltIdx = (unsigned)M % NumElts;
37424 if (Ops[OpIdx].getValueType() != VT) {
37425 // TODO - handle target shuffle ops with different value types.
37426 return 1;
37427 }
37428 DemandedOps[OpIdx].setBit(EltIdx);
37429 }
37430 unsigned Tmp0 = VTBits;
37431 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
37432 if (!DemandedOps[i])
37433 continue;
37434 unsigned Tmp1 =
37435 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
37436 Tmp0 = std::min(Tmp0, Tmp1);
37437 }
37438 return Tmp0;
37439 }
37440 }
37441 }
37442
37443 // Fallback case.
37444 return 1;
37445}
37446
37447 static SDValue unwrapAddress(SDValue N) {
37448 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
37449 return N->getOperand(0);
37450 return N;
37451}
37452
37453// Helper to look for a normal load that can be narrowed into a vzload with the
37454// specified VT and memory VT. Returns SDValue() on failure.
37455 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
37456 SelectionDAG &DAG) {
37457 // Can't if the load is volatile or atomic.
37458 if (!LN->isSimple())
37459 return SDValue();
37460
37461 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37462 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37463 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
37464 LN->getPointerInfo(), LN->getOriginalAlign(),
37465 LN->getMemOperand()->getFlags());
37466}
37467
37468// Attempt to match a combined shuffle mask against supported unary shuffle
37469// instructions.
37470// TODO: Investigate sharing more of this with shuffle lowering.
37471static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
37472 bool AllowFloatDomain, bool AllowIntDomain,
37473 SDValue V1, const SelectionDAG &DAG,
37474 const X86Subtarget &Subtarget, unsigned &Shuffle,
37475 MVT &SrcVT, MVT &DstVT) {
37476 unsigned NumMaskElts = Mask.size();
37477 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
37478
37479 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
37480 if (Mask[0] == 0 &&
37481 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
37482 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
37483         (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37484 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
37485 Shuffle = X86ISD::VZEXT_MOVL;
37486 if (MaskEltSize == 16)
37487 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
37488 else
37489 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
37490 return true;
37491 }
37492 }
37493
37494 // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
37495 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
37496 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
37497 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
37498 unsigned MaxScale = 64 / MaskEltSize;
37499 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
37500 DAG.ComputeNumSignBits(V1) == MaskEltSize;
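    // For each power-of-2 scale, result element i must come from source
    // element i, and the Scale-1 mask entries after it must be undef (anyext),
    // zero (zext), or repeats of element i (sext of an all-signbits source).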
37501 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
37502 bool MatchAny = true;
37503 bool MatchZero = true;
37504 bool MatchSign = UseSign;
37505 unsigned NumDstElts = NumMaskElts / Scale;
37506 for (unsigned i = 0;
37507 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
37508 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
37509 MatchAny = MatchSign = MatchZero = false;
37510 break;
37511 }
37512 unsigned Pos = (i * Scale) + 1;
37513 unsigned Len = Scale - 1;
37514 MatchAny &= isUndefInRange(Mask, Pos, Len);
37515 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
37516 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
37517 }
37518 if (MatchAny || MatchSign || MatchZero) {
37519 assert((MatchSign || MatchZero) &&
37520 "Failed to match sext/zext but matched aext?");
37521 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
37522 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
37523 : MVT::getIntegerVT(MaskEltSize);
37524 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
37525
37526 Shuffle = unsigned(
37527 MatchAny ? ISD::ANY_EXTEND
37528 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
37529 if (SrcVT.getVectorNumElements() != NumDstElts)
37530 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
37531
37532 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
37533 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
37534 return true;
37535 }
37536 }
37537 }
37538
37539 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
37540 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
37541 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
37542 isUndefOrEqual(Mask[0], 0) &&
37543 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
37544 Shuffle = X86ISD::VZEXT_MOVL;
37545 if (MaskEltSize == 16)
37546 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
37547 else
37548 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
37549 return true;
37550 }
37551
37552 // Check if we have SSE3 which will let us use MOVDDUP etc. The
37553   // instructions are no slower than UNPCKLPD but have the option to
37554 // fold the input operand into even an unaligned memory load.
37555 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
37556 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
37557 Shuffle = X86ISD::MOVDDUP;
37558 SrcVT = DstVT = MVT::v2f64;
37559 return true;
37560 }
37561 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37562 Shuffle = X86ISD::MOVSLDUP;
37563 SrcVT = DstVT = MVT::v4f32;
37564 return true;
37565 }
37566 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
37567 Shuffle = X86ISD::MOVSHDUP;
37568 SrcVT = DstVT = MVT::v4f32;
37569 return true;
37570 }
37571 }
37572
37573 if (MaskVT.is256BitVector() && AllowFloatDomain) {
37574 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
37575 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37576 Shuffle = X86ISD::MOVDDUP;
37577 SrcVT = DstVT = MVT::v4f64;
37578 return true;
37579 }
37580 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37581 V1)) {
37582 Shuffle = X86ISD::MOVSLDUP;
37583 SrcVT = DstVT = MVT::v8f32;
37584 return true;
37585 }
37586 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
37587 V1)) {
37588 Shuffle = X86ISD::MOVSHDUP;
37589 SrcVT = DstVT = MVT::v8f32;
37590 return true;
37591 }
37592 }
37593
37594 if (MaskVT.is512BitVector() && AllowFloatDomain) {
37595 assert(Subtarget.hasAVX512() &&
37596 "AVX512 required for 512-bit vector shuffles");
37597 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37598 V1)) {
37599 Shuffle = X86ISD::MOVDDUP;
37600 SrcVT = DstVT = MVT::v8f64;
37601 return true;
37602 }
37603     if (isTargetShuffleEquivalent(
37604 MaskVT, Mask,
37605 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
37606 Shuffle = X86ISD::MOVSLDUP;
37607 SrcVT = DstVT = MVT::v16f32;
37608 return true;
37609 }
37610     if (isTargetShuffleEquivalent(
37611 MaskVT, Mask,
37612 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
37613 Shuffle = X86ISD::MOVSHDUP;
37614 SrcVT = DstVT = MVT::v16f32;
37615 return true;
37616 }
37617 }
37618
37619 return false;
37620}
37621
37622// Attempt to match a combined shuffle mask against supported unary immediate
37623// permute instructions.
37624// TODO: Investigate sharing more of this with shuffle lowering.
37625 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
37626 const APInt &Zeroable,
37627 bool AllowFloatDomain, bool AllowIntDomain,
37628 const SelectionDAG &DAG,
37629 const X86Subtarget &Subtarget,
37630 unsigned &Shuffle, MVT &ShuffleVT,
37631 unsigned &PermuteImm) {
37632 unsigned NumMaskElts = Mask.size();
37633 unsigned InputSizeInBits = MaskVT.getSizeInBits();
37634 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
37635 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
37636 bool ContainsZeros = isAnyZero(Mask);
37637
37638   // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
37639 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
37640 // Check for lane crossing permutes.
37641 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
37642 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
37643 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
37644 Shuffle = X86ISD::VPERMI;
37645 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
37646 PermuteImm = getV4X86ShuffleImm(Mask);
37647 return true;
37648 }
37649 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
37650 SmallVector<int, 4> RepeatedMask;
37651 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
37652 Shuffle = X86ISD::VPERMI;
37653 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
37654 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
37655 return true;
37656 }
37657 }
37658 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
37659 // VPERMILPD can permute with a non-repeating shuffle.
37660 Shuffle = X86ISD::VPERMILPI;
37661 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
37662 PermuteImm = 0;
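      // Each immediate bit selects the low or high f64 within that element's
      // 128-bit lane.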
37663 for (int i = 0, e = Mask.size(); i != e; ++i) {
37664 int M = Mask[i];
37665 if (M == SM_SentinelUndef)
37666 continue;
37667 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
37668 PermuteImm |= (M & 1) << i;
37669 }
37670 return true;
37671 }
37672 }
37673
37674   // We are checking for a shuffle match or a shift match. Loop twice so we
37675   // can choose which to try to match first, depending on target preference.
37676 for (unsigned Order = 0; Order < 2; ++Order) {
37677 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
37678 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
37679 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
37680 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
37681 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
37682 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
37683 SmallVector<int, 4> RepeatedMask;
37684 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
37685 // Narrow the repeated mask to create 32-bit element permutes.
37686 SmallVector<int, 4> WordMask = RepeatedMask;
37687 if (MaskScalarSizeInBits == 64)
37688 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
37689
37690 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
37691 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
37692 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
37693 PermuteImm = getV4X86ShuffleImm(WordMask);
37694 return true;
37695 }
37696 }
37697
37698 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
37699 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
37700 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37701 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37702 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
37703 SmallVector<int, 4> RepeatedMask;
37704 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
37705 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
37706 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
37707
37708 // PSHUFLW: permute lower 4 elements only.
37709 if (isUndefOrInRange(LoMask, 0, 4) &&
37710 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
37711 Shuffle = X86ISD::PSHUFLW;
37712 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
37713 PermuteImm = getV4X86ShuffleImm(LoMask);
37714 return true;
37715 }
37716
37717 // PSHUFHW: permute upper 4 elements only.
37718 if (isUndefOrInRange(HiMask, 4, 8) &&
37719 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
37720 // Offset the HiMask so that we can create the shuffle immediate.
37721 int OffsetHiMask[4];
37722 for (int i = 0; i != 4; ++i)
37723 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
37724
37725 Shuffle = X86ISD::PSHUFHW;
37726 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
37727 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
37728 return true;
37729 }
37730 }
37731 }
37732 } else {
37733 // Attempt to match against bit rotates.
37734 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
37735 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
37736 Subtarget.hasAVX512())) {
37737 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
37738 Subtarget, Mask);
37739 if (0 < RotateAmt) {
37740 Shuffle = X86ISD::VROTLI;
37741 PermuteImm = (unsigned)RotateAmt;
37742 return true;
37743 }
37744 }
37745 }
37746 // Attempt to match against byte/bit shifts.
37747 if (AllowIntDomain &&
37748 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37749 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37750 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37751 int ShiftAmt =
37752 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
37753 Zeroable, Subtarget);
37754 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
37755 32 <= ShuffleVT.getScalarSizeInBits())) {
37756         // Byte shifts can be slower, so only match them on the second attempt.
37757 if (Order == 0 &&
37758 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
37759 continue;
37760
37761 PermuteImm = (unsigned)ShiftAmt;
37762 return true;
37763 }
37764
37765 }
37766 }
37767
37768 return false;
37769}
37770
37771// Attempt to match a combined unary shuffle mask against supported binary
37772// shuffle instructions.
37773// TODO: Investigate sharing more of this with shuffle lowering.
37774static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
37775 bool AllowFloatDomain, bool AllowIntDomain,
37776 SDValue &V1, SDValue &V2, const SDLoc &DL,
37777 SelectionDAG &DAG, const X86Subtarget &Subtarget,
37778 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
37779 bool IsUnary) {
37780 unsigned NumMaskElts = Mask.size();
37781 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37782 unsigned SizeInBits = MaskVT.getSizeInBits();
37783
37784 if (MaskVT.is128BitVector()) {
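    // {0,0} and {1,1} map to MOVLHPS/MOVHLPS (UNPCKL/UNPCKH with SSE2);
    // single low-element inserts map to MOVSD/MOVSS/MOVSH.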
37785 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
37786 AllowFloatDomain) {
37787 V2 = V1;
37788 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
37789 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
37790 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37791 return true;
37792 }
37793 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
37794 AllowFloatDomain) {
37795 V2 = V1;
37796 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
37797 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37798 return true;
37799 }
37800 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
37801 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
37802 std::swap(V1, V2);
37803 Shuffle = X86ISD::MOVSD;
37804 SrcVT = DstVT = MVT::v2f64;
37805 return true;
37806 }
37807 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
37808 (AllowFloatDomain || !Subtarget.hasSSE41())) {
37809 Shuffle = X86ISD::MOVSS;
37810 SrcVT = DstVT = MVT::v4f32;
37811 return true;
37812 }
37813 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
37814 DAG) &&
37815 Subtarget.hasFP16()) {
37816 Shuffle = X86ISD::MOVSH;
37817 SrcVT = DstVT = MVT::v8f16;
37818 return true;
37819 }
37820 }
37821
37822 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
37823 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
37824 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
37825 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
37826 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
37827 Subtarget)) {
37828 DstVT = MaskVT;
37829 return true;
37830 }
37831 }
37832 // TODO: Can we handle this inside matchShuffleWithPACK?
37833 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
37834 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
37835 V1.getScalarValueSizeInBits() == 64 &&
37836 V2.getScalarValueSizeInBits() == 64) {
37837     // Use (SSE41) PACKUSDW if the leading zero bits reach down to the lowest 16 bits.
37838 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
37839 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
37840 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
37841 SrcVT = MVT::v4i32;
37842 DstVT = MVT::v8i16;
37843 Shuffle = X86ISD::PACKUS;
37844 return true;
37845 }
37846     // Use PACKUSWB if the leading zero bits reach down to the lowest 8 bits.
37847 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
37848 SrcVT = MVT::v8i16;
37849 DstVT = MVT::v16i8;
37850 Shuffle = X86ISD::PACKUS;
37851 return true;
37852 }
37853     // Use PACKSSDW if the sign bits extend down to the lowest 16 bits.
37854 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
37855 SrcVT = MVT::v4i32;
37856 DstVT = MVT::v8i16;
37857 Shuffle = X86ISD::PACKSS;
37858 return true;
37859 }
37860 }
37861
37862 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
37863 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
37864 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37865 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
37866 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37867 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
37868 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
37869 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
37870 Subtarget)) {
37871 SrcVT = DstVT = MaskVT;
37872 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
37873 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
37874 return true;
37875 }
37876 }
37877
37878 // Attempt to match against an OR if we're performing a blend shuffle and the
37879 // non-blended source element is zero in each case.
37880 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
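// e.g. for a v4i32 blend mask {0, 5, 2, 7}, elements 0 and 2 come from V1
// and elements 1 and 3 from V2; if V1 is known zero in lanes 1,3 and V2 is
// known zero in lanes 0,2 then the blend is simply (V1 | V2).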
37881 if (SizeInBits == V1.getValueSizeInBits() &&
37882 SizeInBits == V2.getValueSizeInBits() &&
37883 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
37884 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
37885 bool IsBlend = true;
37886 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
37887 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
37888 unsigned Scale1 = NumV1Elts / NumMaskElts;
37889 unsigned Scale2 = NumV2Elts / NumMaskElts;
37890 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
37891 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
37892 for (unsigned i = 0; i != NumMaskElts; ++i) {
37893 int M = Mask[i];
37894 if (M == SM_SentinelUndef)
37895 continue;
37896 if (M == SM_SentinelZero) {
37897 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37898 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37899 continue;
37900 }
37901 if (M == (int)i) {
37902 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37903 continue;
37904 }
37905 if (M == (int)(i + NumMaskElts)) {
37906 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37907 continue;
37908 }
37909 IsBlend = false;
37910 break;
37911 }
37912 if (IsBlend) {
37913 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
37914 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
37915 Shuffle = ISD::OR;
37916 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37917 return true;
37918 }
37919 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
37920 // FIXME: handle mismatched sizes?
37921 // TODO: investigate if `ISD::OR` handling in
37922 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
37923 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
37924 unsigned NumElts = V.getValueType().getVectorNumElements();
37925 KnownBits Known(NumElts);
37926 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
37927 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
37928 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
37929 if (PeepholeKnown.isZero())
37930 Known.Zero.setBit(EltIdx);
37931 if (PeepholeKnown.isAllOnes())
37932 Known.One.setBit(EltIdx);
37933 }
37934 return Known;
37935 };
37936
37937 KnownBits V1Known = computeKnownBitsElementWise(V1);
37938 KnownBits V2Known = computeKnownBitsElementWise(V2);
37939
37940 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
37941 int M = Mask[i];
37942 if (M == SM_SentinelUndef)
37943 continue;
37944 if (M == SM_SentinelZero) {
37945 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
37946 continue;
37947 }
37948 if (M == (int)i) {
37949 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
37950 continue;
37951 }
37952 if (M == (int)(i + NumMaskElts)) {
37953 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
37954 continue;
37955 }
37956 llvm_unreachable("will not get here.");
37957 }
37958 if (IsBlend) {
37959 Shuffle = ISD::OR;
37960 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37961 return true;
37962 }
37963 }
37964 }
37965 }
37966
37967 return false;
37968}
37969
37970 static bool matchBinaryPermuteShuffle(
37971 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
37972 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
37973 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
37974 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
37975 unsigned NumMaskElts = Mask.size();
37976 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37977
37978 // Attempt to match against VALIGND/VALIGNQ rotate.
37979 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
37980 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
37981 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
37982 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37983 if (!isAnyZero(Mask)) {
37984 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
37985 if (0 < Rotation) {
37986 Shuffle = X86ISD::VALIGN;
37987 if (EltSizeInBits == 64)
37988 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
37989 else
37990 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
37991 PermuteImm = Rotation;
37992 return true;
37993 }
37994 }
37995 }
37996
37997 // Attempt to match against PALIGNR byte rotate.
37998 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
37999 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38000 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
38001 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
38002 if (0 < ByteRotation) {
38003 Shuffle = X86ISD::PALIGNR;
38004 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
38005 PermuteImm = ByteRotation;
38006 return true;
38007 }
38008 }
38009
38010 // Attempt to combine to X86ISD::BLENDI.
38011 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
38012 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
38013 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
38014 uint64_t BlendMask = 0;
38015 bool ForceV1Zero = false, ForceV2Zero = false;
38016 SmallVector<int, 8> TargetMask(Mask);
38017 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
38018 ForceV2Zero, BlendMask)) {
38019 if (MaskVT == MVT::v16i16) {
38020 // We can only use v16i16 PBLENDW if the lanes are repeated.
38021 SmallVector<int, 8> RepeatedMask;
38022 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
38023 RepeatedMask)) {
38024 assert(RepeatedMask.size() == 8 &&
38025 "Repeated mask size doesn't match!");
38026 PermuteImm = 0;
38027 for (int i = 0; i < 8; ++i)
38028 if (RepeatedMask[i] >= 8)
38029 PermuteImm |= 1 << i;
38030 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38031 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38032 Shuffle = X86ISD::BLENDI;
38033 ShuffleVT = MaskVT;
38034 return true;
38035 }
38036 } else {
38037 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38038 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38039 PermuteImm = (unsigned)BlendMask;
38040 Shuffle = X86ISD::BLENDI;
38041 ShuffleVT = MaskVT;
38042 return true;
38043 }
38044 }
38045 }
38046
38047 // Attempt to combine to INSERTPS, but only if it has elements that need to
38048 // be set to zero.
38049 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38050 MaskVT.is128BitVector() && isAnyZero(Mask) &&
38051 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38052 Shuffle = X86ISD::INSERTPS;
38053 ShuffleVT = MVT::v4f32;
38054 return true;
38055 }
38056
38057 // Attempt to combine to SHUFPD.
38058 if (AllowFloatDomain && EltSizeInBits == 64 &&
38059 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38060 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38061 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38062 bool ForceV1Zero = false, ForceV2Zero = false;
38063 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
38064 PermuteImm, Mask, Zeroable)) {
38065 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38066 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38067 Shuffle = X86ISD::SHUFP;
38068 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
38069 return true;
38070 }
38071 }
38072
38073 // Attempt to combine to SHUFPS.
38074 if (AllowFloatDomain && EltSizeInBits == 32 &&
38075 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
38076 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38077 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38078 SmallVector<int, 4> RepeatedMask;
38079 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
38080 // Match each half of the repeated mask to determine whether it just
38081 // references one of the vectors, is zeroable, or is entirely undef.
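// e.g. a repeated v4f32 mask {0,1,4,5} matches Lo = V1 with selectors {0,1}
// and Hi = V2 with selectors {0,1}, producing SHUFPS(V1, V2) with the
// immediate 0b01'00'01'00 (0x44).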
38082 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
38083 int M0 = RepeatedMask[Offset];
38084 int M1 = RepeatedMask[Offset + 1];
38085
38086 if (isUndefInRange(RepeatedMask, Offset, 2)) {
38087 return DAG.getUNDEF(MaskVT);
38088 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
38089 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
38090 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
38091 return getZeroVector(MaskVT, Subtarget, DAG, DL);
38092 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
38093 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38094 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38095 return V1;
38096 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
38097 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38098 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38099 return V2;
38100 }
38101
38102 return SDValue();
38103 };
38104
38105 int ShufMask[4] = {-1, -1, -1, -1};
38106 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
38107 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
38108
38109 if (Lo && Hi) {
38110 V1 = Lo;
38111 V2 = Hi;
38112 Shuffle = X86ISD::SHUFP;
38113 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
38114 PermuteImm = getV4X86ShuffleImm(ShufMask);
38115 return true;
38116 }
38117 }
38118 }
38119
38120 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
38121 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38122 MaskVT.is128BitVector() &&
38123 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38124 Shuffle = X86ISD::INSERTPS;
38125 ShuffleVT = MVT::v4f32;
38126 return true;
38127 }
38128
38129 return false;
38130}
38131
38132 static SDValue combineX86ShuffleChainWithExtract(
38133 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38134 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38135 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38136 const X86Subtarget &Subtarget);
38137
38138/// Combine an arbitrary chain of shuffles into a single instruction if
38139/// possible.
38140///
38141/// This is the leaf of the recursive combine below. When we have found some
38142/// chain of single-use x86 shuffle instructions and accumulated the combined
38143/// shuffle mask represented by them, this will try to pattern match that mask
38144/// into either a single instruction if there is a special purpose instruction
38145/// for this operation, or into a PSHUFB instruction which is a fully general
38146/// instruction but should only be used to replace chains over a certain depth.
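/// For example, two chained PSHUFDs with immediate 0xB1 each apply the mask
/// {1,0,3,2}; the accumulated mask is the identity {0,1,2,3} and the chain
/// collapses to its input, while a unary v4f32 mask of {0,0,1,1} is matched
/// to a single UNPCKLPS.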
38147 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
38148 ArrayRef<int> BaseMask, int Depth,
38149 bool HasVariableMask,
38150 bool AllowVariableCrossLaneMask,
38151 bool AllowVariablePerLaneMask,
38152 SelectionDAG &DAG,
38153 const X86Subtarget &Subtarget) {
38154 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
38155 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
38156 "Unexpected number of shuffle inputs!");
38157
38158 SDLoc DL(Root);
38159 MVT RootVT = Root.getSimpleValueType();
38160 unsigned RootSizeInBits = RootVT.getSizeInBits();
38161 unsigned NumRootElts = RootVT.getVectorNumElements();
38162
38163 // Canonicalize shuffle input op to the requested type.
38164 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
38165 if (VT.getSizeInBits() > Op.getValueSizeInBits())
38166 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
38167 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
38168 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
38169 return DAG.getBitcast(VT, Op);
38170 };
38171
38172 // Find the inputs that enter the chain. Note that multiple uses are OK
38173 // here, we're not going to remove the operands we find.
38174 bool UnaryShuffle = (Inputs.size() == 1);
38175 SDValue V1 = peekThroughBitcasts(Inputs[0]);
38176 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
38177 : peekThroughBitcasts(Inputs[1]));
38178
38179 MVT VT1 = V1.getSimpleValueType();
38180 MVT VT2 = V2.getSimpleValueType();
38181 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
38182 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
38183
38184 SDValue Res;
38185
38186 unsigned NumBaseMaskElts = BaseMask.size();
38187 if (NumBaseMaskElts == 1) {
38188 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
38189 return CanonicalizeShuffleInput(RootVT, V1);
38190 }
38191
38192 bool OptForSize = DAG.shouldOptForSize();
38193 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
38194 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
38195 (RootVT.isFloatingPoint() && Depth >= 1) ||
38196 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
38197
38198 // Don't combine if we are an AVX512/EVEX target and the mask element size
38199 // is different from the root element size - this would prevent writemasks
38200 // from being reused.
38201 bool IsMaskedShuffle = false;
38202 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
38203 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
38204 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
38205 IsMaskedShuffle = true;
38206 }
38207 }
38208
38209 // If we are shuffling a splat (and not introducing zeros) then we can just
38210 // use it directly. This works for smaller elements as well, since they
38211 // already repeat across each mask element.
38212 if (UnaryShuffle && !isAnyZero(BaseMask) &&
38213 V1.getValueSizeInBits() >= RootSizeInBits &&
38214 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
38215 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
38216 return CanonicalizeShuffleInput(RootVT, V1);
38217 }
38218
38219 SmallVector<int, 64> Mask(BaseMask);
38220
38221 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
38222 // etc. can be simplified.
38223 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
38224 SmallVector<int> ScaledMask, IdentityMask;
38225 unsigned NumElts = VT1.getVectorNumElements();
38226 if (Mask.size() <= NumElts &&
38227 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
38228 for (unsigned i = 0; i != NumElts; ++i)
38229 IdentityMask.push_back(i);
38230 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
38231 V2))
38232 return CanonicalizeShuffleInput(RootVT, V1);
38233 }
38234 }
38235
38236 // Handle 128/256-bit lane shuffles of 512-bit vectors.
38237 if (RootVT.is512BitVector() &&
38238 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
38239 // If the upper subvectors are zeroable, then an extract+insert is more
38240 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
38241 // to zero the upper subvectors.
38242 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
38243 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38244 return SDValue(); // Nothing to do!
38245 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
38246 "Unexpected lane shuffle");
38247 Res = CanonicalizeShuffleInput(RootVT, V1);
38248 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
38249 bool UseZero = isAnyZero(Mask);
38250 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
38251 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
38252 }
38253
38254 // Narrow shuffle mask to v4x128.
38255 SmallVector<int, 4> ScaledMask;
38256 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
38257 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
38258
38259 // Try to lower to vshuf64x2/vshuf32x4.
38260 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
38261 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
38262 SelectionDAG &DAG) {
38263 int PermMask[4] = {-1, -1, -1, -1};
38264 // Ensure elements came from the same Op.
38265 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
38266 for (int i = 0; i < 4; ++i) {
38267 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
38268 if (ScaledMask[i] < 0)
38269 continue;
38270
38271 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
38272 unsigned OpIndex = i / 2;
38273 if (Ops[OpIndex].isUndef())
38274 Ops[OpIndex] = Op;
38275 else if (Ops[OpIndex] != Op)
38276 return SDValue();
38277
38278 PermMask[i] = ScaledMask[i] % 4;
38279 }
38280
38281 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
38282 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
38283 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
38284 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
38285 };
38286
38287 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
38288 // doesn't work because our mask is for 128 bits and we don't have an MVT
38289 // to match that.
38290 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
38291 isUndefOrInRange(ScaledMask[1], 0, 2) &&
38292 isUndefOrInRange(ScaledMask[2], 2, 4) &&
38293 isUndefOrInRange(ScaledMask[3], 2, 4) &&
38294 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
38295 ScaledMask[0] == (ScaledMask[2] % 2)) &&
38296 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
38297 ScaledMask[1] == (ScaledMask[3] % 2));
38298
38299 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
38300 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
38301 return SDValue(); // Nothing to do!
38302 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
38303 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
38304 return DAG.getBitcast(RootVT, V);
38305 }
38306 }
38307
38308 // Handle 128-bit lane shuffles of 256-bit vectors.
38309 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
38310 // If the upper half is zeroable, then an extract+insert is more optimal
38311 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
38312 // zero the upper half.
38313 if (isUndefOrZero(Mask[1])) {
38314 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38315 return SDValue(); // Nothing to do!
38316 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
38317 Res = CanonicalizeShuffleInput(RootVT, V1);
38318 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
38319 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
38320 256);
38321 }
38322
38323 // If we're inserting the low subvector, an insert-subvector 'concat'
38324 // pattern is quicker than VPERM2X128.
38325 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
38326 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
38327 !Subtarget.hasAVX2()) {
38328 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38329 return SDValue(); // Nothing to do!
38330 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
38331 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
38332 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
38333 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
38334 }
38335
38336 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
38337 return SDValue(); // Nothing to do!
38338
38339 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
38340 // we need to use the zeroing feature.
38341 // Prefer blends for sequential shuffles unless we are optimizing for size.
38342 if (UnaryShuffle &&
38343 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
38344 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
38345 unsigned PermMask = 0;
38346 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
38347 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
38348 return DAG.getNode(
38349 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
38350 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
38351 }
38352
38353 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
38354 return SDValue(); // Nothing to do!
38355
38356 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
38357 if (!UnaryShuffle && !IsMaskedShuffle) {
38358 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
38359 "Unexpected shuffle sentinel value");
38360 // Prefer blends to X86ISD::VPERM2X128.
38361 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
38362 unsigned PermMask = 0;
38363 PermMask |= ((Mask[0] & 3) << 0);
38364 PermMask |= ((Mask[1] & 3) << 4);
38365 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
38366 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
38367 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
38368 CanonicalizeShuffleInput(RootVT, LHS),
38369 CanonicalizeShuffleInput(RootVT, RHS),
38370 DAG.getTargetConstant(PermMask, DL, MVT::i8));
38371 }
38372 }
38373 }
38374
38375 // For masks that have been widened to 128-bit elements or more,
38376 // narrow back down to 64-bit elements.
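// e.g. a 2-element mask {1, 0} over the 128-bit halves of a 256-bit vector
// becomes {2, 3, 0, 1} once narrowed to 64-bit elements (MaskScale == 2).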
38377 if (BaseMaskEltSizeInBits > 64) {
38378 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
38379 int MaskScale = BaseMaskEltSizeInBits / 64;
38380 SmallVector<int, 64> ScaledMask;
38381 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
38382 Mask = std::move(ScaledMask);
38383 }
38384
38385 // For masked shuffles, we're trying to match the root width for better
38386 // writemask folding, so attempt to scale the mask.
38387 // TODO - variable shuffles might need this to be widened again.
38388 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
38389 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
38390 int MaskScale = NumRootElts / Mask.size();
38391 SmallVector<int, 64> ScaledMask;
38392 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
38393 Mask = std::move(ScaledMask);
38394 }
38395
38396 unsigned NumMaskElts = Mask.size();
38397 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
38398 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38399
38400 // Determine the effective mask value type.
38401 FloatDomain &= (32 <= MaskEltSizeInBits);
38402 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
38403 : MVT::getIntegerVT(MaskEltSizeInBits);
38404 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
38405
38406 // Only allow legal mask types.
38407 if (!TLI.isTypeLegal(MaskVT))
38408 return SDValue();
38409
38410 // Attempt to match the mask against known shuffle patterns.
38411 MVT ShuffleSrcVT, ShuffleVT;
38412 unsigned Shuffle, PermuteImm;
38413
38414 // Which shuffle domains are permitted?
38415 // Permit domain crossing at higher combine depths.
38416 // TODO: Should we indicate which domain is preferred if both are allowed?
38417 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
38418 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
38419 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
38420
38421 // Determine zeroable mask elements.
38422 APInt KnownUndef, KnownZero;
38423 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
38424 APInt Zeroable = KnownUndef | KnownZero;
38425
38426 if (UnaryShuffle) {
38427 // Attempt to match against broadcast-from-vector.
38428 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
38429 if ((Subtarget.hasAVX2() ||
38430 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
38431 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
38432 if (isUndefOrEqual(Mask, 0)) {
38433 if (V1.getValueType() == MaskVT &&
38434 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38435 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
38436 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
38437 return SDValue(); // Nothing to do!
38438 Res = V1.getOperand(0);
38439 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
38440 return DAG.getBitcast(RootVT, Res);
38441 }
38442 if (Subtarget.hasAVX2()) {
38443 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
38444 return SDValue(); // Nothing to do!
38445 Res = CanonicalizeShuffleInput(MaskVT, V1);
38446 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
38447 return DAG.getBitcast(RootVT, Res);
38448 }
38449 }
38450 }
38451
38452 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
38453 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
38454 (!IsMaskedShuffle ||
38455 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38456 if (Depth == 0 && Root.getOpcode() == Shuffle)
38457 return SDValue(); // Nothing to do!
38458 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38459 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
38460 return DAG.getBitcast(RootVT, Res);
38461 }
38462
38463 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
38464 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
38465 PermuteImm) &&
38466 (!IsMaskedShuffle ||
38467 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38468 if (Depth == 0 && Root.getOpcode() == Shuffle)
38469 return SDValue(); // Nothing to do!
38470 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
38471 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
38472 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38473 return DAG.getBitcast(RootVT, Res);
38474 }
38475 }
38476
38477 // Attempt to combine to INSERTPS, but only if the inserted element has come
38478 // from a scalar.
38479 // TODO: Handle other insertions here as well?
38480 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
38481 Subtarget.hasSSE41() &&
38482 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
38483 if (MaskEltSizeInBits == 32) {
38484 SDValue SrcV1 = V1, SrcV2 = V2;
38485 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
38486 DAG) &&
38487 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
38488 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
38489 return SDValue(); // Nothing to do!
38490 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
38491 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
38492 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
38493 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38494 return DAG.getBitcast(RootVT, Res);
38495 }
38496 }
38497 if (MaskEltSizeInBits == 64 &&
38498 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
38499 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38500 V2.getScalarValueSizeInBits() <= 32) {
38501 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
38502 return SDValue(); // Nothing to do!
38503 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
38504 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
38505 CanonicalizeShuffleInput(MVT::v4f32, V1),
38506 CanonicalizeShuffleInput(MVT::v4f32, V2),
38507 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38508 return DAG.getBitcast(RootVT, Res);
38509 }
38510 }
38511
38512 SDValue NewV1 = V1; // Save operands in case early exit happens.
38513 SDValue NewV2 = V2;
38514 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
38515 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
38516 ShuffleVT, UnaryShuffle) &&
38517 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38518 if (Depth == 0 && Root.getOpcode() == Shuffle)
38519 return SDValue(); // Nothing to do!
38520 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
38521 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
38522 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
38523 return DAG.getBitcast(RootVT, Res);
38524 }
38525
38526 NewV1 = V1; // Save operands in case early exit happens.
38527 NewV2 = V2;
38528 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
38529 AllowIntDomain, NewV1, NewV2, DL, DAG,
38530 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
38531 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38532 if (Depth == 0 && Root.getOpcode() == Shuffle)
38533 return SDValue(); // Nothing to do!
38534 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
38535 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
38536 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
38537 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38538 return DAG.getBitcast(RootVT, Res);
38539 }
38540
38541 // Typically from here on, we need an integer version of MaskVT.
38542 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
38543 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
38544
38545 // Annoyingly, SSE4A instructions don't map into the above match helpers.
38546 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
38547 uint64_t BitLen, BitIdx;
38548 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
38549 Zeroable)) {
38550 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
38551 return SDValue(); // Nothing to do!
38552 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38553 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
38554 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38555 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38556 return DAG.getBitcast(RootVT, Res);
38557 }
38558
38559 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
38560 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
38561 return SDValue(); // Nothing to do!
38562 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38563 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
38564 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
38565 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38566 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38567 return DAG.getBitcast(RootVT, Res);
38568 }
38569 }
38570
38571 // Match shuffle against TRUNCATE patterns.
38572 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
38573 // Match against a VTRUNC instruction, accounting for src/dst sizes.
38574 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
38575 Subtarget)) {
38576 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
38577 ShuffleSrcVT.getVectorNumElements();
38578 unsigned Opc =
38579 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
38580 if (Depth == 0 && Root.getOpcode() == Opc)
38581 return SDValue(); // Nothing to do!
38582 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38583 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
38584 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
38585 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
38586 return DAG.getBitcast(RootVT, Res);
38587 }
38588
38589 // Do we need a more general binary truncation pattern?
38590 if (RootSizeInBits < 512 &&
38591 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
38592 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
38593 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
38594 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
38595 // Bail if this was already a truncation or PACK node.
38596 // We sometimes fail to match PACK if we demand known undef elements.
38597 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
38598 Root.getOpcode() == X86ISD::PACKSS ||
38599 Root.getOpcode() == X86ISD::PACKUS))
38600 return SDValue(); // Nothing to do!
38601 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38602 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
38603 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38604 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
38605 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38606 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
38607 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
38608 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
38609 return DAG.getBitcast(RootVT, Res);
38610 }
38611 }
38612
38613 // Don't try to re-form single instruction chains under any circumstances now
38614 // that we've done encoding canonicalization for them.
38615 if (Depth < 1)
38616 return SDValue();
38617
38618 // Depth threshold above which we can efficiently use variable mask shuffles.
38619 int VariableCrossLaneShuffleDepth =
38620 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
38621 int VariablePerLaneShuffleDepth =
38622 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
38623 AllowVariableCrossLaneMask &=
38624 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
38625 AllowVariablePerLaneMask &=
38626 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
38627 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
38628 // higher depth before combining them.
38629 bool AllowBWIVPERMV3 =
38630 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
38631
38632 bool MaskContainsZeros = isAnyZero(Mask);
38633
38634 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
38635 // If we have a single input lane-crossing shuffle then lower to VPERMV.
38636 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
38637 if (Subtarget.hasAVX2() &&
38638 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
38639 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
38640 Res = CanonicalizeShuffleInput(MaskVT, V1);
38641 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
38642 return DAG.getBitcast(RootVT, Res);
38643 }
38644 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
38645 if ((Subtarget.hasAVX512() &&
38646 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38647 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
38648 (Subtarget.hasBWI() &&
38649 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38650 (Subtarget.hasVBMI() &&
38651 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
38652 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38653 V2 = DAG.getUNDEF(MaskVT);
38654 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38655 return DAG.getBitcast(RootVT, Res);
38656 }
38657 }
38658
38659 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
38660 // vector as the second source (non-VLX will pad to 512-bit shuffles).
38661 if (UnaryShuffle && AllowVariableCrossLaneMask &&
38662 ((Subtarget.hasAVX512() &&
38663 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38664 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
38665 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
38666 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
38667 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38668 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38669 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38670 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
38671 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
38672 for (unsigned i = 0; i != NumMaskElts; ++i)
38673 if (Mask[i] == SM_SentinelZero)
38674 Mask[i] = NumMaskElts + i;
38675 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38676 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
38677 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38678 return DAG.getBitcast(RootVT, Res);
38679 }
38680
38681 // If that failed and either input is extracted then try to combine as a
38682 // shuffle with the larger type.
38683 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
38684 Inputs, Root, BaseMask, Depth, HasVariableMask,
38685 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
38686 Subtarget))
38687 return WideShuffle;
38688
38689 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
38690 // (non-VLX will pad to 512-bit shuffles).
38691 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
38692 ((Subtarget.hasAVX512() &&
38693 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38694 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
38695 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
38696 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
38697 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38698 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38699 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38700 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
38701 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38702 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38703 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38704 return DAG.getBitcast(RootVT, Res);
38705 }
38706 return SDValue();
38707 }
38708
38709 // See if we can combine a single input shuffle with zeros to a bit-mask,
38710 // which is much simpler than any shuffle.
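// e.g. a v4i32 mask {0, SM_SentinelZero, 2, SM_SentinelZero} keeps elements
// 0 and 2 in place and zeroes the rest, so it becomes AND(V1, {-1,0,-1,0}).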
38711 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
38712 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
38713 TLI.isTypeLegal(MaskVT)) {
38714 APInt Zero = APInt::getZero(MaskEltSizeInBits);
38715 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
38716 APInt UndefElts(NumMaskElts, 0);
38717 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
38718 for (unsigned i = 0; i != NumMaskElts; ++i) {
38719 int M = Mask[i];
38720 if (M == SM_SentinelUndef) {
38721 UndefElts.setBit(i);
38722 continue;
38723 }
38724 if (M == SM_SentinelZero)
38725 continue;
38726 EltBits[i] = AllOnes;
38727 }
38728 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
38729 Res = CanonicalizeShuffleInput(MaskVT, V1);
38730 unsigned AndOpcode =
38731 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
38732 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
38733 return DAG.getBitcast(RootVT, Res);
38734 }
38735
38736 // If we have a single input shuffle with different shuffle patterns in
38737 // the 128-bit lanes, use the variable mask form of VPERMILPS.
38738 // TODO: Combine other mask types at higher depths.
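// e.g. a v8f32 mask {3,2,1,0,7,6,5,4} reverses each 128-bit lane; taking
// each index modulo 4 gives the VPERMILPS index vector {3,2,1,0,3,2,1,0}.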
38739 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
38740 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
38741 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
38742 SmallVector<SDValue, 16> VPermIdx;
38743 for (int M : Mask) {
38744 SDValue Idx =
38745 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
38746 VPermIdx.push_back(Idx);
38747 }
38748 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
38749 Res = CanonicalizeShuffleInput(MaskVT, V1);
38750 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
38751 return DAG.getBitcast(RootVT, Res);
38752 }
38753
38754 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
38755 // to VPERMIL2PD/VPERMIL2PS.
38756 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
38757 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
38758 MaskVT == MVT::v8f32)) {
38759 // VPERMIL2 Operation.
38760 // Bits[3] - Match Bit.
38761 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
38762 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
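// e.g. a v4f32 mask {1,4,3,6} produces the selector vector {1,4,3,6}, where
// values 0-3 choose an element from the first source, 4-7 from the second,
// and 8 marks a zeroed element.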
38763 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
38764 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
38765 SmallVector<int, 8> VPerm2Idx;
38766 unsigned M2ZImm = 0;
38767 for (int M : Mask) {
38768 if (M == SM_SentinelUndef) {
38769 VPerm2Idx.push_back(-1);
38770 continue;
38771 }
38772 if (M == SM_SentinelZero) {
38773 M2ZImm = 2;
38774 VPerm2Idx.push_back(8);
38775 continue;
38776 }
38777 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
38778 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
38779 VPerm2Idx.push_back(Index);
38780 }
38781 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38782 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38783 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
38784 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
38785 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
38786 return DAG.getBitcast(RootVT, Res);
38787 }
38788
38789 // If we have 3 or more shuffle instructions or a chain involving a variable
38790 // mask, we can replace them with a single PSHUFB instruction profitably.
38791 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
38792 // instructions, but in practice PSHUFB tends to be *very* fast so we're
38793 // more aggressive.
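// e.g. lowering a v4i32 mask through PSHUFB expands each index M into the
// four byte indices 4*M .. 4*M+3, and zeroed elements become the 0x80
// "clear byte" selector.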
38794 if (UnaryShuffle && AllowVariablePerLaneMask &&
38795 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
38796 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
38797 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
38798 SmallVector<SDValue, 16> PSHUFBMask;
38799 int NumBytes = RootVT.getSizeInBits() / 8;
38800 int Ratio = NumBytes / NumMaskElts;
38801 for (int i = 0; i < NumBytes; ++i) {
38802 int M = Mask[i / Ratio];
38803 if (M == SM_SentinelUndef) {
38804 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
38805 continue;
38806 }
38807 if (M == SM_SentinelZero) {
38808 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38809 continue;
38810 }
38811 M = Ratio * M + i % Ratio;
38812 assert((M / 16) == (i / 16) && "Lane crossing detected");
38813 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38814 }
38815 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
38816 Res = CanonicalizeShuffleInput(ByteVT, V1);
38817 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
38818 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
38819 return DAG.getBitcast(RootVT, Res);
38820 }
38821
38822 // With XOP, if we have a 128-bit binary input shuffle we can always combine
38823 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
38824 // slower than PSHUFB on targets that support both.
38825 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
38826 Subtarget.hasXOP()) {
38827 // VPPERM Mask Operation
38828 // Bits[4:0] - Byte Index (0 - 31)
38829 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
38830 SmallVector<SDValue, 16> VPPERMMask;
38831 int NumBytes = 16;
38832 int Ratio = NumBytes / NumMaskElts;
38833 for (int i = 0; i < NumBytes; ++i) {
38834 int M = Mask[i / Ratio];
38835 if (M == SM_SentinelUndef) {
38836 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
38837 continue;
38838 }
38839 if (M == SM_SentinelZero) {
38840 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38841 continue;
38842 }
38843 M = Ratio * M + i % Ratio;
38844 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38845 }
38846 MVT ByteVT = MVT::v16i8;
38847 V1 = CanonicalizeShuffleInput(ByteVT, V1);
38848 V2 = CanonicalizeShuffleInput(ByteVT, V2);
38849 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
38850 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
38851 return DAG.getBitcast(RootVT, Res);
38852 }
38853
38854 // If that failed and either input is extracted then try to combine as a
38855 // shuffle with the larger type.
38856 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
38857 Inputs, Root, BaseMask, Depth, HasVariableMask,
38858 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
38859 return WideShuffle;
38860
38861 // If we have a dual input shuffle then lower to VPERMV3,
38862 // (non-VLX will pad to 512-bit shuffles)
38863 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
38864 ((Subtarget.hasAVX512() &&
38865 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
38866 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
38867 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
38868 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
38869 MaskVT == MVT::v16i32)) ||
38870 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38871 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
38872 MaskVT == MVT::v32i16)) ||
38873 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38874 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
38875 MaskVT == MVT::v64i8)))) {
38876 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38877 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38878 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38879 return DAG.getBitcast(RootVT, Res);
38880 }
38881
38882 // Failed to find any combines.
38883 return SDValue();
38884}
38885
38886// Combine an arbitrary chain of shuffles + extract_subvectors into a single
38887// instruction if possible.
38888//
38889// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
38890// type size to attempt to combine:
38891// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
38892// -->
38893// extract_subvector(shuffle(x,y,m2),0)
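// e.g. shuffling the two v2f64 halves extracted from one v4f64 value can be
// rewritten as a 4-element shuffle of the original v4f64 followed by an
// extract of the low 128 bits of the result.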
38894 static SDValue combineX86ShuffleChainWithExtract(
38895 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38896 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38897 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38898 const X86Subtarget &Subtarget) {
38899 unsigned NumMaskElts = BaseMask.size();
38900 unsigned NumInputs = Inputs.size();
38901 if (NumInputs == 0)
38902 return SDValue();
38903
38904 EVT RootVT = Root.getValueType();
38905 unsigned RootSizeInBits = RootVT.getSizeInBits();
38906 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
38907 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
38908
38909 // Peek through extract_subvector to find widest legal vector.
38910 // TODO: Handle ISD::TRUNCATE
38911 unsigned WideSizeInBits = RootSizeInBits;
38912 for (unsigned I = 0; I != NumInputs; ++I) {
38913 SDValue Input = peekThroughBitcasts(Inputs[I]);
38914 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
38915 Input = peekThroughBitcasts(Input.getOperand(0));
38916 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
38917 WideSizeInBits < Input.getValueSizeInBits())
38918 WideSizeInBits = Input.getValueSizeInBits();
38919 }
38920
38921 // Bail if we fail to find a source larger than the existing root.
38922 unsigned Scale = WideSizeInBits / RootSizeInBits;
38923 if (WideSizeInBits <= RootSizeInBits ||
38924 (WideSizeInBits % RootSizeInBits) != 0)
38925 return SDValue();
38926
38927 // Create new mask for larger type.
38928 SmallVector<int, 64> WideMask(BaseMask);
38929 for (int &M : WideMask) {
38930 if (M < 0)
38931 continue;
38932 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
38933 }
38934 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
38935
38936 // Attempt to peek through inputs and adjust mask when we extract from an
38937 // upper subvector.
38938 int AdjustedMasks = 0;
38939 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
38940 for (unsigned I = 0; I != NumInputs; ++I) {
38941 SDValue &Input = WideInputs[I];
38942 Input = peekThroughBitcasts(Input);
38943 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
38944 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
38945 uint64_t Idx = Input.getConstantOperandVal(1);
38946 if (Idx != 0) {
38947 ++AdjustedMasks;
38948 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
38949 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
38950
38951 int lo = I * WideMask.size();
38952 int hi = (I + 1) * WideMask.size();
38953 for (int &M : WideMask)
38954 if (lo <= M && M < hi)
38955 M += Idx;
38956 }
38957 Input = peekThroughBitcasts(Input.getOperand(0));
38958 }
38959 }
38960
38961 // Remove unused/repeated shuffle source ops.
38962 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
38963 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
38964
38965 // Bail if we're always extracting from the lowest subvectors
38966 // (combineX86ShuffleChain should match this for the current width), or if
38967 // the shuffle still references too many inputs.
38968 if (AdjustedMasks == 0 || WideInputs.size() > 2)
38969 return SDValue();
38970
38971 // Minor canonicalization of the accumulated shuffle mask to make it easier
38972 // to match below. All this does is detect masks with sequential pairs of
38973 // elements, and shrink them to the half-width mask. It does this in a loop
38974 // so it will reduce the size of the mask to the minimal width mask which
38975 // performs an equivalent shuffle.
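// e.g. {0,1,4,5,2,3,6,7} widens to {0,2,1,3} on the first pass; a second
// pass stops because the remaining pairs are no longer sequential.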
38976 while (WideMask.size() > 1) {
38977 SmallVector<int, 64> WidenedMask;
38978 if (!canWidenShuffleElements(WideMask, WidenedMask))
38979 break;
38980 WideMask = std::move(WidenedMask);
38981 }
38982
38983 // Canonicalization of binary shuffle masks to improve pattern matching by
38984 // commuting the inputs.
38985 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
38986 ShuffleVectorSDNode::commuteMask(WideMask);
38987 std::swap(WideInputs[0], WideInputs[1]);
38988 }
38989
38990 // Increase depth for every upper subvector we've peeked through.
38991 Depth += AdjustedMasks;
38992
38993 // Attempt to combine wider chain.
38994 // TODO: Can we use a better Root?
38995 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
38996 WideInputs.back().getValueSizeInBits()
38997 ? WideInputs.front()
38998 : WideInputs.back();
38999 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
39000 "WideRootSize mismatch");
39001
39002 if (SDValue WideShuffle =
39003 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
39004 HasVariableMask, AllowVariableCrossLaneMask,
39005 AllowVariablePerLaneMask, DAG, Subtarget)) {
39006 WideShuffle =
39007 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
39008 return DAG.getBitcast(RootVT, WideShuffle);
39009 }
39010
39011 return SDValue();
39012}
39013
39014// Canonicalize the combined shuffle mask chain with horizontal ops.
39015// NOTE: This may update the Ops and Mask.
39016 static SDValue canonicalizeShuffleMaskWithHorizOp(
39017 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
39018 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
39019 const X86Subtarget &Subtarget) {
39020 if (Mask.empty() || Ops.empty())
39021 return SDValue();
39022
39023 SmallVector<SDValue> BC;
39024 for (SDValue Op : Ops)
39025 BC.push_back(peekThroughBitcasts(Op));
39026
39027 // All ops must be the same horizop + type.
39028 SDValue BC0 = BC[0];
39029 EVT VT0 = BC0.getValueType();
39030 unsigned Opcode0 = BC0.getOpcode();
39031 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
39032 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
39033 }))
39034 return SDValue();
39035
39036 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
39037 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
39038 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
39039 if (!isHoriz && !isPack)
39040 return SDValue();
39041
39042 // Do all ops have a single use?
39043 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
39044 return Op.hasOneUse() &&
39045 peekThroughBitcasts(Op).hasOneUse();
39046 });
39047
39048 int NumElts = VT0.getVectorNumElements();
39049 int NumLanes = VT0.getSizeInBits() / 128;
39050 int NumEltsPerLane = NumElts / NumLanes;
39051 int NumHalfEltsPerLane = NumEltsPerLane / 2;
39052 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
39053 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
39054
39055 if (NumEltsPerLane >= 4 &&
39056 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
39057 SmallVector<int> LaneMask, ScaledMask;
39058 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
39059 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
39060 // See if we can remove the shuffle by re-sorting the HOP chain so that
39061 // the HOP args are pre-shuffled.
39062 // TODO: Generalize to any sized/depth chain.
39063 // TODO: Add support for PACKSS/PACKUS.
39064 if (isHoriz) {
39065 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
39066 auto GetHOpSrc = [&](int M) {
39067 if (M == SM_SentinelUndef)
39068 return DAG.getUNDEF(VT0);
39069 if (M == SM_SentinelZero)
39070 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
39071 SDValue Src0 = BC[M / 4];
39072 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
39073 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
39074 return Src1.getOperand(M % 2);
39075 return SDValue();
39076 };
39077 SDValue M0 = GetHOpSrc(ScaledMask[0]);
39078 SDValue M1 = GetHOpSrc(ScaledMask[1]);
39079 SDValue M2 = GetHOpSrc(ScaledMask[2]);
39080 SDValue M3 = GetHOpSrc(ScaledMask[3]);
39081 if (M0 && M1 && M2 && M3) {
39082 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
39083 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
39084 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39085 }
39086 }
39087 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
39088 if (Ops.size() >= 2) {
39089 SDValue LHS, RHS;
39090 auto GetHOpSrc = [&](int M, int &OutM) {
39091 // TODO: Support SM_SentinelZero
39092 if (M < 0)
39093 return M == SM_SentinelUndef;
39094 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
39095 if (!LHS || LHS == Src) {
39096 LHS = Src;
39097 OutM = (M % 2);
39098 return true;
39099 }
39100 if (!RHS || RHS == Src) {
39101 RHS = Src;
39102 OutM = (M % 2) + 2;
39103 return true;
39104 }
39105 return false;
39106 };
39107 int PostMask[4] = {-1, -1, -1, -1};
39108 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
39109 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
39110 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
39111 GetHOpSrc(ScaledMask[3], PostMask[3])) {
39112 LHS = DAG.getBitcast(SrcVT, LHS);
39113 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
39114 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39115 // Use SHUFPS for the permute so this will work on SSE2 targets,
39116 // shuffle combining and domain handling will simplify this later on.
39117 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
39118 Res = DAG.getBitcast(ShuffleVT, Res);
39119 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
39120 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
39121 }
39122 }
39123 }
39124 }
39125
39126 if (2 < Ops.size())
39127 return SDValue();
39128
39129 SDValue BC1 = BC[BC.size() - 1];
39130 if (Mask.size() == VT0.getVectorNumElements()) {
39131 // Canonicalize binary shuffles of horizontal ops that use the
39132 // same sources to a unary shuffle.
39133 // TODO: Try to perform this fold even if the shuffle remains.
39134 if (Ops.size() == 2) {
39135 auto ContainsOps = [](SDValue HOp, SDValue Op) {
39136 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
39137 };
39138 // Commute if all BC0's ops are contained in BC1.
39139 if (ContainsOps(BC1, BC0.getOperand(0)) &&
39140 ContainsOps(BC1, BC0.getOperand(1))) {
39141 ShuffleVectorSDNode::commuteMask(Mask);
39142 std::swap(Ops[0], Ops[1]);
39143 std::swap(BC0, BC1);
39144 }
39145
39146 // If BC1 can be represented by BC0, then convert to unary shuffle.
39147 if (ContainsOps(BC0, BC1.getOperand(0)) &&
39148 ContainsOps(BC0, BC1.getOperand(1))) {
39149 for (int &M : Mask) {
39150 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
39151 continue;
39152 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
39153 M -= NumElts + (SubLane * NumHalfEltsPerLane);
39154 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
39155 M += NumHalfEltsPerLane;
39156 }
39157 }
39158 }
39159
39160 // Canonicalize unary horizontal ops to only refer to lower halves.
39161 for (int i = 0; i != NumElts; ++i) {
39162 int &M = Mask[i];
39163 if (isUndefOrZero(M))
39164 continue;
39165 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
39166 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39167 M -= NumHalfEltsPerLane;
39168 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
39169 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39170 M -= NumHalfEltsPerLane;
39171 }
39172 }
39173
39174 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
39175 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
39176 // represents the LHS/RHS inputs for the lower/upper halves.
39177 SmallVector<int, 16> TargetMask128, WideMask128;
39178 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
39179 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
39180 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
39181 bool SingleOp = (Ops.size() == 1);
39182 if (isPack || OneUseOps ||
39183 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
39184 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
39185 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
39186 Lo = Lo.getOperand(WideMask128[0] & 1);
39187 Hi = Hi.getOperand(WideMask128[1] & 1);
39188 if (SingleOp) {
39189 SDValue Undef = DAG.getUNDEF(SrcVT);
39190 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
39191 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
39192 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
39193 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
39194 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
39195 }
39196 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
39197 }
39198 }
39199
39200 // If we are post-shuffling a 256-bit hop and not requiring the upper
39201 // elements, then try to narrow to a 128-bit hop directly.
39202 SmallVector<int, 16> WideMask64;
39203 if (Ops.size() == 1 && NumLanes == 2 &&
39204 scaleShuffleElements(Mask, 4, WideMask64) &&
39205 isUndefInRange(WideMask64, 2, 2)) {
39206 int M0 = WideMask64[0];
39207 int M1 = WideMask64[1];
39208 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
39209 MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
39210 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
39211 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
39212 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
39213 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
39214 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
39215 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
39216 }
39217 }
39218
39219 return SDValue();
39220}
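// Example (illustrative sketch, values chosen for exposition; not from the
// original source): for v4f32, HADD(X, Y) computes
// [X0+X1, X2+X3, Y0+Y1, Y2+Y3]. Shuffling that result with the 64-bit-wide
// repeating mask [1, 0] (element mask [2,3,0,1]) swaps the two halves, giving
// [Y0+Y1, Y2+Y3, X0+X1, X2+X3] == HADD(Y, X), so the shuffle+hop pair above
// collapses back into a single horizontal op.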
39221
39222// Attempt to constant fold all of the constant source ops.
39223// Returns true if the entire shuffle is folded to a constant.
39224// TODO: Extend this to merge multiple constant Ops and update the mask.
39225static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
39226 ArrayRef<int> Mask, SDValue Root,
39227 bool HasVariableMask,
39228 SelectionDAG &DAG,
39229 const X86Subtarget &Subtarget) {
39230 MVT VT = Root.getSimpleValueType();
39231
39232 unsigned SizeInBits = VT.getSizeInBits();
39233 unsigned NumMaskElts = Mask.size();
39234 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
39235 unsigned NumOps = Ops.size();
39236
39237 // Extract constant bits from each source op.
39238 SmallVector<APInt, 16> UndefEltsOps(NumOps);
39239 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
39240 for (unsigned I = 0; I != NumOps; ++I)
39241 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
39242 RawBitsOps[I],
39243 /*AllowWholeUndefs*/ true,
39244 /*AllowPartialUndefs*/ true))
39245 return SDValue();
39246
39247 // If we're optimizing for size, only fold if at least one of the constants is
39248 // only used once or the combined shuffle has included a variable mask
39249 // shuffle; this avoids constant pool bloat.
39250 bool IsOptimizingSize = DAG.shouldOptForSize();
39251 if (IsOptimizingSize && !HasVariableMask &&
39252 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
39253 return SDValue();
39254
39255 // Shuffle the constant bits according to the mask.
39256 SDLoc DL(Root);
39257 APInt UndefElts(NumMaskElts, 0);
39258 APInt ZeroElts(NumMaskElts, 0);
39259 APInt ConstantElts(NumMaskElts, 0);
39260 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
39261 APInt::getZero(MaskSizeInBits));
39262 for (unsigned i = 0; i != NumMaskElts; ++i) {
39263 int M = Mask[i];
39264 if (M == SM_SentinelUndef) {
39265 UndefElts.setBit(i);
39266 continue;
39267 } else if (M == SM_SentinelZero) {
39268 ZeroElts.setBit(i);
39269 continue;
39270 }
39271 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
39272
39273 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
39274 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
39275
39276 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
39277 if (SrcUndefElts[SrcMaskIdx]) {
39278 UndefElts.setBit(i);
39279 continue;
39280 }
39281
39282 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
39283 APInt &Bits = SrcEltBits[SrcMaskIdx];
39284 if (!Bits) {
39285 ZeroElts.setBit(i);
39286 continue;
39287 }
39288
39289 ConstantElts.setBit(i);
39290 ConstantBitData[i] = Bits;
39291 }
39292 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
39293
39294 // Attempt to create a zero vector.
39295 if ((UndefElts | ZeroElts).isAllOnes())
39296 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
39297
39298 // Create the constant data.
39299 MVT MaskSVT;
39300 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
39301 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
39302 else
39303 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
39304
39305 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
39306 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
39307 return SDValue();
39308
39309 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
39310 return DAG.getBitcast(VT, CstOp);
39311}
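// Example (illustrative sketch, values chosen for exposition): with two v4i32
// constant ops C0 = <1,2,3,4>, C1 = <5,6,7,8> and the mask
// [0, 6, SM_SentinelZero, SM_SentinelUndef], element 0 reads C0[0] = 1,
// element 1 reads C1[2] = 7, element 2 becomes zero and element 3 stays undef,
// so the whole shuffle folds to the constant vector <1, 7, 0, undef>.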
39312
39313namespace llvm {
39314 namespace X86 {
39315 enum {
39316 MaxShuffleCombineDepth = 8
39317 };
39318 } // namespace X86
39319} // namespace llvm
39320
39321/// Fully generic combining of x86 shuffle instructions.
39322///
39323/// This should be the last combine run over the x86 shuffle instructions. Once
39324/// they have been fully optimized, this will recursively consider all chains
39325/// of single-use shuffle instructions, build a generic model of the cumulative
39326/// shuffle operation, and check for simpler instructions which implement this
39327/// operation. We use this primarily for two purposes:
39328///
39329/// 1) Collapse generic shuffles to specialized single instructions when
39330/// equivalent. In most cases, this is just an encoding size win, but
39331/// sometimes we will collapse multiple generic shuffles into a single
39332/// special-purpose shuffle.
39333/// 2) Look for sequences of shuffle instructions with 3 or more total
39334/// instructions, and replace them with the slightly more expensive SSSE3
39335/// PSHUFB instruction if available. We do this as the last combining step
39336/// to ensure we avoid using PSHUFB if we can implement the shuffle with
39337/// a suitable short sequence of other instructions. The PSHUFB will either
39338/// use a register or have to read from memory and so is slightly (but only
39339/// slightly) more expensive than the other shuffle instructions.
39340///
39341/// Because this is inherently a quadratic operation (for each shuffle in
39342/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
39343/// This should never be an issue in practice as the shuffle lowering doesn't
39344/// produce sequences of more than 8 instructions.
39345///
39346/// FIXME: We will currently miss some cases where the redundant shuffling
39347/// would simplify under the threshold for PSHUFB formation because of
39348/// combine-ordering. To fix this, we should do the redundant instruction
39349/// combining in this recursive walk.
39350static SDValue combineX86ShufflesRecursively(
39351 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
39352 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
39353 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
39354 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39355 const X86Subtarget &Subtarget) {
39356 assert(!RootMask.empty() &&
39357 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
39358 "Illegal shuffle root mask");
39359 MVT RootVT = Root.getSimpleValueType();
39360 assert(RootVT.isVector() && "Shuffles operate on vector types!");
39361 unsigned RootSizeInBits = RootVT.getSizeInBits();
39362
39363 // Bound the depth of our recursive combine because this is ultimately
39364 // quadratic in nature.
39365 if (Depth >= MaxDepth)
39366 return SDValue();
39367
39368 // Directly rip through bitcasts to find the underlying operand.
39369 SDValue Op = SrcOps[SrcOpIndex];
39370 Op = peekThroughBitcasts(Op);
39371
39372 EVT VT = Op.getValueType();
39373 if (!VT.isVector() || !VT.isSimple())
39374 return SDValue(); // Bail if we hit a non-simple non-vector.
39375
39376 // FIXME: Just bail on f16 for now.
39377 if (VT.getVectorElementType() == MVT::f16)
39378 return SDValue();
39379
39380 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
39381 "Can only combine shuffles upto size of the root op.");
39382
39383 // Create a demanded elts mask from the referenced elements of Op.
39384 APInt OpDemandedElts = APInt::getZero(RootMask.size());
39385 for (int M : RootMask) {
39386 int BaseIdx = RootMask.size() * SrcOpIndex;
39387 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
39388 OpDemandedElts.setBit(M - BaseIdx);
39389 }
39390 if (RootSizeInBits != VT.getSizeInBits()) {
39391 // Op is smaller than Root - extract the demanded elts for the subvector.
39392 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
39393 unsigned NumOpMaskElts = RootMask.size() / Scale;
39394 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
39395 assert(OpDemandedElts
39396 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
39397 .isZero() &&
39398 "Out of range elements referenced in root mask");
39399 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
39400 }
39401 OpDemandedElts =
39402 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
39403
39404 // Extract target shuffle mask and resolve sentinels and inputs.
39405 SmallVector<int, 64> OpMask;
39406 SmallVector<SDValue, 2> OpInputs;
39407 APInt OpUndef, OpZero;
39408 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
39409 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
39410 OpZero, DAG, Depth, false)) {
39411 // Shuffle inputs must not be larger than the shuffle result.
39412 // TODO: Relax this for single input faux shuffles (e.g. trunc).
39413 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
39414 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
39415 }))
39416 return SDValue();
39417 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39418 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
39419 !isNullConstant(Op.getOperand(1))) {
39420 SDValue SrcVec = Op.getOperand(0);
39421 int ExtractIdx = Op.getConstantOperandVal(1);
39422 unsigned NumElts = VT.getVectorNumElements();
39423 OpInputs.assign({SrcVec});
39424 OpMask.assign(NumElts, SM_SentinelUndef);
39425 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
39426 OpZero = OpUndef = APInt::getZero(NumElts);
39427 } else {
39428 return SDValue();
39429 }
39430
39431 // If the shuffle result was smaller than the root, we need to adjust the
39432 // mask indices and pad the mask with undefs.
39433 if (RootSizeInBits > VT.getSizeInBits()) {
39434 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
39435 unsigned OpMaskSize = OpMask.size();
39436 if (OpInputs.size() > 1) {
39437 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
39438 for (int &M : OpMask) {
39439 if (M < 0)
39440 continue;
39441 int EltIdx = M % OpMaskSize;
39442 int OpIdx = M / OpMaskSize;
39443 M = (PaddedMaskSize * OpIdx) + EltIdx;
39444 }
39445 }
39446 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
39447 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
39448 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
39449 }
39450
39451 SmallVector<int, 64> Mask;
39452 SmallVector<SDValue, 16> Ops;
39453
39454 // We don't need to merge masks if the root is empty.
39455 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
39456 if (EmptyRoot) {
39457 // Only resolve zeros if it will remove an input, otherwise we might end
39458 // up in an infinite loop.
39459 bool ResolveKnownZeros = true;
39460 if (!OpZero.isZero()) {
39461 APInt UsedInputs = APInt::getZero(OpInputs.size());
39462 for (int i = 0, e = OpMask.size(); i != e; ++i) {
39463 int M = OpMask[i];
39464 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
39465 continue;
39466 UsedInputs.setBit(M / OpMask.size());
39467 if (UsedInputs.isAllOnes()) {
39468 ResolveKnownZeros = false;
39469 break;
39470 }
39471 }
39472 }
39473 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
39474 ResolveKnownZeros);
39475
39476 Mask = OpMask;
39477 Ops.append(OpInputs.begin(), OpInputs.end());
39478 } else {
39479 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
39480
39481 // Add the inputs to the Ops list, avoiding duplicates.
39482 Ops.append(SrcOps.begin(), SrcOps.end());
39483
39484 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
39485 // Attempt to find an existing match.
39486 SDValue InputBC = peekThroughBitcasts(Input);
39487 for (int i = 0, e = Ops.size(); i < e; ++i)
39488 if (InputBC == peekThroughBitcasts(Ops[i]))
39489 return i;
39490 // Match failed - should we replace an existing Op?
39491 if (InsertionPoint >= 0) {
39492 Ops[InsertionPoint] = Input;
39493 return InsertionPoint;
39494 }
39495 // Add to the end of the Ops list.
39496 Ops.push_back(Input);
39497 return Ops.size() - 1;
39498 };
39499
39500 SmallVector<int, 2> OpInputIdx;
39501 for (SDValue OpInput : OpInputs)
39502 OpInputIdx.push_back(
39503 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
39504
39505 assert(((RootMask.size() > OpMask.size() &&
39506 RootMask.size() % OpMask.size() == 0) ||
39507 (OpMask.size() > RootMask.size() &&
39508 OpMask.size() % RootMask.size() == 0) ||
39509 OpMask.size() == RootMask.size()) &&
39510 "The smaller number of elements must divide the larger.");
39511
39512 // This function can be performance-critical, so we rely on the power-of-2
39513 // knowledge that we have about the mask sizes to replace div/rem ops with
39514 // bit-masks and shifts.
39515 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
39516 "Non-power-of-2 shuffle mask sizes");
39517 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
39518 "Non-power-of-2 shuffle mask sizes");
39519 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
39520 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
39521
39522 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
39523 unsigned RootRatio =
39524 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
39525 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
39526 assert((RootRatio == 1 || OpRatio == 1) &&
39527 "Must not have a ratio for both incoming and op masks!");
39528
39529 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
39530 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
39531 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
39532 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
39533 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
39534
39535 Mask.resize(MaskWidth, SM_SentinelUndef);
39536
39537 // Merge this shuffle operation's mask into our accumulated mask. Note that
39538 // this shuffle's mask will be the first applied to the input, followed by
39539 // the root mask to get us all the way to the root value arrangement. The
39540 // reason for this order is that we are recursing up the operation chain.
39541 for (unsigned i = 0; i < MaskWidth; ++i) {
39542 unsigned RootIdx = i >> RootRatioLog2;
39543 if (RootMask[RootIdx] < 0) {
39544 // This is a zero or undef lane, we're done.
39545 Mask[i] = RootMask[RootIdx];
39546 continue;
39547 }
39548
39549 unsigned RootMaskedIdx =
39550 RootRatio == 1
39551 ? RootMask[RootIdx]
39552 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
39553
39554 // Just insert the scaled root mask value if it references an input other
39555 // than the SrcOp we're currently inserting.
39556 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
39557 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
39558 Mask[i] = RootMaskedIdx;
39559 continue;
39560 }
39561
39562 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
39563 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
39564 if (OpMask[OpIdx] < 0) {
39565 // The incoming lanes are zero or undef, it doesn't matter which ones we
39566 // are using.
39567 Mask[i] = OpMask[OpIdx];
39568 continue;
39569 }
39570
39571 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
39572 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
39573 : (OpMask[OpIdx] << OpRatioLog2) +
39574 (RootMaskedIdx & (OpRatio - 1));
39575
39576 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
39577 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
39578 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
39579 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
39580
39581 Mask[i] = OpMaskedIdx;
39582 }
39583 }
39584
39585 // Peek through vector widenings and set out of bounds mask indices to undef.
39586 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
39587 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
39588 SDValue &Op = Ops[I];
39589 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
39590 isNullConstant(Op.getOperand(2))) {
39591 Op = Op.getOperand(1);
39592 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
39593 int Lo = I * Mask.size();
39594 int Hi = (I + 1) * Mask.size();
39595 int NewHi = Lo + (Mask.size() / Scale);
39596 for (int &M : Mask) {
39597 if (Lo <= M && NewHi <= M && M < Hi)
39598 M = SM_SentinelUndef;
39599 }
39600 }
39601 }
39602
39603 // Peek through any free extract_subvector nodes back to root size.
39604 for (SDValue &Op : Ops)
39605 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39606 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
39607 isNullConstant(Op.getOperand(1)))
39608 Op = Op.getOperand(0);
39609
39610 // Remove unused/repeated shuffle source ops.
39611 resolveTargetShuffleInputsAndMask(Ops, Mask);
39612
39613 // Handle the all undef/zero/ones cases early.
39614 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
39615 return DAG.getUNDEF(RootVT);
39616 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
39617 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
39618 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
39619 !llvm::is_contained(Mask, SM_SentinelZero))
39620 return getOnesVector(RootVT, DAG, SDLoc(Root));
39621
39622 assert(!Ops.empty() && "Shuffle with no inputs detected");
39623 HasVariableMask |= IsOpVariableMask;
39624
39625 // Update the list of shuffle nodes that have been combined so far.
39626 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
39627 SrcNodes.end());
39628 CombinedNodes.push_back(Op.getNode());
39629
39630 // See if we can recurse into each shuffle source op (if it's a target
39631 // shuffle). The source op should only be generally combined if it either has
39632 // a single use (i.e. the current Op) or all its users have already been combined;
39633 // if not, we can still combine but should prevent generation of variable
39634 // shuffles to avoid constant pool bloat.
39635 // Don't recurse if we already have more source ops than we can combine in
39636 // the remaining recursion depth.
39637 if (Ops.size() < (MaxDepth - Depth)) {
39638 for (int i = 0, e = Ops.size(); i < e; ++i) {
39639 // For empty roots, we need to resolve zeroable elements before combining
39640 // them with other shuffles.
39641 SmallVector<int, 64> ResolvedMask = Mask;
39642 if (EmptyRoot)
39643 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
39644 bool AllowCrossLaneVar = false;
39645 bool AllowPerLaneVar = false;
39646 if (Ops[i].getNode()->hasOneUse() ||
39647 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
39648 AllowCrossLaneVar = AllowVariableCrossLaneMask;
39649 AllowPerLaneVar = AllowVariablePerLaneMask;
39650 }
39651 if (SDValue Res = combineX86ShufflesRecursively(
39652 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
39653 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
39654 Subtarget))
39655 return Res;
39656 }
39657 }
39658
39659 // Attempt to constant fold all of the constant source ops.
39660 if (SDValue Cst = combineX86ShufflesConstants(
39661 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
39662 return Cst;
39663
39664 // If constant fold failed and we only have constants - then we have
39665 // multiple uses by a single non-variable shuffle - just bail.
39666 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
39667 APInt UndefElts;
39668 SmallVector<APInt> RawBits;
39669 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
39670 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
39671 RawBits,
39672 /*AllowWholeUndefs*/ true,
39673 /*AllowPartialUndefs*/ true);
39674 })) {
39675 return SDValue();
39676 }
39677
39678 // Canonicalize the combined shuffle mask chain with horizontal ops.
39679 // NOTE: This will update the Ops and Mask.
39680 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
39681 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
39682 return DAG.getBitcast(RootVT, HOp);
39683
39684 // Try to refine our inputs given our knowledge of target shuffle mask.
39685 for (auto I : enumerate(Ops)) {
39686 int OpIdx = I.index();
39687 SDValue &Op = I.value();
39688
39689 // What range of shuffle mask element values results in picking from Op?
39690 int Lo = OpIdx * Mask.size();
39691 int Hi = Lo + Mask.size();
39692
39693 // Which elements of Op do we demand, given the mask's granularity?
39694 APInt OpDemandedElts(Mask.size(), 0);
39695 for (int MaskElt : Mask) {
39696 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
39697 int OpEltIdx = MaskElt - Lo;
39698 OpDemandedElts.setBit(OpEltIdx);
39699 }
39700 }
39701
39702 // Is the shuffle result smaller than the root?
39703 if (Op.getValueSizeInBits() < RootSizeInBits) {
39704 // We padded the mask with undefs. But we now need to undo that.
39705 unsigned NumExpectedVectorElts = Mask.size();
39706 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
39707 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
39708 assert(!OpDemandedElts.extractBits(
39709 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
39710 "Demanding the virtual undef widening padding?");
39711 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
39712 }
39713
39714 // The Op itself may be of different VT, so we need to scale the mask.
39715 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
39716 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
39717
39718 // Can this operand be simplified any further, given its demanded elements?
39719 if (SDValue NewOp =
39720 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
39721 Op, OpScaledDemandedElts, DAG))
39722 Op = NewOp;
39723 }
39724 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
39725
39726 // Widen any subvector shuffle inputs we've collected.
39727 // TODO: Remove this to avoid generating temporary nodes, we should only
39728 // widen once combineX86ShuffleChain has found a match.
39729 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
39730 return Op.getValueSizeInBits() < RootSizeInBits;
39731 })) {
39732 for (SDValue &Op : Ops)
39733 if (Op.getValueSizeInBits() < RootSizeInBits)
39734 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
39735 RootSizeInBits);
39736 // Reresolve - we might have repeated subvector sources.
39737 resolveTargetShuffleInputsAndMask(Ops, Mask);
39738 }
39739
39740 // We can only combine unary and binary shuffle mask cases.
39741 if (Ops.size() <= 2) {
39742 // Minor canonicalization of the accumulated shuffle mask to make it easier
39743 // to match below. All this does is detect masks with sequential pairs of
39744 // elements, and shrink them to the half-width mask. It does this in a loop
39745 // so it will reduce the size of the mask to the minimal width mask which
39746 // performs an equivalent shuffle.
39747 while (Mask.size() > 1) {
39748 SmallVector<int, 64> WidenedMask;
39749 if (!canWidenShuffleElements(Mask, WidenedMask))
39750 break;
39751 Mask = std::move(WidenedMask);
39752 }
39753
39754 // Canonicalization of binary shuffle masks to improve pattern matching by
39755 // commuting the inputs.
39756 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
39757 ShuffleVectorSDNode::commuteMask(Mask);
39758 std::swap(Ops[0], Ops[1]);
39759 }
39760
39761 // Try to combine into a single shuffle instruction.
39762 if (SDValue Shuffle = combineX86ShuffleChain(
39763 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
39764 AllowVariablePerLaneMask, DAG, Subtarget))
39765 return Shuffle;
39766
39767 // If all the operands come from the same larger vector, fallthrough and try
39768 // to use combineX86ShuffleChainWithExtract.
39769 SDValue LHS = peekThroughBitcasts(Ops.front());
39770 SDValue RHS = peekThroughBitcasts(Ops.back());
39771 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
39772 (RootSizeInBits / Mask.size()) != 64 ||
39773 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
39774 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
39775 LHS.getOperand(0) != RHS.getOperand(0))
39776 return SDValue();
39777 }
39778
39779 // If that failed and any input is extracted then try to combine as a
39780 // shuffle with the larger type.
39781 return combineX86ShuffleChainWithExtract(
39782 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
39783 AllowVariablePerLaneMask, DAG, Subtarget);
39784}
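// Example (illustrative sketch of the mask merge performed in the recursion
// above, example masks only): if the accumulated root mask is [2,3,0,1] and
// the source op being folded in is itself a shuffle of the same input with
// mask [1,0,3,2], the merged mask is OpMask[RootMask[i]] per element, giving
// [3,2,1,0] - both shuffles collapse into one mask before matching.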
39785
39786/// Helper entry wrapper to combineX86ShufflesRecursively.
39787static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
39788 const X86Subtarget &Subtarget) {
39789 return combineX86ShufflesRecursively(
39790 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
39791 /*HasVarMask*/ false,
39792 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
39793 Subtarget);
39794}
39795
39796/// Get the PSHUF-style mask from PSHUF node.
39797///
39798/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
39799/// PSHUF-style masks that can be reused with such instructions.
39800static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
39801 MVT VT = N.getSimpleValueType();
39802 SmallVector<int, 4> Mask;
39803 SmallVector<SDValue, 2> Ops;
39804 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
39805 (void)HaveMask;
39806 assert(HaveMask);
39807
39808 // If we have more than 128-bits, only the low 128-bits of shuffle mask
39809 // matter. Check that the upper masks are repeats and remove them.
39810 if (VT.getSizeInBits() > 128) {
39811 int LaneElts = 128 / VT.getScalarSizeInBits();
39812#ifndef NDEBUG
39813 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
39814 for (int j = 0; j < LaneElts; ++j)
39815 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
39816 "Mask doesn't repeat in high 128-bit lanes!");
39817#endif
39818 Mask.resize(LaneElts);
39819 }
39820
39821 switch (N.getOpcode()) {
39822 case X86ISD::PSHUFD:
39823 return Mask;
39824 case X86ISD::PSHUFLW:
39825 Mask.resize(4);
39826 return Mask;
39827 case X86ISD::PSHUFHW:
39828 Mask.erase(Mask.begin(), Mask.begin() + 4);
39829 for (int &M : Mask)
39830 M -= 4;
39831 return Mask;
39832 default:
39833 llvm_unreachable("No valid shuffle instruction found!");
39834 }
39835}
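// Example (illustrative, mask values chosen for exposition): a v8i16 PSHUFHW
// producing the element mask [0,1,2,3,7,6,5,4] leaves the low quad untouched,
// so after dropping the first four entries and rebasing by -4 the returned v4
// mask is [3,2,1,0], matching the 2-bit-per-element immediate form shared with
// PSHUFD and PSHUFLW.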
39836
39837/// Search for a combinable shuffle across a chain ending in pshufd.
39838///
39839/// We walk up the chain and look for a combinable shuffle, skipping over
39840/// shuffles that we could hoist this shuffle's transformation past without
39841/// altering anything.
39842static SDValue combineRedundantDwordShuffle(SDValue N,
39843 MutableArrayRef<int> Mask,
39844 const SDLoc &DL,
39845 SelectionDAG &DAG) {
39846 assert(N.getOpcode() == X86ISD::PSHUFD &&
39847 "Called with something other than an x86 128-bit half shuffle!");
39848
39849 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
39850 // of the shuffles in the chain so that we can form a fresh chain to replace
39851 // this one.
39852 SmallVector<SDValue, 8> Chain;
39853 SDValue V = N.getOperand(0);
39854 for (; V.hasOneUse(); V = V.getOperand(0)) {
39855 switch (V.getOpcode()) {
39856 default:
39857 return SDValue(); // Nothing combined!
39858
39859 case ISD::BITCAST:
39860 // Skip bitcasts as we always know the type for the target specific
39861 // instructions.
39862 continue;
39863
39864 case X86ISD::PSHUFD:
39865 // Found another dword shuffle.
39866 break;
39867
39868 case X86ISD::PSHUFLW:
39869 // Check that the low words (being shuffled) are the identity in the
39870 // dword shuffle, and the high words are self-contained.
39871 if (Mask[0] != 0 || Mask[1] != 1 ||
39872 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
39873 return SDValue();
39874
39875 Chain.push_back(V);
39876 continue;
39877
39878 case X86ISD::PSHUFHW:
39879 // Check that the high words (being shuffled) are the identity in the
39880 // dword shuffle, and the low words are self-contained.
39881 if (Mask[2] != 2 || Mask[3] != 3 ||
39882 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
39883 return SDValue();
39884
39885 Chain.push_back(V);
39886 continue;
39887
39888 case X86ISD::UNPCKL:
39889 case X86ISD::UNPCKH:
39890 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
39891 // shuffle into a preceding word shuffle.
39892 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
39893 V.getSimpleValueType().getVectorElementType() != MVT::i16)
39894 return SDValue();
39895
39896 // Search for a half-shuffle which we can combine with.
39897 unsigned CombineOp =
39898 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
39899 if (V.getOperand(0) != V.getOperand(1) ||
39900 !V->isOnlyUserOf(V.getOperand(0).getNode()))
39901 return SDValue();
39902 Chain.push_back(V);
39903 V = V.getOperand(0);
39904 do {
39905 switch (V.getOpcode()) {
39906 default:
39907 return SDValue(); // Nothing to combine.
39908
39909 case X86ISD::PSHUFLW:
39910 case X86ISD::PSHUFHW:
39911 if (V.getOpcode() == CombineOp)
39912 break;
39913
39914 Chain.push_back(V);
39915
39916 [[fallthrough]];
39917 case ISD::BITCAST:
39918 V = V.getOperand(0);
39919 continue;
39920 }
39921 break;
39922 } while (V.hasOneUse());
39923 break;
39924 }
39925 // Break out of the loop if we break out of the switch.
39926 break;
39927 }
39928
39929 if (!V.hasOneUse())
39930 // We fell out of the loop without finding a viable combining instruction.
39931 return SDValue();
39932
39933 // Merge this node's mask and our incoming mask.
39934 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
39935 for (int &M : Mask)
39936 M = VMask[M];
39937 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
39938 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
39939
39940 // Rebuild the chain around this new shuffle.
39941 while (!Chain.empty()) {
39942 SDValue W = Chain.pop_back_val();
39943
39944 if (V.getValueType() != W.getOperand(0).getValueType())
39945 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
39946
39947 switch (W.getOpcode()) {
39948 default:
39949 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
39950
39951 case X86ISD::UNPCKL:
39952 case X86ISD::UNPCKH:
39953 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
39954 break;
39955
39956 case X86ISD::PSHUFD:
39957 case X86ISD::PSHUFLW:
39958 case X86ISD::PSHUFHW:
39959 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
39960 break;
39961 }
39962 }
39963 if (V.getValueType() != N.getValueType())
39964 V = DAG.getBitcast(N.getValueType(), V);
39965
39966 // Return the new chain to replace N.
39967 return V;
39968}
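// Example (illustrative sketch of the merge above, example masks only): for
// N = PSHUFD(V, [2,3,0,1]) with V = PSHUFD(W, [1,0,3,2]) and V single-use,
// each incoming mask element is rewritten as M = VMask[M], i.e. [3,2,1,0],
// and a single PSHUFD(W, [3,2,1,0]) replaces the pair; any skipped
// PSHUFLW/PSHUFHW links are rebuilt around the new node afterwards.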
39969
39970// Attempt to commute shufps LHS loads:
39971// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
39972static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
39973 SelectionDAG &DAG) {
39974 // TODO: Add vXf64 support.
39975 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
39976 return SDValue();
39977
39978 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
39979 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
39980 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
39981 return SDValue();
39982 SDValue N0 = V.getOperand(0);
39983 SDValue N1 = V.getOperand(1);
39984 unsigned Imm = V.getConstantOperandVal(2);
39985 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
39986 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
39987 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
39988 return SDValue();
39989 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
39990 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
39991 DAG.getTargetConstant(Imm, DL, MVT::i8));
39992 };
39993
39994 switch (N.getOpcode()) {
39995 case X86ISD::VPERMILPI:
39996 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
39997 unsigned Imm = N.getConstantOperandVal(1);
39998 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
39999 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
40000 }
40001 break;
40002 case X86ISD::SHUFP: {
40003 SDValue N0 = N.getOperand(0);
40004 SDValue N1 = N.getOperand(1);
40005 unsigned Imm = N.getConstantOperandVal(2);
40006 if (N0 == N1) {
40007 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
40008 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
40009 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
40010 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
40011 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
40012 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
40013 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
40014 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
40015 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
40016 }
40017 break;
40018 }
40019 }
40020
40021 return SDValue();
40022}
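// Example (illustrative, values chosen for exposition): SHUFP(A, B, imm) takes
// its low two lanes from A and the high two from B, so commuting the operands
// swaps the two selector nibbles: imm 0xB1 becomes 0x1B. The outer VPERMILPI
// then sees the two 64-bit halves of the shufps result swapped, so each of its
// 2-bit element indices gets bit 1 flipped, i.e. imm ^ 0xAA.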
40023
40024// TODO - move this to TLI like isBinOp?
40025static bool isUnaryOp(unsigned Opcode) {
40026 switch (Opcode) {
40027 case ISD::CTLZ:
40028 case ISD::CTTZ:
40029 case ISD::CTPOP:
40030 return true;
40031 }
40032 return false;
40033}
40034
40035// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
40036// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
40037static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
40038 const SDLoc &DL) {
40039 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40040 EVT ShuffleVT = N.getValueType();
40041 unsigned Opc = N.getOpcode();
40042
40043 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true,
40044 bool FoldLoad = false) {
40045 // AllZeros/AllOnes constants are freely shuffled and will peek through
40046 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
40047 // merge with target shuffles if it has one use so shuffle combining is
40048 // likely to kick in. Shuffles of splats are expected to be removed.
40049 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
40050 ISD::isBuildVectorAllZeros(Op.getNode()) ||
40051 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
40052 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
40053 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
40054 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
40055 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
40056 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
40057 (FoldLoad && isShuffleFoldableLoad(Op)) ||
40058 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
40059 };
40060 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
40061 // Ensure we only shuffle whole vector src elements, unless it's a logical
40062 // binop where we can more aggressively move shuffles from dst to src.
40063 return isLogicOp(BinOp) ||
40064 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
40065 };
40066
40067 switch (Opc) {
40068 // Unary and Unary+Permute Shuffles.
40069 case X86ISD::PSHUFB: {
40070 // Don't merge PSHUFB if it contains zero'd elements.
40071 SmallVector<int> Mask;
40072 SmallVector<SDValue> Ops;
40073 if (!getTargetShuffleMask(N, false, Ops, Mask))
40074 break;
40075 [[fallthrough]];
40076 }
40077 case X86ISD::VBROADCAST:
40078 case X86ISD::MOVDDUP:
40079 case X86ISD::PSHUFD:
40080 case X86ISD::PSHUFHW:
40081 case X86ISD::PSHUFLW:
40082 case X86ISD::VPERMI:
40083 case X86ISD::VPERMILPI: {
40084 if (N.getOperand(0).getValueType() == ShuffleVT &&
40085 N->isOnlyUserOf(N.getOperand(0).getNode())) {
40086 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40087 unsigned SrcOpcode = N0.getOpcode();
40088 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
40089 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40090 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40091 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::VPERMI,
40092 Opc != X86ISD::PSHUFB) ||
40093 IsMergeableWithShuffle(Op01, Opc != X86ISD::VPERMI,
40094 Opc != X86ISD::PSHUFB)) {
40095 SDValue LHS, RHS;
40096 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40097 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40098 if (N.getNumOperands() == 2) {
40099 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
40100 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
40101 } else {
40102 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
40103 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
40104 }
40105 EVT OpVT = N0.getValueType();
40106 return DAG.getBitcast(ShuffleVT,
40107 DAG.getNode(SrcOpcode, DL, OpVT,
40108 DAG.getBitcast(OpVT, LHS),
40109 DAG.getBitcast(OpVT, RHS)));
40110 }
40111 }
40112 }
40113 break;
40114 }
40115 // Binary and Binary+Permute Shuffles.
40116 case X86ISD::INSERTPS: {
40117 // Don't merge INSERTPS if it contains zero'd elements.
40118 unsigned InsertPSMask = N.getConstantOperandVal(2);
40119 unsigned ZeroMask = InsertPSMask & 0xF;
40120 if (ZeroMask != 0)
40121 break;
40122 [[fallthrough]];
40123 }
40124 case X86ISD::MOVSD:
40125 case X86ISD::MOVSS:
40126 case X86ISD::BLENDI:
40127 case X86ISD::SHUFP:
40128 case X86ISD::UNPCKH:
40129 case X86ISD::UNPCKL: {
40130 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
40131 N->isOnlyUserOf(N.getOperand(1).getNode())) {
40132 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40133 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
40134 unsigned SrcOpcode = N0.getOpcode();
40135 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
40136 N0.getValueType() == N1.getValueType() &&
40137 IsSafeToMoveShuffle(N0, SrcOpcode) &&
40138 IsSafeToMoveShuffle(N1, SrcOpcode)) {
40139 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40140 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
40141 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40142 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
40143 // Ensure the total number of shuffles doesn't increase by folding this
40144 // shuffle through to the source ops.
40145 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
40146 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
40147 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
40148 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
40149 SDValue LHS, RHS;
40150 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40151 Op10 = DAG.getBitcast(ShuffleVT, Op10);
40152 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40153 Op11 = DAG.getBitcast(ShuffleVT, Op11);
40154 if (N.getNumOperands() == 3) {
40155 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
40156 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
40157 } else {
40158 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
40159 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
40160 }
40161 EVT OpVT = N0.getValueType();
40162 return DAG.getBitcast(ShuffleVT,
40163 DAG.getNode(SrcOpcode, DL, OpVT,
40164 DAG.getBitcast(OpVT, LHS),
40165 DAG.getBitcast(OpVT, RHS)));
40166 }
40167 }
40168 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
40169 N0.getValueType() == N1.getValueType() &&
40170 IsSafeToMoveShuffle(N0, SrcOpcode) &&
40171 IsSafeToMoveShuffle(N1, SrcOpcode)) {
40172 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40173 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
40174 SDValue Res;
40175 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40176 Op10 = DAG.getBitcast(ShuffleVT, Op10);
40177 if (N.getNumOperands() == 3) {
40178 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
40179 } else {
40180 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
40181 }
40182 EVT OpVT = N0.getValueType();
40183 return DAG.getBitcast(
40184 ShuffleVT,
40185 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
40186 }
40187 }
40188 break;
40189 }
40190 }
40191 return SDValue();
40192}
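// Example (illustrative, not from the original source): for
// PSHUFD(ADD(X, splat(1)), imm), the splat operand shuffles for free, so the
// node can be rewritten as ADD(PSHUFD(X, imm), PSHUFD(splat(1), imm)); the
// shuffle of the splat can then fold away, leaving the remaining shuffle
// directly on X where further shuffle combining can see it.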
40193
40194/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
40195static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
40196 SelectionDAG &DAG,
40197 const SDLoc &DL) {
40198 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
40199
40200 MVT VT = V.getSimpleValueType();
40201 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
40202 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
40203 unsigned SrcOpc0 = Src0.getOpcode();
40204 unsigned SrcOpc1 = Src1.getOpcode();
40205 EVT SrcVT0 = Src0.getValueType();
40206 EVT SrcVT1 = Src1.getValueType();
40207
40208 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
40209 return SDValue();
40210
40211 switch (SrcOpc0) {
40212 case X86ISD::MOVDDUP: {
40213 SDValue LHS = Src0.getOperand(0);
40214 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40215 SDValue Res =
40216 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
40217 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
40218 return DAG.getBitcast(VT, Res);
40219 }
40220 case X86ISD::VPERMILPI:
40221 // TODO: Handle v4f64 permutes with different low/high lane masks.
40222 if (SrcVT0 == MVT::v4f64) {
40223 uint64_t Mask = Src0.getConstantOperandVal(1);
40224 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
40225 break;
40226 }
40227 [[fallthrough]];
40228 case X86ISD::VSHLI:
40229 case X86ISD::VSRLI:
40230 case X86ISD::VSRAI:
40231 case X86ISD::PSHUFD:
40232 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
40233 SDValue LHS = Src0.getOperand(0);
40234 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40235 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
40236 V.getOperand(2));
40237 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
40238 return DAG.getBitcast(VT, Res);
40239 }
40240 break;
40241 }
40242
40243 return SDValue();
40244}
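// Example (illustrative, not from the original source): with matching per-lane
// ops on both sides, e.g. VPERM2X128(MOVDDUP(X), MOVDDUP(Y), imm), the lane
// shuffle is pushed through the repeated op to give
// MOVDDUP(VPERM2X128(X, Y, imm)), leaving a single duplicated op to combine.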
40245
40246/// Try to combine x86 target specific shuffles.
40247static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
40248 SelectionDAG &DAG,
40249 TargetLowering::DAGCombinerInfo &DCI,
40250 const X86Subtarget &Subtarget) {
40251 MVT VT = N.getSimpleValueType();
40252 SmallVector<int, 4> Mask;
40253 unsigned Opcode = N.getOpcode();
40254 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40255
40256 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
40257 return R;
40258
40259 // Handle specific target shuffles.
40260 switch (Opcode) {
40261 case X86ISD::MOVDDUP: {
40262 SDValue Src = N.getOperand(0);
40263 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
40264 if (VT == MVT::v2f64 && Src.hasOneUse() &&
40265 ISD::isNormalLoad(Src.getNode())) {
40266 LoadSDNode *LN = cast<LoadSDNode>(Src);
40267 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
40268 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
40269 DCI.CombineTo(N.getNode(), Movddup);
40270 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40271 DCI.recursivelyDeleteUnusedNodes(LN);
40272 return N; // Return N so it doesn't get rechecked!
40273 }
40274 }
40275
40276 return SDValue();
40277 }
40278 case X86ISD::VBROADCAST: {
40279 SDValue Src = N.getOperand(0);
40280 SDValue BC = peekThroughBitcasts(Src);
40281 EVT SrcVT = Src.getValueType();
40282 EVT BCVT = BC.getValueType();
40283
40284 // If broadcasting from another shuffle, attempt to simplify it.
40285 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
40286 if (isTargetShuffle(BC.getOpcode()) &&
40287 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
40288 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
40289 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
40290 SM_SentinelUndef);
40291 for (unsigned i = 0; i != Scale; ++i)
40292 DemandedMask[i] = i;
40293 if (SDValue Res = combineX86ShufflesRecursively(
40294 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
40295 X86::MaxShuffleCombineDepth,
40296 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
40297 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
40298 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40299 DAG.getBitcast(SrcVT, Res));
40300 }
40301
40302 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
40303 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
40304 if (Src.getOpcode() == ISD::BITCAST &&
40305 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
40306 TLI.isTypeLegal(BCVT) &&
40307 FixedVectorType::isValidElementType(
40308 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
40309 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
40310 VT.getVectorNumElements());
40311 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40312 }
40313
40314 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
40315 // If we're re-broadcasting a smaller type then broadcast with that type and
40316 // bitcast.
40317 // TODO: Do this for any splat?
40318 if (Src.getOpcode() == ISD::BITCAST &&
40319 (BC.getOpcode() == X86ISD::VBROADCAST ||
40320 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
40321 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
40322 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
40323 MVT NewVT =
40324 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
40325 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
40326 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40327 }
40328
40329 // Reduce broadcast source vector to lowest 128-bits.
40330 if (SrcVT.getSizeInBits() > 128)
40331 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40332 extract128BitVector(Src, 0, DAG, DL));
40333
40334 // broadcast(scalar_to_vector(x)) -> broadcast(x).
40335 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40336 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
40337 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40338
40339 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
40340 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
40341 isNullConstant(Src.getOperand(1)) &&
40342 Src.getValueType() ==
40343 Src.getOperand(0).getValueType().getScalarType() &&
40344 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
40345 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40346
40347 // Share broadcast with the longest vector and extract low subvector (free).
40348 // Ensure the other user reads the same SDValue result, not just the same node.
40349 for (SDNode *User : Src->uses())
40350 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
40351 Src == User->getOperand(0) &&
40352 User->getValueSizeInBits(0).getFixedValue() >
40353 VT.getFixedSizeInBits()) {
40354 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
40355 VT.getSizeInBits());
40356 }
40357
40358 // vbroadcast(scalarload X) -> vbroadcast_load X
40359 // For float loads, extract other uses of the scalar from the broadcast.
40360 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
40361 ISD::isNormalLoad(Src.getNode())) {
40362 LoadSDNode *LN = cast<LoadSDNode>(Src);
40363 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40364 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40365 SDValue BcastLd =
40366 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40367 LN->getMemoryVT(), LN->getMemOperand());
40368 // If the load value is used only by N, replace it via CombineTo N.
40369 bool NoReplaceExtract = Src.hasOneUse();
40370 DCI.CombineTo(N.getNode(), BcastLd);
40371 if (NoReplaceExtract) {
40372 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40373 DCI.recursivelyDeleteUnusedNodes(LN);
40374 } else {
40375 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
40376 DAG.getIntPtrConstant(0, DL));
40377 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
40378 }
40379 return N; // Return N so it doesn't get rechecked!
40380 }
40381
40382 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
40383 // i16. So shrink it ourselves if we can make a broadcast_load.
40384 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
40385 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
40386 assert(Subtarget.hasAVX2() && "Expected AVX2");
40387 SDValue TruncIn = Src.getOperand(0);
40388
40389 // If this is a truncate of a non extending load we can just narrow it to
40390 // use a broadcast_load.
40391 if (ISD::isNormalLoad(TruncIn.getNode())) {
40392 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
40394 // Unless it's volatile or atomic.
40394 if (LN->isSimple()) {
40395 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40396 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40397 SDValue BcastLd = DAG.getMemIntrinsicNode(
40398 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
40399 LN->getPointerInfo(), LN->getOriginalAlign(),
40400 LN->getMemOperand()->getFlags());
40401 DCI.CombineTo(N.getNode(), BcastLd);
40402 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40403 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40404 return N; // Return N so it doesn't get rechecked!
40405 }
40406 }
40407
40408 // If this is a truncate of an i16 extload, we can directly replace it.
40409 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
40410 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
40411 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
40412 if (LN->getMemoryVT().getSizeInBits() == 16) {
40413 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40414 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40415 SDValue BcastLd =
40416 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40417 LN->getMemoryVT(), LN->getMemOperand());
40418 DCI.CombineTo(N.getNode(), BcastLd);
40419 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40420 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40421 return N; // Return N so it doesn't get rechecked!
40422 }
40423 }
40424
40425 // If this is a truncate of a load that has been shifted right, we can
40426 // offset the pointer and use a narrower load.
40427 if (TruncIn.getOpcode() == ISD::SRL &&
40428 TruncIn.getOperand(0).hasOneUse() &&
40429 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
40430 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
40431 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
40432 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
40433 // Make sure the shift amount and the load size are divisible by 16.
40434 // Don't do this if the load is volatile or atomic.
40435 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
40436 LN->isSimple()) {
40437 unsigned Offset = ShiftAmt / 8;
40438 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40439 SDValue Ptr = DAG.getMemBasePlusOffset(
40440 LN->getBasePtr(), TypeSize::getFixed(Offset), DL);
40441 SDValue Ops[] = { LN->getChain(), Ptr };
40442 SDValue BcastLd = DAG.getMemIntrinsicNode(
40443 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
40444 LN->getPointerInfo().getWithOffset(Offset),
40445 LN->getOriginalAlign(),
40446 LN->getMemOperand()->getFlags());
40447 DCI.CombineTo(N.getNode(), BcastLd);
40448 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40449 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40450 return N; // Return N so it doesn't get rechecked!
40451 }
40452 }
40453 }
40454
40455 // vbroadcast(vzload X) -> vbroadcast_load X
40456 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
40457 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
40458 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
40459 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40460 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40461 SDValue BcastLd =
40462 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40463 LN->getMemoryVT(), LN->getMemOperand());
40464 DCI.CombineTo(N.getNode(), BcastLd);
40465 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40466 DCI.recursivelyDeleteUnusedNodes(LN);
40467 return N; // Return N so it doesn't get rechecked!
40468 }
40469 }
40470
40471 // vbroadcast(vector load X) -> vbroadcast_load
40472 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
40473 SrcVT == MVT::v4i32) &&
40474 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
40475 LoadSDNode *LN = cast<LoadSDNode>(Src);
40476 // Unless the load is volatile or atomic.
40477 if (LN->isSimple()) {
40478 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40479 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
40480 SDValue BcastLd = DAG.getMemIntrinsicNode(
40481 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
40482 LN->getPointerInfo(), LN->getOriginalAlign(),
40483 LN->getMemOperand()->getFlags());
40484 DCI.CombineTo(N.getNode(), BcastLd);
40485 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40486 DCI.recursivelyDeleteUnusedNodes(LN);
40487 return N; // Return N so it doesn't get rechecked!
40488 }
40489 }
40490
40491 return SDValue();
40492 }
40493 case X86ISD::VZEXT_MOVL: {
40494 SDValue N0 = N.getOperand(0);
40495
40496 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
40497 // the load is volatile.
40498 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
40499 auto *LN = cast<LoadSDNode>(N0);
40500 if (SDValue VZLoad =
40501 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
40502 DCI.CombineTo(N.getNode(), VZLoad);
40503 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40504 DCI.recursivelyDeleteUnusedNodes(LN);
40505 return N;
40506 }
40507 }
40508
40509 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
40510 // and can just use a VZEXT_LOAD.
40511 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
40512 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
40513 auto *LN = cast<MemSDNode>(N0);
40514 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
40515 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40516 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
40517 SDValue VZLoad =
40518 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
40519 LN->getMemoryVT(), LN->getMemOperand());
40520 DCI.CombineTo(N.getNode(), VZLoad);
40521 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40522 DCI.recursivelyDeleteUnusedNodes(LN);
40523 return N;
40524 }
40525 }
40526
40527 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
40528 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
40529 // if the upper bits of the i64 are zero.
40530 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40531 N0.getOperand(0).hasOneUse() &&
40532 N0.getOperand(0).getValueType() == MVT::i64) {
40533 SDValue In = N0.getOperand(0);
40534 APInt Mask = APInt::getHighBitsSet(64, 32);
40535 if (DAG.MaskedValueIsZero(In, Mask)) {
40536 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
40537 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
40538 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
40539 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
40540 return DAG.getBitcast(VT, Movl);
40541 }
40542 }
40543
40544 // Load a scalar integer constant directly to XMM instead of transferring an
40545 // immediate value from GPR.
40546 // vzext_movl (scalar_to_vector C) --> load [C,0...]
40547 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40548 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
40549 // Create a vector constant - scalar constant followed by zeros.
40550 EVT ScalarVT = N0.getOperand(0).getValueType();
40551 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
40552 unsigned NumElts = VT.getVectorNumElements();
40553 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
40554 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
40555 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
40556
40557 // Load the vector constant from constant pool.
40558 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
40559 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
40560 MachinePointerInfo MPI =
40561 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
40562 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
40563 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
40564 MachineMemOperand::MOLoad);
40565 }
40566 }
40567
40568 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
40569 // insert into a zero vector. This helps get VZEXT_MOVL closer to
40570 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
40571 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
40572 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
40573 SDValue V = peekThroughOneUseBitcasts(N0);
40574
40575 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
40576 isNullConstant(V.getOperand(2))) {
40577 SDValue In = V.getOperand(1);
40578 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
40579 In.getValueSizeInBits() /
40580 VT.getScalarSizeInBits());
40581 In = DAG.getBitcast(SubVT, In);
40582 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
40583 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
40584 getZeroVector(VT, Subtarget, DAG, DL), Movl,
40585 V.getOperand(2));
40586 }
40587 }
40588
40589 return SDValue();
40590 }
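// Example (illustrative sketch of the VZEXT_MOVL i64 narrowing above): if X is
// an i64 whose top 32 bits are known zero, then
// (v2i64 (vzext_movl (scalar_to_vector X))) carries the same bits as
// (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (trunc X)))))),
// since both leave X's low 32 bits in element 0 and zero everything else.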
40591 case X86ISD::BLENDI: {
40592 SDValue N0 = N.getOperand(0);
40593 SDValue N1 = N.getOperand(1);
40594
40595 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
40596 // TODO: Handle MVT::v16i16 repeated blend mask.
40597 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
40598 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
40599 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
40600 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
40601 SrcVT.getScalarSizeInBits() >= 32) {
40602 unsigned Size = VT.getVectorNumElements();
40603 unsigned NewSize = SrcVT.getVectorNumElements();
40604 APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size);
40605 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
40606 return DAG.getBitcast(
40607 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
40608 N1.getOperand(0),
40609 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
40610 DL, MVT::i8)));
40611 }
40612 }
40613 return SDValue();
40614 }
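// Example (illustrative, mask values chosen for exposition): a v4i64 BLENDI
// with mask 0b0101 whose inputs are bitcasts from v8f32 is rewritten on v8f32
// by doubling each mask bit (ScaleBitMask), giving 0b00110011, and the
// narrower-element blend is then bitcast back to v4i64.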
40615 case X86ISD::SHUFP: {
40616 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
40617 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
40618 // TODO: Support types other than v4f32.
40619 if (VT == MVT::v4f32) {
40620 bool Updated = false;
40621 SmallVector<int> Mask;
40622 SmallVector<SDValue> Ops;
40623 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
40624 for (int i = 0; i != 2; ++i) {
40625 SmallVector<SDValue> SubOps;
40626 SmallVector<int> SubMask, SubScaledMask;
40627 SDValue Sub = peekThroughBitcasts(Ops[i]);
40628 // TODO: Scaling might be easier if we specify the demanded elts.
40629 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
40630 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
40631 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
40632 int Ofs = i * 2;
40633 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
40634 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
40635 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
40636 Updated = true;
40637 }
40638 }
40639 }
40640 if (Updated) {
40641 for (int &M : Mask)
40642 M %= 4;
40643 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
40644 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
40645 }
40646 }
40647 return SDValue();
40648 }
40649 case X86ISD::VPERMI: {
40650 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
40651 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
40652 SDValue N0 = N.getOperand(0);
40653 SDValue N1 = N.getOperand(1);
40654 unsigned EltSizeInBits = VT.getScalarSizeInBits();
40655 if (N0.getOpcode() == ISD::BITCAST &&
40656 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
40657 SDValue Src = N0.getOperand(0);
40658 EVT SrcVT = Src.getValueType();
40659 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
40660 return DAG.getBitcast(VT, Res);
40661 }
40662 return SDValue();
40663 }
40664 case X86ISD::SHUF128: {
40665 // If we're permuting the upper 256-bit subvectors of a concatenation, then
40666 // see if we can peek through and access the subvector directly.
40667 if (VT.is512BitVector()) {
40668 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the
40669 // upper subvector is used.
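// E.g. an immediate of 0b1110 selects lanes {2,3} of the first operand, i.e.
// only its upper 256 bits; once that upper half is widened below, the msbs
// (0x0A) are cleared so the indices address the replacement's low lanes.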
40670 SDValue LHS = N->getOperand(0);
40671 SDValue RHS = N->getOperand(1);
40672 uint64_t Mask = N->getConstantOperandVal(2);
40673 SmallVector<SDValue> LHSOps, RHSOps;
40674 SDValue NewLHS, NewRHS;
40675 if ((Mask & 0x0A) == 0x0A &&
40676 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
40677 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
40678 Mask &= ~0x0A;
40679 }
40680 if ((Mask & 0xA0) == 0xA0 &&
40681 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
40682 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
40683 Mask &= ~0xA0;
40684 }
40685 if (NewLHS || NewRHS)
40686 return DAG.getNode(X86ISD::SHUF128, DL, VT, NewLHS ? NewLHS : LHS,
40687 NewRHS ? NewRHS : RHS,
40688 DAG.getTargetConstant(Mask, DL, MVT::i8));
40689 }
40690 return SDValue();
40691 }
40692 case X86ISD::VPERM2X128: {
40693 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
40694 SDValue LHS = N->getOperand(0);
40695 SDValue RHS = N->getOperand(1);
40696 if (LHS.getOpcode() == ISD::BITCAST &&
40697 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
40698 EVT SrcVT = LHS.getOperand(0).getValueType();
40699 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
40700 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
40701 DAG.getBitcast(SrcVT, LHS),
40702 DAG.getBitcast(SrcVT, RHS),
40703 N->getOperand(2)));
40704 }
40705 }
40706
40707 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
40708 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
40709 return Res;
40710
40711 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
40712 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
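// The immediate's low nibble selects the result's low 128 bits and the high
// nibble the high 128 bits (0=LHS.lo, 1=LHS.hi, 2=RHS.lo, 3=RHS.hi), so e.g.
// an immediate of 0x31 on the concats above yields concat(Y, W).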
40713 auto FindSubVector128 = [&](unsigned Idx) {
40714 if (Idx > 3)
40715 return SDValue();
40716 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
40717 SmallVector<SDValue> SubOps;
40718 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
40719 return SubOps[Idx & 1];
40720 unsigned NumElts = Src.getValueType().getVectorNumElements();
40721 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
40722 Src.getOperand(1).getValueSizeInBits() == 128 &&
40723 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
40724 return Src.getOperand(1);
40725 }
40726 return SDValue();
40727 };
40728 unsigned Imm = N.getConstantOperandVal(2);
40729 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
40730 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
40731 MVT SubVT = VT.getHalfNumVectorElementsVT();
40732 SubLo = DAG.getBitcast(SubVT, SubLo);
40733 SubHi = DAG.getBitcast(SubVT, SubHi);
40734 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
40735 }
40736 }
40737 return SDValue();
40738 }
40739 case X86ISD::PSHUFD:
40740 case X86ISD::PSHUFLW:
40741 case X86ISD::PSHUFHW: {
40742 SDValue N0 = N.getOperand(0);
40743 SDValue N1 = N.getOperand(1);
40744 if (N0->hasOneUse()) {
40745 SDValue V = peekThroughOneUseBitcasts(N0);
40746 switch (V.getOpcode()) {
40747 case X86ISD::VSHL:
40748 case X86ISD::VSRL:
40749 case X86ISD::VSRA:
40750 case X86ISD::VSHLI:
40751 case X86ISD::VSRLI:
40752 case X86ISD::VSRAI:
40753 case X86ISD::VROTLI:
40754 case X86ISD::VROTRI: {
40755 MVT InnerVT = V.getSimpleValueType();
40756 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
40757 SDValue Res = DAG.getNode(Opcode, DL, VT,
40758 DAG.getBitcast(VT, V.getOperand(0)), N1);
40759 Res = DAG.getBitcast(InnerVT, Res);
40760 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
40761 return DAG.getBitcast(VT, Res);
40762 }
40763 break;
40764 }
40765 }
40766 }
40767
40768 Mask = getPSHUFShuffleMask(N);
40769 assert(Mask.size() == 4);
40770 break;
40771 }
40772 case X86ISD::MOVSD:
40773 case X86ISD::MOVSH:
40774 case X86ISD::MOVSS: {
40775 SDValue N0 = N.getOperand(0);
40776 SDValue N1 = N.getOperand(1);
40777
40778 // Canonicalize scalar FPOps:
40779 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
40780 // If commutable, allow OP(N1[0], N0[0]).
40781 unsigned Opcode1 = N1.getOpcode();
40782 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
40783 Opcode1 == ISD::FDIV) {
40784 SDValue N10 = N1.getOperand(0);
40785 SDValue N11 = N1.getOperand(1);
40786 if (N10 == N0 ||
40787 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
40788 if (N10 != N0)
40789 std::swap(N10, N11);
40790 MVT SVT = VT.getVectorElementType();
40791 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
40792 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
40793 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
40794 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
40795 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
40796 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
40797 }
40798 }
40799
40800 return SDValue();
40801 }
40802 case X86ISD::INSERTPS: {
40803 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
40804 SDValue Op0 = N.getOperand(0);
40805 SDValue Op1 = N.getOperand(1);
40806 unsigned InsertPSMask = N.getConstantOperandVal(2);
40807 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
40808 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
40809 unsigned ZeroMask = InsertPSMask & 0xF;
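// INSERTPS immediate layout: bits[7:6] select the element of Op1 to insert,
// bits[5:4] select the destination lane in Op0, and bits[3:0] zero result
// lanes after the insertion.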
40810
40811 // If we zero out all elements from Op0 then we don't need to reference it.
40812 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
40813 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
40814 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40815
40816 // If we zero out the element from Op1 then we don't need to reference it.
40817 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
40818 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
40819 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40820
40821 // Attempt to merge insertps Op1 with an inner target shuffle node.
40822 SmallVector<int, 8> TargetMask1;
40823 SmallVector<SDValue, 2> Ops1;
40824 APInt KnownUndef1, KnownZero1;
40825 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
40826 KnownZero1)) {
40827 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
40828 // Zero/UNDEF insertion - zero out element and remove dependency.
40829 InsertPSMask |= (1u << DstIdx);
40830 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
40831 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40832 }
40833 // Update insertps mask srcidx and reference the source input directly.
40834 int M = TargetMask1[SrcIdx];
40835 assert(0 <= M && M < 8 && "Shuffle index out of range");
40836 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
40837 Op1 = Ops1[M < 4 ? 0 : 1];
40838 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
40839 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40840 }
40841
40842 // Attempt to merge insertps Op0 with an inner target shuffle node.
40843 SmallVector<int, 8> TargetMask0;
40844 SmallVector<SDValue, 2> Ops0;
40845 APInt KnownUndef0, KnownZero0;
40846 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
40847 KnownZero0)) {
40848 bool Updated = false;
40849 bool UseInput00 = false;
40850 bool UseInput01 = false;
40851 for (int i = 0; i != 4; ++i) {
40852 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
40853 // No change if element is already zero or the inserted element.
40854 continue;
40855 }
40856
40857 if (KnownUndef0[i] || KnownZero0[i]) {
40858 // If the target mask is undef/zero then we must zero the element.
40859 InsertPSMask |= (1u << i);
40860 Updated = true;
40861 continue;
40862 }
40863
40864 // The input vector element must be inline.
40865 int M = TargetMask0[i];
40866 if (M != i && M != (i + 4))
40867 return SDValue();
40868
40869 // Determine which inputs of the target shuffle we're using.
40870 UseInput00 |= (0 <= M && M < 4);
40871 UseInput01 |= (4 <= M);
40872 }
40873
40874 // If we're not using both inputs of the target shuffle then use the
40875 // referenced input directly.
40876 if (UseInput00 && !UseInput01) {
40877 Updated = true;
40878 Op0 = Ops0[0];
40879 } else if (!UseInput00 && UseInput01) {
40880 Updated = true;
40881 Op0 = Ops0[1];
40882 }
40883
40884 if (Updated)
40885 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
40886 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40887 }
40888
40889 // If we're inserting an element from a vbroadcast load, fold the
40890 // load into the X86insertps instruction. We need to convert the scalar
40891 // load to a vector and clear the source lane of the INSERTPS control.
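// The reloaded scalar ends up in lane 0 of the SCALAR_TO_VECTOR node, so the
// source-lane field (bits [7:6]) is cleared via the 0x3f mask below.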
40892 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
40893 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
40894 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
40895 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
40896 MemIntr->getBasePtr(),
40897 MemIntr->getMemOperand());
40898 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
40899 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
40900 Load),
40901 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
40902 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
40903 return Insert;
40904 }
40905 }
40906
40907 return SDValue();
40908 }
40909 default:
40910 return SDValue();
40911 }
40912
40913 // Nuke no-op shuffles that show up after combining.
40914 if (isNoopShuffleMask(Mask))
40915 return N.getOperand(0);
40916
40917 // Look for simplifications involving one or two shuffle instructions.
40918 SDValue V = N.getOperand(0);
40919 switch (N.getOpcode()) {
40920 default:
40921 break;
40922 case X86ISD::PSHUFLW:
40923 case X86ISD::PSHUFHW:
40924 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
40925
40926 // See if this reduces to a PSHUFD which is no more expensive and can
40927 // combine with more operations. Note that it has to at least flip the
40928 // dwords as otherwise it would have been removed as a no-op.
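// E.g. PSHUFLW <2,3,0,1> swaps words 0..1 with words 2..3, which is the same
// as swapping dwords 0 and 1, i.e. PSHUFD <1,0,2,3>.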
40929 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
40930 int DMask[] = {0, 1, 2, 3};
40931 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
40932 DMask[DOffset + 0] = DOffset + 1;
40933 DMask[DOffset + 1] = DOffset + 0;
40934 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
40935 V = DAG.getBitcast(DVT, V);
40936 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
40937 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
40938 return DAG.getBitcast(VT, V);
40939 }
40940
40941 // Look for shuffle patterns which can be implemented as a single unpack.
40942 // FIXME: This doesn't handle the location of the PSHUFD generically, and
40943 // only works when we have a PSHUFD followed by two half-shuffles.
40944 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
40945 (V.getOpcode() == X86ISD::PSHUFLW ||
40946 V.getOpcode() == X86ISD::PSHUFHW) &&
40947 V.getOpcode() != N.getOpcode() &&
40948 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
40949 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
40950 if (D.getOpcode() == X86ISD::PSHUFD) {
40951 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
40952 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
40953 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
40954 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
40955 int WordMask[8];
40956 for (int i = 0; i < 4; ++i) {
40957 WordMask[i + NOffset] = Mask[i] + NOffset;
40958 WordMask[i + VOffset] = VMask[i] + VOffset;
40959 }
40960 // Map the word mask through the DWord mask.
40961 int MappedMask[8];
40962 for (int i = 0; i < 8; ++i)
40963 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
40964 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
40965 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
40966 // We can replace all three shuffles with an unpack.
40967 V = DAG.getBitcast(VT, D.getOperand(0));
40968 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
40969 : X86ISD::UNPCKH,
40970 DL, VT, V, V);
40971 }
40972 }
40973 }
40974
40975 break;
40976
40977 case X86ISD::PSHUFD:
40978 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
40979 return NewN;
40980
40981 break;
40982 }
40983
40984 return SDValue();
40985}
40986
40987/// Checks if the shuffle mask takes subsequent elements
40988/// alternately from two vectors.
40989/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
40990static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
40991
40992 int ParitySrc[2] = {-1, -1};
40993 unsigned Size = Mask.size();
40994 for (unsigned i = 0; i != Size; ++i) {
40995 int M = Mask[i];
40996 if (M < 0)
40997 continue;
40998
40999 // Make sure we are using the matching element from the input.
41000 if ((M % Size) != i)
41001 return false;
41002
41003 // Make sure we use the same input for all elements of the same parity.
41004 int Src = M / Size;
41005 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
41006 return false;
41007 ParitySrc[i % 2] = Src;
41008 }
41009
41010 // Make sure each input is used.
41011 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
41012 return false;
41013
41014 Op0Even = ParitySrc[0] == 0;
41015 return true;
41016}
41017
41018 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
41019 /// operation. If true is returned then the operands of the ADDSUB(SUBADD) operation
41020/// are written to the parameters \p Opnd0 and \p Opnd1.
41021///
41022/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
41023/// so it is easier to generically match. We also insert dummy vector shuffle
41024/// nodes for the operands which explicitly discard the lanes which are unused
41025/// by this operation to try to flow through the rest of the combiner
41026/// the fact that they're unused.
41027static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
41028 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
41029 bool &IsSubAdd) {
41030
41031 EVT VT = N->getValueType(0);
41032 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41033 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
41034 !VT.getSimpleVT().isFloatingPoint())
41035 return false;
41036
41037 // We only handle target-independent shuffles.
41038 // FIXME: It would be easy and harmless to use the target shuffle mask
41039 // extraction tool to support more.
41040 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41041 return false;
41042
41043 SDValue V1 = N->getOperand(0);
41044 SDValue V2 = N->getOperand(1);
41045
41046 // Make sure we have an FADD and an FSUB.
41047 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
41048 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
41049 V1.getOpcode() == V2.getOpcode())
41050 return false;
41051
41052 // If there are other uses of these operations we can't fold them.
41053 if (!V1->hasOneUse() || !V2->hasOneUse())
41054 return false;
41055
41056 // Ensure that both operations have the same operands. Note that we can
41057 // commute the FADD operands.
41058 SDValue LHS, RHS;
41059 if (V1.getOpcode() == ISD::FSUB) {
41060 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
41061 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
41062 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
41063 return false;
41064 } else {
41065 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
41066 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
41067 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
41068 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
41069 return false;
41070 }
41071
41072 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41073 bool Op0Even;
41074 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41075 return false;
41076
41077 // It's a subadd if the vector in the even parity is an FADD.
41078 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
41079 : V2->getOpcode() == ISD::FADD;
41080
41081 Opnd0 = LHS;
41082 Opnd1 = RHS;
41083 return true;
41084}
41085
41086/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
41087 static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
41088 const X86Subtarget &Subtarget,
41089 SelectionDAG &DAG) {
41090 // We only handle target-independent shuffles.
41091 // FIXME: It would be easy and harmless to use the target shuffle mask
41092 // extraction tool to support more.
41093 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41094 return SDValue();
41095
41096 MVT VT = N->getSimpleValueType(0);
41097 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41098 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
41099 return SDValue();
41100
41101 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
41102 SDValue Op0 = N->getOperand(0);
41103 SDValue Op1 = N->getOperand(1);
41104 SDValue FMAdd = Op0, FMSub = Op1;
41105 if (FMSub.getOpcode() != X86ISD::FMSUB)
41106 std::swap(FMAdd, FMSub);
41107
41108 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
41109 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
41110 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
41111 FMAdd.getOperand(2) != FMSub.getOperand(2))
41112 return SDValue();
41113
41114 // Check for correct shuffle mask.
41115 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41116 bool Op0Even;
41117 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41118 return SDValue();
41119
41120 // FMAddSub takes zeroth operand from FMSub node.
41121 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
41122 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41123 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
41124 FMAdd.getOperand(2));
41125}
41126
41127/// Try to combine a shuffle into a target-specific add-sub or
41128/// mul-add-sub node.
41129 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
41130 const X86Subtarget &Subtarget,
41131 SelectionDAG &DAG) {
41132 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
41133 return V;
41134
41135 SDValue Opnd0, Opnd1;
41136 bool IsSubAdd;
41137 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
41138 return SDValue();
41139
41140 MVT VT = N->getSimpleValueType(0);
41141
41142 // Try to generate X86ISD::FMADDSUB node here.
41143 SDValue Opnd2;
41144 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
41145 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41146 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
41147 }
41148
41149 if (IsSubAdd)
41150 return SDValue();
41151
41152 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
41153 // the ADDSUB idiom has been successfully recognized. There are no known
41154 // X86 targets with 512-bit ADDSUB instructions!
41155 if (VT.is512BitVector())
41156 return SDValue();
41157
41158 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
41159 // the ADDSUB idiom has been successfully recognized. There are no known
41160 // X86 targets with FP16 ADDSUB instructions!
41161 if (VT.getVectorElementType() == MVT::f16)
41162 return SDValue();
41163
41164 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
41165}
41166
41167// We are looking for a shuffle where both sources are concatenated with undef
41168// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
41169// if we can express this as a single-source shuffle, that's preferable.
41170 static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL,
41171 SelectionDAG &DAG,
41172 const X86Subtarget &Subtarget) {
41173 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
41174 return SDValue();
41175
41176 EVT VT = N->getValueType(0);
41177
41178 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
41179 if (!VT.is128BitVector() && !VT.is256BitVector())
41180 return SDValue();
41181
41182 if (VT.getVectorElementType() != MVT::i32 &&
41183 VT.getVectorElementType() != MVT::i64 &&
41184 VT.getVectorElementType() != MVT::f32 &&
41185 VT.getVectorElementType() != MVT::f64)
41186 return SDValue();
41187
41188 SDValue N0 = N->getOperand(0);
41189 SDValue N1 = N->getOperand(1);
41190
41191 // Check that both sources are concats with undef.
41192 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
41193 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
41194 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
41195 !N1.getOperand(1).isUndef())
41196 return SDValue();
41197
41198 // Construct the new shuffle mask. Elements from the first source retain their
41199 // index, but elements from the second source no longer need to skip an undef.
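// E.g. for v8i32 with 4-element t1/t2, old index 9 (t2's element 1 at position
// 8+1) becomes 9 - 4 = 5, which is t2's element 1 within concat(t1, t2).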
41200 SmallVector<int, 8> Mask;
41201 int NumElts = VT.getVectorNumElements();
41202
41203 auto *SVOp = cast<ShuffleVectorSDNode>(N);
41204 for (int Elt : SVOp->getMask())
41205 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
41206
41206 
41207 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
41208 N1.getOperand(0));
41209 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
41210}
41211
41212/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
41213/// low half of each source vector and does not set any high half elements in
41214/// the destination vector, narrow the shuffle to half its original size.
41215 static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
41216 EVT VT = Shuf->getValueType(0);
41217 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
41218 return SDValue();
41219 if (!VT.is256BitVector() && !VT.is512BitVector())
41220 return SDValue();
41221
41222 // See if we can ignore all of the high elements of the shuffle.
41223 ArrayRef<int> Mask = Shuf->getMask();
41224 if (!isUndefUpperHalf(Mask))
41225 return SDValue();
41226
41227 // Check if the shuffle mask accesses only the low half of each input vector
41228 // (half-index output is 0 or 2).
41229 int HalfIdx1, HalfIdx2;
41230 SmallVector<int, 8> HalfMask(Mask.size() / 2);
41231 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
41232 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
41233 return SDValue();
41234
41235 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
41236 // The trick is knowing that all of the insert/extract are actually free
41237 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
41238 // of narrow inputs into a narrow output, and that is always cheaper than
41239 // the wide shuffle that we started with.
41240 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
41241 Shuf->getOperand(1), HalfMask, HalfIdx1,
41242 HalfIdx2, false, DAG, /*UseConcat*/ true);
41243}
41244
41245 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
41246 TargetLowering::DAGCombinerInfo &DCI,
41247 const X86Subtarget &Subtarget) {
41248 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
41249 if (SDValue V = narrowShuffle(Shuf, DAG))
41250 return V;
41251
41252 // If we have legalized the vector types, look for blends of FADD and FSUB
41253 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
41254 SDLoc dl(N);
41255 EVT VT = N->getValueType(0);
41256 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41257 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
41258 if (SDValue AddSub =
41259 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
41260 return AddSub;
41261
41262 // Attempt to combine into a vector load/broadcast.
41263 if (SDValue LD = combineToConsecutiveLoads(
41264 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
41265 return LD;
41266
41267 // For AVX2, we sometimes want to combine
41268 // (vector_shuffle <mask> (concat_vectors t1, undef)
41269 // (concat_vectors t2, undef))
41270 // Into:
41271 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
41272 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
41273 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget))
41274 return ShufConcat;
41275
41276 if (isTargetShuffle(N->getOpcode())) {
41277 SDValue Op(N, 0);
41278 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
41279 return Shuffle;
41280
41281 // Try recursively combining arbitrary sequences of x86 shuffle
41282 // instructions into higher-order shuffles. We do this after combining
41283 // specific PSHUF instruction sequences into their minimal form so that we
41284 // can evaluate how many specialized shuffle instructions are involved in
41285 // a particular chain.
41286 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
41287 return Res;
41288
41289 // Simplify source operands based on shuffle mask.
41290 // TODO - merge this into combineX86ShufflesRecursively.
41291 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
41292 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
41293 return SDValue(N, 0);
41294
41295 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41296 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41297 // Perform this after other shuffle combines to allow inner shuffles to be
41298 // combined away first.
41299 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
41300 return BinOp;
41301 }
41302
41303 return SDValue();
41304}
41305
41306// Simplify variable target shuffle masks based on the demanded elements.
41307// TODO: Handle DemandedBits in mask indices as well?
41308 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
41309 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
41310 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
41311 // If we're demanding all elements don't bother trying to simplify the mask.
41312 unsigned NumElts = DemandedElts.getBitWidth();
41313 if (DemandedElts.isAllOnes())
41314 return false;
41315
41316 SDValue Mask = Op.getOperand(MaskIndex);
41317 if (!Mask.hasOneUse())
41318 return false;
41319
41320 // Attempt to generically simplify the variable shuffle mask.
41321 APInt MaskUndef, MaskZero;
41322 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
41323 Depth + 1))
41324 return true;
41325
41326 // Attempt to extract+simplify a (constant pool load) shuffle mask.
41327 // TODO: Support other types from getTargetShuffleMaskIndices?
41328 SDValue BC = peekThroughOneUseBitcasts(Mask);
41329 EVT BCVT = BC.getValueType();
41330 auto *Load = dyn_cast<LoadSDNode>(BC);
41331 if (!Load || !Load->getBasePtr().hasOneUse())
41332 return false;
41333
41334 const Constant *C = getTargetConstantFromNode(Load);
41335 if (!C)
41336 return false;
41337
41338 Type *CTy = C->getType();
41339 if (!CTy->isVectorTy() ||
41340 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
41341 return false;
41342
41343 // Handle scaling for i64 elements on 32-bit targets.
41344 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
41345 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
41346 return false;
41347 unsigned Scale = NumCstElts / NumElts;
41348
41349 // Simplify mask if we have an undemanded element that is not undef.
41350 bool Simplified = false;
41351 SmallVector<Constant *, 32> ConstVecOps;
41352 for (unsigned i = 0; i != NumCstElts; ++i) {
41353 Constant *Elt = C->getAggregateElement(i);
41354 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
41355 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
41356 Simplified = true;
41357 continue;
41358 }
41359 ConstVecOps.push_back(Elt);
41360 }
41361 if (!Simplified)
41362 return false;
41363
41364 // Generate new constant pool entry + legalize immediately for the load.
41365 SDLoc DL(Op);
41366 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
41367 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
41368 SDValue NewMask = TLO.DAG.getLoad(
41369 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
41370 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
41371 Load->getAlign());
41372 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
41373}
41374
41375 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
41376 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
41377 TargetLoweringOpt &TLO, unsigned Depth) const {
41378 int NumElts = DemandedElts.getBitWidth();
41379 unsigned Opc = Op.getOpcode();
41380 EVT VT = Op.getValueType();
41381
41382 // Handle special case opcodes.
41383 switch (Opc) {
41384 case X86ISD::PMULDQ:
41385 case X86ISD::PMULUDQ: {
41386 APInt LHSUndef, LHSZero;
41387 APInt RHSUndef, RHSZero;
41388 SDValue LHS = Op.getOperand(0);
41389 SDValue RHS = Op.getOperand(1);
41390 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41391 Depth + 1))
41392 return true;
41393 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41394 Depth + 1))
41395 return true;
41396 // Multiply by zero.
41397 KnownZero = LHSZero | RHSZero;
41398 break;
41399 }
41400 case X86ISD::VPMADDWD: {
41401 APInt LHSUndef, LHSZero;
41402 APInt RHSUndef, RHSZero;
41403 SDValue LHS = Op.getOperand(0);
41404 SDValue RHS = Op.getOperand(1);
41405 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
41406
41407 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
41408 Depth + 1))
41409 return true;
41410 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
41411 Depth + 1))
41412 return true;
41413
41414 // TODO: Multiply by zero.
41415
41416 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
41417 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
41418 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
41419 Depth + 1))
41420 return true;
41421 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
41422 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
41423 Depth + 1))
41424 return true;
41425 break;
41426 }
41427 case X86ISD::PSADBW: {
41428 SDValue LHS = Op.getOperand(0);
41429 SDValue RHS = Op.getOperand(1);
41430 assert(VT.getScalarType() == MVT::i64 &&
41431 LHS.getValueType() == RHS.getValueType() &&
41432 LHS.getValueType().getScalarType() == MVT::i8 &&
41433 "Unexpected PSADBW types");
41434
41435 // Aggressively peek through ops to get at the demanded elts.
41436 if (!DemandedElts.isAllOnes()) {
41437 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
41438 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
41439 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
41440 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41441 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
41442 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41443 if (NewLHS || NewRHS) {
41444 NewLHS = NewLHS ? NewLHS : LHS;
41445 NewRHS = NewRHS ? NewRHS : RHS;
41446 return TLO.CombineTo(
41447 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
41448 }
41449 }
41450 break;
41451 }
41452 case X86ISD::VSHL:
41453 case X86ISD::VSRL:
41454 case X86ISD::VSRA: {
41455 // We only need the bottom 64-bits of the (128-bit) shift amount.
41456 SDValue Amt = Op.getOperand(1);
41457 MVT AmtVT = Amt.getSimpleValueType();
41458 assert(AmtVT.is128BitVector() && "Unexpected value type");
41459
41460 // If the shift amount is only ever reused as an SSE shift amount then we
41461 // know that only its bottom 64 bits are ever used.
41462 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
41463 unsigned UseOpc = Use->getOpcode();
41464 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
41465 UseOpc == X86ISD::VSRA) &&
41466 Use->getOperand(0) != Amt;
41467 });
41468
41469 APInt AmtUndef, AmtZero;
41470 unsigned NumAmtElts = AmtVT.getVectorNumElements();
41471 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
41472 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
41473 Depth + 1, AssumeSingleUse))
41474 return true;
41475 [[fallthrough]];
41476 }
41477 case X86ISD::VSHLI:
41478 case X86ISD::VSRLI:
41479 case X86ISD::VSRAI: {
41480 SDValue Src = Op.getOperand(0);
41481 APInt SrcUndef;
41482 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
41483 Depth + 1))
41484 return true;
41485
41486 // Fold shift(0,x) -> 0
41487 if (DemandedElts.isSubsetOf(KnownZero))
41488 return TLO.CombineTo(
41489 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41490
41491 // Aggressively peek through ops to get at the demanded elts.
41492 if (!DemandedElts.isAllOnes())
41493 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
41494 Src, DemandedElts, TLO.DAG, Depth + 1))
41495 return TLO.CombineTo(
41496 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
41497 break;
41498 }
41499 case X86ISD::VPSHA:
41500 case X86ISD::VPSHL:
41501 case X86ISD::VSHLV:
41502 case X86ISD::VSRLV:
41503 case X86ISD::VSRAV: {
41504 APInt LHSUndef, LHSZero;
41505 APInt RHSUndef, RHSZero;
41506 SDValue LHS = Op.getOperand(0);
41507 SDValue RHS = Op.getOperand(1);
41508 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41509 Depth + 1))
41510 return true;
41511
41512 // Fold shift(0,x) -> 0
41513 if (DemandedElts.isSubsetOf(LHSZero))
41514 return TLO.CombineTo(
41515 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41516
41517 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41518 Depth + 1))
41519 return true;
41520
41521 KnownZero = LHSZero;
41522 break;
41523 }
41524 case X86ISD::PCMPEQ:
41525 case X86ISD::PCMPGT: {
41526 APInt LHSUndef, LHSZero;
41527 APInt RHSUndef, RHSZero;
41528 SDValue LHS = Op.getOperand(0);
41529 SDValue RHS = Op.getOperand(1);
41530 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41531 Depth + 1))
41532 return true;
41533 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41534 Depth + 1))
41535 return true;
41536 break;
41537 }
41538 case X86ISD::KSHIFTL: {
41539 SDValue Src = Op.getOperand(0);
41540 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
41541 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
41542 unsigned ShiftAmt = Amt->getZExtValue();
41543
41544 if (ShiftAmt == 0)
41545 return TLO.CombineTo(Op, Src);
41546
41547 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
41548 // single shift. We can do this if the bottom bits (which are shifted
41549 // out) are never demanded.
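// E.g. kshiftl(kshiftr(X, 3), 5) -> kshiftl(X, 2), which is safe here because
// the bottom 5 lanes (the only ones that could differ) are not demanded.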
41550 if (Src.getOpcode() == X86ISD::KSHIFTR) {
41551 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
41552 unsigned C1 = Src.getConstantOperandVal(1);
41553 unsigned NewOpc = X86ISD::KSHIFTL;
41554 int Diff = ShiftAmt - C1;
41555 if (Diff < 0) {
41556 Diff = -Diff;
41557 NewOpc = X86ISD::KSHIFTR;
41558 }
41559
41560 SDLoc dl(Op);
41561 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
41562 return TLO.CombineTo(
41563 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
41564 }
41565 }
41566
41567 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
41568 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
41569 Depth + 1))
41570 return true;
41571
41572 KnownUndef <<= ShiftAmt;
41573 KnownZero <<= ShiftAmt;
41574 KnownZero.setLowBits(ShiftAmt);
41575 break;
41576 }
41577 case X86ISD::KSHIFTR: {
41578 SDValue Src = Op.getOperand(0);
41579 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
41580 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
41581 unsigned ShiftAmt = Amt->getZExtValue();
41582
41583 if (ShiftAmt == 0)
41584 return TLO.CombineTo(Op, Src);
41585
41586 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
41587 // single shift. We can do this if the top bits (which are shifted
41588 // out) are never demanded.
41589 if (Src.getOpcode() == X86ISD::KSHIFTL) {
41590 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
41591 unsigned C1 = Src.getConstantOperandVal(1);
41592 unsigned NewOpc = X86ISD::KSHIFTR;
41593 int Diff = ShiftAmt - C1;
41594 if (Diff < 0) {
41595 Diff = -Diff;
41596 NewOpc = X86ISD::KSHIFTL;
41597 }
41598
41599 SDLoc dl(Op);
41600 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
41601 return TLO.CombineTo(
41602 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
41603 }
41604 }
41605
41606 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
41607 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
41608 Depth + 1))
41609 return true;
41610
41611 KnownUndef.lshrInPlace(ShiftAmt);
41612 KnownZero.lshrInPlace(ShiftAmt);
41613 KnownZero.setHighBits(ShiftAmt);
41614 break;
41615 }
41616 case X86ISD::ANDNP: {
41617 // ANDNP = (~LHS & RHS);
41618 SDValue LHS = Op.getOperand(0);
41619 SDValue RHS = Op.getOperand(1);
41620
41621 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
41622 APInt UndefElts;
41623 SmallVector<APInt> EltBits;
41624 int NumElts = VT.getVectorNumElements();
41625 int EltSizeInBits = VT.getScalarSizeInBits();
41626 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
41627 APInt OpElts = DemandedElts;
41628 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41629 EltBits)) {
41630 OpBits.clearAllBits();
41631 OpElts.clearAllBits();
41632 for (int I = 0; I != NumElts; ++I) {
41633 if (!DemandedElts[I])
41634 continue;
41635 if (UndefElts[I]) {
41636 // We can't assume an undef src element gives an undef dst - the
41637 // other src might be zero.
41638 OpBits.setAllBits();
41639 OpElts.setBit(I);
41640 } else if ((Invert && !EltBits[I].isAllOnes()) ||
41641 (!Invert && !EltBits[I].isZero())) {
41642 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
41643 OpElts.setBit(I);
41644 }
41645 }
41646 }
41647 return std::make_pair(OpBits, OpElts);
41648 };
41649 APInt BitsLHS, EltsLHS;
41650 APInt BitsRHS, EltsRHS;
41651 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
41652 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
41653
41654 APInt LHSUndef, LHSZero;
41655 APInt RHSUndef, RHSZero;
41656 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
41657 Depth + 1))
41658 return true;
41659 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
41660 Depth + 1))
41661 return true;
41662
41663 if (!DemandedElts.isAllOnes()) {
41664 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
41665 TLO.DAG, Depth + 1);
41666 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
41667 TLO.DAG, Depth + 1);
41668 if (NewLHS || NewRHS) {
41669 NewLHS = NewLHS ? NewLHS : LHS;
41670 NewRHS = NewRHS ? NewRHS : RHS;
41671 return TLO.CombineTo(
41672 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
41673 }
41674 }
41675 break;
41676 }
41677 case X86ISD::CVTSI2P:
41678 case X86ISD::CVTUI2P:
41679 case X86ISD::CVTPH2PS:
41680 case X86ISD::CVTPS2PH: {
41681 SDValue Src = Op.getOperand(0);
41682 MVT SrcVT = Src.getSimpleValueType();
41683 APInt SrcUndef, SrcZero;
41684 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
41685 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
41686 Depth + 1))
41687 return true;
41688 break;
41689 }
41690 case X86ISD::PACKSS:
41691 case X86ISD::PACKUS: {
41692 SDValue N0 = Op.getOperand(0);
41693 SDValue N1 = Op.getOperand(1);
41694
41695 APInt DemandedLHS, DemandedRHS;
41696 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
41697
41698 APInt LHSUndef, LHSZero;
41699 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
41700 Depth + 1))
41701 return true;
41702 APInt RHSUndef, RHSZero;
41703 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
41704 Depth + 1))
41705 return true;
41706
41707 // TODO - pass on known zero/undef.
41708
41709 // Aggressively peek through ops to get at the demanded elts.
41710 // TODO - we should do this for all target/faux shuffles ops.
41711 if (!DemandedElts.isAllOnes()) {
41712 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
41713 TLO.DAG, Depth + 1);
41714 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
41715 TLO.DAG, Depth + 1);
41716 if (NewN0 || NewN1) {
41717 NewN0 = NewN0 ? NewN0 : N0;
41718 NewN1 = NewN1 ? NewN1 : N1;
41719 return TLO.CombineTo(Op,
41720 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
41721 }
41722 }
41723 break;
41724 }
41725 case X86ISD::HADD:
41726 case X86ISD::HSUB:
41727 case X86ISD::FHADD:
41728 case X86ISD::FHSUB: {
41729 SDValue N0 = Op.getOperand(0);
41730 SDValue N1 = Op.getOperand(1);
41731
41732 APInt DemandedLHS, DemandedRHS;
41733 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
41734
41735 APInt LHSUndef, LHSZero;
41736 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
41737 Depth + 1))
41738 return true;
41739 APInt RHSUndef, RHSZero;
41740 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
41741 Depth + 1))
41742 return true;
41743
41744 // TODO - pass on known zero/undef.
41745
41746 // Aggressively peek through ops to get at the demanded elts.
41747 // TODO: Handle repeated operands.
41748 if (N0 != N1 && !DemandedElts.isAllOnes()) {
41749 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
41750 TLO.DAG, Depth + 1);
41751 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
41752 TLO.DAG, Depth + 1);
41753 if (NewN0 || NewN1) {
41754 NewN0 = NewN0 ? NewN0 : N0;
41755 NewN1 = NewN1 ? NewN1 : N1;
41756 return TLO.CombineTo(Op,
41757 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
41758 }
41759 }
41760 break;
41761 }
41762 case X86ISD::VTRUNC:
41763 case X86ISD::VTRUNCS:
41764 case X86ISD::VTRUNCUS: {
41765 SDValue Src = Op.getOperand(0);
41766 MVT SrcVT = Src.getSimpleValueType();
41767 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
41768 APInt SrcUndef, SrcZero;
41769 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
41770 Depth + 1))
41771 return true;
41772 KnownZero = SrcZero.zextOrTrunc(NumElts);
41773 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
41774 break;
41775 }
41776 case X86ISD::BLENDV: {
41777 APInt SelUndef, SelZero;
41778 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
41779 SelZero, TLO, Depth + 1))
41780 return true;
41781
41782 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
41783 APInt LHSUndef, LHSZero;
41784 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
41785 LHSZero, TLO, Depth + 1))
41786 return true;
41787
41788 APInt RHSUndef, RHSZero;
41789 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
41790 RHSZero, TLO, Depth + 1))
41791 return true;
41792
41793 KnownZero = LHSZero & RHSZero;
41794 KnownUndef = LHSUndef & RHSUndef;
41795 break;
41796 }
41797 case X86ISD::VZEXT_MOVL: {
41798 // If upper demanded elements are already zero then we have nothing to do.
41799 SDValue Src = Op.getOperand(0);
41800 APInt DemandedUpperElts = DemandedElts;
41801 DemandedUpperElts.clearLowBits(1);
41802 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
41803 return TLO.CombineTo(Op, Src);
41804 break;
41805 }
41806 case X86ISD::VZEXT_LOAD: {
41807 // If the upper elements are not demanded then simplify to a
41808 // scalar_to_vector(load()).
41809 MVT SVT = VT.getSimpleVT().getVectorElementType();
41810 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
41811 SDLoc DL(Op);
41812 auto *Mem = cast<MemSDNode>(Op);
41813 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
41814 Mem->getMemOperand());
41815 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
41816 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
41817 }
41818 break;
41819 }
41820 case X86ISD::VBROADCAST: {
41821 SDValue Src = Op.getOperand(0);
41822 MVT SrcVT = Src.getSimpleValueType();
41823 if (!SrcVT.isVector())
41824 break;
41825 // Don't bother broadcasting if we just need the 0'th element.
41826 if (DemandedElts == 1) {
41827 if (Src.getValueType() != VT)
41828 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
41829 SDLoc(Op));
41830 return TLO.CombineTo(Op, Src);
41831 }
41832 APInt SrcUndef, SrcZero;
41833 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
41834 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
41835 Depth + 1))
41836 return true;
41837 // Aggressively peek through src to get at the demanded elt.
41838 // TODO - we should do this for all target/faux shuffles ops.
41839 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
41840 Src, SrcElts, TLO.DAG, Depth + 1))
41841 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41842 break;
41843 }
41844 case X86ISD::VPERMV:
41845 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
41846 Depth))
41847 return true;
41848 break;
41849 case X86ISD::PSHUFB:
41850 case X86ISD::VPERMV3:
41851 case X86ISD::VPERMILPV:
41852 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
41853 Depth))
41854 return true;
41855 break;
41856 case X86ISD::VPPERM:
41857 case X86ISD::VPERMIL2:
41858 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
41859 Depth))
41860 return true;
41861 break;
41862 }
41863
41864 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
41865 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
41866 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
41867 if ((VT.is256BitVector() || VT.is512BitVector()) &&
41868 DemandedElts.lshr(NumElts / 2) == 0) {
41869 unsigned SizeInBits = VT.getSizeInBits();
41870 unsigned ExtSizeInBits = SizeInBits / 2;
41871
41872 // See if 512-bit ops only use the bottom 128-bits.
41873 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
41874 ExtSizeInBits = SizeInBits / 4;
41875
41876 switch (Opc) {
41877 // Scalar broadcast.
41878 case X86ISD::VBROADCAST: {
41879 SDLoc DL(Op);
41880 SDValue Src = Op.getOperand(0);
41881 if (Src.getValueSizeInBits() > ExtSizeInBits)
41882 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
41883 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
41884 ExtSizeInBits / VT.getScalarSizeInBits());
41885 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
41886 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
41887 TLO.DAG, DL, ExtSizeInBits));
41888 }
41889 case X86ISD::VBROADCAST_LOAD: {
41890 SDLoc DL(Op);
41891 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
41892 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
41893 ExtSizeInBits / VT.getScalarSizeInBits());
41894 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
41895 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
41896 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
41897 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
41898 MemIntr->getMemOperand());
41899 TLO.DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1),
41900 Bcst.getValue(1));
41901 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
41902 TLO.DAG, DL, ExtSizeInBits));
41903 }
41904 // Subvector broadcast.
41905 case X86ISD::SUBV_BROADCAST_LOAD: {
41906 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
41907 EVT MemVT = MemIntr->getMemoryVT();
41908 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
41909 SDLoc DL(Op);
41910 SDValue Ld =
41911 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
41912 MemIntr->getBasePtr(), MemIntr->getMemOperand());
41913 TLO.DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1),
41914 Ld.getValue(1));
41915 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
41916 TLO.DAG, DL, ExtSizeInBits));
41917 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
41918 SDLoc DL(Op);
41919 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
41920 ExtSizeInBits / VT.getScalarSizeInBits());
41921 if (SDValue BcstLd =
41922 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
41923 return TLO.CombineTo(Op,
41924 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
41925 TLO.DAG, DL, ExtSizeInBits));
41926 }
41927 break;
41928 }
41929 // Byte shifts by immediate.
41930 case X86ISD::VSHLDQ:
41931 case X86ISD::VSRLDQ:
41932 // Shift by uniform.
41933 case X86ISD::VSHL:
41934 case X86ISD::VSRL:
41935 case X86ISD::VSRA:
41936 // Shift by immediate.
41937 case X86ISD::VSHLI:
41938 case X86ISD::VSRLI:
41939 case X86ISD::VSRAI: {
41940 SDLoc DL(Op);
41941 SDValue Ext0 =
41942 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
41943 SDValue ExtOp =
41944 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
41945 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41946 SDValue Insert =
41947 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
41948 return TLO.CombineTo(Op, Insert);
41949 }
41950 case X86ISD::VPERMI: {
41951 // Simplify PERMPD/PERMQ to extract_subvector.
41952 // TODO: This should be done in shuffle combining.
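// E.g. vpermq $0xee (mask <2,3,2,3>) with only the low half demanded is just
// an extract of the source's upper 128 bits.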
41953 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
41954 SmallVector<int, 4> Mask;
41955 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
41956 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
41957 SDLoc DL(Op);
41958 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
41959 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41960 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
41961 return TLO.CombineTo(Op, Insert);
41962 }
41963 }
41964 break;
41965 }
41966 case X86ISD::VPERM2X128: {
41967 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
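// Only the low half of the result is demanded, so just imm bits [3:0] matter:
// bit 3 requests zeroing, bit 1 picks the source operand and bit 0 which of
// its 128-bit halves to extract.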
41968 SDLoc DL(Op);
41969 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
41970 if (LoMask & 0x8)
41971 return TLO.CombineTo(
41972 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
41973 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
41974 unsigned SrcIdx = (LoMask & 0x2) >> 1;
41975 SDValue ExtOp =
41976 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
41977 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41978 SDValue Insert =
41979 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
41980 return TLO.CombineTo(Op, Insert);
41981 }
41982 // Zero upper elements.
41983 case X86ISD::VZEXT_MOVL:
41984 // Target unary shuffles by immediate:
41985 case X86ISD::PSHUFD:
41986 case X86ISD::PSHUFLW:
41987 case X86ISD::PSHUFHW:
41988 case X86ISD::VPERMILPI:
41989 // (Non-Lane Crossing) Target Shuffles.
41990 case X86ISD::VPERMILPV:
41991 case X86ISD::VPERMIL2:
41992 case X86ISD::PSHUFB:
41993 case X86ISD::UNPCKL:
41994 case X86ISD::UNPCKH:
41995 case X86ISD::BLENDI:
41996 // Integer ops.
41997 case X86ISD::PACKSS:
41998 case X86ISD::PACKUS:
41999 case X86ISD::PCMPEQ:
42000 case X86ISD::PCMPGT:
42001 case X86ISD::PMULUDQ:
42002 case X86ISD::PMULDQ:
42003 case X86ISD::VSHLV:
42004 case X86ISD::VSRLV:
42005 case X86ISD::VSRAV:
42006 // Float ops.
42007 case X86ISD::FMAX:
42008 case X86ISD::FMIN:
42009 case X86ISD::FMAXC:
42010 case X86ISD::FMINC:
42011 // Horizontal Ops.
42012 case X86ISD::HADD:
42013 case X86ISD::HSUB:
42014 case X86ISD::FHADD:
42015 case X86ISD::FHSUB: {
42016 SDLoc DL(Op);
42017 SmallVector<SDValue, 4> Ops;
42018 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
42019 SDValue SrcOp = Op.getOperand(i);
42020 EVT SrcVT = SrcOp.getValueType();
42021 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
42022 "Unsupported vector size");
42023 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
42024 ExtSizeInBits)
42025 : SrcOp);
42026 }
42027 MVT ExtVT = VT.getSimpleVT();
42028 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
42029 ExtSizeInBits / ExtVT.getScalarSizeInBits());
42030 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
42031 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42032 SDValue Insert =
42033 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42034 return TLO.CombineTo(Op, Insert);
42035 }
42036 }
42037 }
42038
42039 // For splats, unless we *only* demand the 0'th element,
42040 // stop attempts at simplification here; we aren't going to improve things,
42041 // and this is better than any potential shuffle.
42042 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
42043 return false;
42044
42045 // Get target/faux shuffle mask.
42046 APInt OpUndef, OpZero;
42047 SmallVector<int, 64> OpMask;
42048 SmallVector<SDValue, 2> OpInputs;
42049 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
42050 OpZero, TLO.DAG, Depth, false))
42051 return false;
42052
42053 // Shuffle inputs must be the same size as the result.
42054 if (OpMask.size() != (unsigned)NumElts ||
42055 llvm::any_of(OpInputs, [VT](SDValue V) {
42056 return VT.getSizeInBits() != V.getValueSizeInBits() ||
42057 !V.getValueType().isVector();
42058 }))
42059 return false;
42060
42061 KnownZero = OpZero;
42062 KnownUndef = OpUndef;
42063
42064 // Check if shuffle mask can be simplified to undef/zero/identity.
42065 int NumSrcs = OpInputs.size();
42066 for (int i = 0; i != NumElts; ++i)
42067 if (!DemandedElts[i])
42068 OpMask[i] = SM_SentinelUndef;
42069
42070 if (isUndefInRange(OpMask, 0, NumElts)) {
42071 KnownUndef.setAllBits();
42072 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
42073 }
42074 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
42075 KnownZero.setAllBits();
42076 return TLO.CombineTo(
42077 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42078 }
42079 for (int Src = 0; Src != NumSrcs; ++Src)
42080 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
42081 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
42082
42083 // Attempt to simplify inputs.
42084 for (int Src = 0; Src != NumSrcs; ++Src) {
42085 // TODO: Support inputs of different types.
42086 if (OpInputs[Src].getValueType() != VT)
42087 continue;
42088
42089 int Lo = Src * NumElts;
42090 APInt SrcElts = APInt::getZero(NumElts);
42091 for (int i = 0; i != NumElts; ++i)
42092 if (DemandedElts[i]) {
42093 int M = OpMask[i] - Lo;
42094 if (0 <= M && M < NumElts)
42095 SrcElts.setBit(M);
42096 }
42097
42098 // TODO - Propagate input undef/zero elts.
42099 APInt SrcUndef, SrcZero;
42100 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
42101 TLO, Depth + 1))
42102 return true;
42103 }
42104
42105 // If we don't demand all elements, then attempt to combine to a simpler
42106 // shuffle.
42107 // We need to convert the depth to something combineX86ShufflesRecursively
42108 // can handle - so pretend its Depth == 0 again, and reduce the max depth
42109 // to match. This prevents combineX86ShuffleChain from returning a
42110 // combined shuffle that's the same as the original root, causing an
42111 // infinite loop.
42112 if (!DemandedElts.isAllOnes()) {
42113 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
42114
42115 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
42116 for (int i = 0; i != NumElts; ++i)
42117 if (DemandedElts[i])
42118 DemandedMask[i] = i;
42119
42120 SDValue NewShuffle = combineX86ShufflesRecursively(
42121 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
42122 /*HasVarMask*/ false,
42123 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
42124 Subtarget);
42125 if (NewShuffle)
42126 return TLO.CombineTo(Op, NewShuffle);
42127 }
42128
42129 return false;
42130}
42131
42132 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
42133 SDValue Op, const APInt &OriginalDemandedBits,
42134 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
42135 unsigned Depth) const {
42136 EVT VT = Op.getValueType();
42137 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
42138 unsigned Opc = Op.getOpcode();
42139 switch(Opc) {
42140 case X86ISD::VTRUNC: {
42141 KnownBits KnownOp;
42142 SDValue Src = Op.getOperand(0);
42143 MVT SrcVT = Src.getSimpleValueType();
42144
42145 // Simplify the input, using demanded bit information.
42146 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
42147 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
42148 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
42149 return true;
42150 break;
42151 }
42152 case X86ISD::PMULDQ:
42153 case X86ISD::PMULUDQ: {
42154 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
42155 KnownBits KnownLHS, KnownRHS;
42156 SDValue LHS = Op.getOperand(0);
42157 SDValue RHS = Op.getOperand(1);
42158
42159 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
42160 // FIXME: Can we bound this better?
42161 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
42162 APInt DemandedMaskLHS = APInt::getAllOnes(64);
42163 APInt DemandedMaskRHS = APInt::getAllOnes(64);
42164
42165 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
42166 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
42167 DemandedMaskLHS = DemandedMask;
42168 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
42169 DemandedMaskRHS = DemandedMask;
42170
42171 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
42172 KnownLHS, TLO, Depth + 1))
42173 return true;
42174 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
42175 KnownRHS, TLO, Depth + 1))
42176 return true;
42177
42178 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
42179 KnownRHS = KnownRHS.trunc(32);
42180 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
42181 KnownRHS.getConstant().isOne()) {
42182 SDLoc DL(Op);
42183 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
42184 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
42185 }
42186
42187 // Aggressively peek through ops to get at the demanded low bits.
42188 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
42189 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42190 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
42191 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42192 if (DemandedLHS || DemandedRHS) {
42193 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
42194 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
42195 return TLO.CombineTo(
42196 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
42197 }
42198 break;
42199 }
42200 case X86ISD::ANDNP: {
42201 KnownBits Known2;
42202 SDValue Op0 = Op.getOperand(0);
42203 SDValue Op1 = Op.getOperand(1);
42204
42205 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
42206 Known, TLO, Depth + 1))
42207 return true;
42208 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42209
42210 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
42211 OriginalDemandedElts, Known2, TLO, Depth + 1))
42212 return true;
42213 assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
42214
42215 // If the RHS is a constant, see if we can simplify it.
42216 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
42217 OriginalDemandedElts, TLO))
42218 return true;
42219
42220 // ANDNP = (~Op0 & Op1);
42221 Known.One &= Known2.Zero;
42222 Known.Zero |= Known2.One;
42223 break;
42224 }
42225 case X86ISD::VSHLI: {
42226 SDValue Op0 = Op.getOperand(0);
42227
42228 unsigned ShAmt = Op.getConstantOperandVal(1);
42229 if (ShAmt >= BitWidth)
42230 break;
42231
42232 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
42233
42234 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42235 // single shift. We can do this if the bottom bits (which are shifted
42236 // out) are never demanded.
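// Illustrative example: ((X >>u 4) << 6) with the low 6 result bits not
// demanded can be rewritten as (X << 2).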
42237 if (Op0.getOpcode() == X86ISD::VSRLI &&
42238 OriginalDemandedBits.countr_zero() >= ShAmt) {
42239 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
42240 if (Shift2Amt < BitWidth) {
42241 int Diff = ShAmt - Shift2Amt;
42242 if (Diff == 0)
42243 return TLO.CombineTo(Op, Op0.getOperand(0));
42244
42245 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
42246 SDValue NewShift = TLO.DAG.getNode(
42247 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
42248 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
42249 return TLO.CombineTo(Op, NewShift);
42250 }
42251 }
42252
42253 // If we are only demanding sign bits then we can use the shift source directly.
42254 unsigned NumSignBits =
42255 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
42256 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
42257 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42258 return TLO.CombineTo(Op, Op0);
42259
42260 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42261 TLO, Depth + 1))
42262 return true;
42263
42264 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42265 Known.Zero <<= ShAmt;
42266 Known.One <<= ShAmt;
42267
42268 // Low bits known zero.
42269 Known.Zero.setLowBits(ShAmt);
42270 return false;
42271 }
42272 case X86ISD::VSRLI: {
42273 unsigned ShAmt = Op.getConstantOperandVal(1);
42274 if (ShAmt >= BitWidth)
42275 break;
42276
42277 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42278
42279 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
42280 OriginalDemandedElts, Known, TLO, Depth + 1))
42281 return true;
42282
42283 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42284 Known.Zero.lshrInPlace(ShAmt);
42285 Known.One.lshrInPlace(ShAmt);
42286
42287 // High bits known zero.
42288 Known.Zero.setHighBits(ShAmt);
42289 return false;
42290 }
42291 case X86ISD::VSRAI: {
42292 SDValue Op0 = Op.getOperand(0);
42293 SDValue Op1 = Op.getOperand(1);
42294
42295 unsigned ShAmt = Op1->getAsZExtVal();
42296 if (ShAmt >= BitWidth)
42297 break;
42298
42299 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42300
42301 // If we just want the sign bit then we don't need to shift it.
42302 if (OriginalDemandedBits.isSignMask())
42303 return TLO.CombineTo(Op, Op0);
42304
42305 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
42306 if (Op0.getOpcode() == X86ISD::VSHLI &&
42307 Op.getOperand(1) == Op0.getOperand(1)) {
42308 SDValue Op00 = Op0.getOperand(0);
42309 unsigned NumSignBits =
42310 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
42311 if (ShAmt < NumSignBits)
42312 return TLO.CombineTo(Op, Op00);
42313 }
42314
42315 // If any of the demanded bits are produced by the sign extension, we also
42316 // demand the input sign bit.
42317 if (OriginalDemandedBits.countl_zero() < ShAmt)
42318 DemandedMask.setSignBit();
42319
42320 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42321 TLO, Depth + 1))
42322 return true;
42323
42324 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42325 Known.Zero.lshrInPlace(ShAmt);
42326 Known.One.lshrInPlace(ShAmt);
42327
42328 // If the input sign bit is known to be zero, or if none of the top bits
42329 // are demanded, turn this into an unsigned shift right.
42330 if (Known.Zero[BitWidth - ShAmt - 1] ||
42331 OriginalDemandedBits.countl_zero() >= ShAmt)
42332 return TLO.CombineTo(
42333 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
42334
42335 // High bits are known one.
42336 if (Known.One[BitWidth - ShAmt - 1])
42337 Known.One.setHighBits(ShAmt);
42338 return false;
42339 }
42340 case X86ISD::BLENDV: {
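// BLENDV picks LHS or RHS per element based only on the sign bit (MSB) of
// the selector, so only that bit of Sel is demanded below.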
42341 SDValue Sel = Op.getOperand(0);
42342 SDValue LHS = Op.getOperand(1);
42343 SDValue RHS = Op.getOperand(2);
42344
42345 APInt SignMask = APInt::getSignMask(BitWidth);
42346 SDValue NewSel = SimplifyMultipleUseDemandedBits(
42347 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
42348 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
42349 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42350 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
42351 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42352
42353 if (NewSel || NewLHS || NewRHS) {
42354 NewSel = NewSel ? NewSel : Sel;
42355 NewLHS = NewLHS ? NewLHS : LHS;
42356 NewRHS = NewRHS ? NewRHS : RHS;
42357 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
42358 NewSel, NewLHS, NewRHS));
42359 }
42360 break;
42361 }
42362 case X86ISD::PEXTRB:
42363 case X86ISD::PEXTRW: {
42364 SDValue Vec = Op.getOperand(0);
42365 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
42366 MVT VecVT = Vec.getSimpleValueType();
42367 unsigned NumVecElts = VecVT.getVectorNumElements();
42368
42369 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
42370 unsigned Idx = CIdx->getZExtValue();
42371 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
42372
42373 // If we demand no bits from the vector then we must have demanded
42374 // bits from the implicit zext - simplify to zero.
42375 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
42376 if (DemandedVecBits == 0)
42377 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42378
42379 APInt KnownUndef, KnownZero;
42380 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
42381 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
42382 KnownZero, TLO, Depth + 1))
42383 return true;
42384
42385 KnownBits KnownVec;
42386 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
42387 KnownVec, TLO, Depth + 1))
42388 return true;
42389
42390 if (SDValue V = SimplifyMultipleUseDemandedBits(
42391 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
42392 return TLO.CombineTo(
42393 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
42394
42395 Known = KnownVec.zext(BitWidth);
42396 return false;
42397 }
42398 break;
42399 }
42400 case X86ISD::PINSRB:
42401 case X86ISD::PINSRW: {
42402 SDValue Vec = Op.getOperand(0);
42403 SDValue Scl = Op.getOperand(1);
42404 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
42405 MVT VecVT = Vec.getSimpleValueType();
42406
42407 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
42408 unsigned Idx = CIdx->getZExtValue();
42409 if (!OriginalDemandedElts[Idx])
42410 return TLO.CombineTo(Op, Vec);
42411
42412 KnownBits KnownVec;
42413 APInt DemandedVecElts(OriginalDemandedElts);
42414 DemandedVecElts.clearBit(Idx);
42415 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
42416 KnownVec, TLO, Depth + 1))
42417 return true;
42418
42419 KnownBits KnownScl;
42420 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
42421 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
42422 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
42423 return true;
42424
42425 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
42426 Known = KnownVec.intersectWith(KnownScl);
42427 return false;
42428 }
42429 break;
42430 }
42431 case X86ISD::PACKSS:
42432 // PACKSS saturates to MIN/MAX integer values. So if we just want the
42433 // sign bit then we can just ask for the source operands' sign bits.
42434 // TODO - add known bits handling.
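// e.g. for PACKSSDW, each saturated i16 result keeps the sign of its i32
// source element, so demanding only the i16 sign bit demands only the i32
// sign bit of the source.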
42435 if (OriginalDemandedBits.isSignMask()) {
42436 APInt DemandedLHS, DemandedRHS;
42437 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
42438
42439 KnownBits KnownLHS, KnownRHS;
42440 APInt SignMask = APInt::getSignMask(BitWidth * 2);
42441 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
42442 KnownLHS, TLO, Depth + 1))
42443 return true;
42444 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
42445 KnownRHS, TLO, Depth + 1))
42446 return true;
42447
42448 // Attempt to avoid multi-use ops if we don't need anything from them.
42449 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
42450 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
42451 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
42452 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
42453 if (DemandedOp0 || DemandedOp1) {
42454 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
42455 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
42456 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
42457 }
42458 }
42459 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
42460 break;
42461 case X86ISD::VBROADCAST: {
42462 SDValue Src = Op.getOperand(0);
42463 MVT SrcVT = Src.getSimpleValueType();
42464 APInt DemandedElts = APInt::getOneBitSet(
42465 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
42466 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
42467 TLO, Depth + 1))
42468 return true;
42469 // If we don't need the upper bits, attempt to narrow the broadcast source.
42470 // Don't attempt this on AVX512 as it might affect broadcast folding.
42471 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
42472 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
42473 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
42474 Src->hasOneUse()) {
42475 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
42476 SDValue NewSrc =
42477 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
42478 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
42479 SDValue NewBcst =
42480 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
42481 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
42482 }
42483 break;
42484 }
42485 case X86ISD::PCMPGT:
42486 // icmp sgt(0, R) == ashr(R, BitWidth-1).
42487 // iff we only need the sign bit then we can use R directly.
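// pcmpgt(0, R) is all-ones exactly when R is negative, so its sign bit
// always equals R's sign bit.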
42488 if (OriginalDemandedBits.isSignMask() &&
42489 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
42490 return TLO.CombineTo(Op, Op.getOperand(1));
42491 break;
42492 case X86ISD::MOVMSK: {
42493 SDValue Src = Op.getOperand(0);
42494 MVT SrcVT = Src.getSimpleValueType();
42495 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42496 unsigned NumElts = SrcVT.getVectorNumElements();
42497
42498 // If we don't need the sign bits at all just return zero.
42499 if (OriginalDemandedBits.countr_zero() >= NumElts)
42500 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42501
42502 // See if we only demand bits from the lower 128-bit vector.
42503 if (SrcVT.is256BitVector() &&
42504 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
42505 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
42506 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42507 }
42508
42509 // Only demand the vector elements of the sign bits we need.
42510 APInt KnownUndef, KnownZero;
42511 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
42512 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
42513 TLO, Depth + 1))
42514 return true;
42515
42516 Known.Zero = KnownZero.zext(BitWidth);
42517 Known.Zero.setHighBits(BitWidth - NumElts);
42518
42519 // MOVMSK only uses the MSB from each vector element.
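// e.g. for a v4i32 source, result bit i is the sign bit of element i and
// result bits 4 and above are always zero.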
42520 KnownBits KnownSrc;
42521 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
42522 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
42523 Depth + 1))
42524 return true;
42525
42526 if (KnownSrc.One[SrcBits - 1])
42527 Known.One.setLowBits(NumElts);
42528 else if (KnownSrc.Zero[SrcBits - 1])
42529 Known.Zero.setLowBits(NumElts);
42530
42531 // Attempt to avoid multi-use ops if we don't need anything from them.
42532 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
42533 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
42534 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42535 return false;
42536 }
42537 case X86ISD::TESTP: {
42538 SDValue Op0 = Op.getOperand(0);
42539 SDValue Op1 = Op.getOperand(1);
42540 MVT OpVT = Op0.getSimpleValueType();
42541 assert((OpVT.getVectorElementType() == MVT::f32 ||
42542 OpVT.getVectorElementType() == MVT::f64) &&
42543 "Illegal vector type for X86ISD::TESTP");
42544
42545 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
42546 KnownBits KnownSrc;
42547 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
42548 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
42549 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
42550 AssumeSingleUse) ||
42551 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
42552 AssumeSingleUse);
42553 }
42554 case X86ISD::BEXTR:
42555 case X86ISD::BEXTRI: {
42556 SDValue Op0 = Op.getOperand(0);
42557 SDValue Op1 = Op.getOperand(1);
42558
42559 // Only bottom 16-bits of the control bits are required.
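// BEXTR control encoding: bits [7:0] hold the start bit index and bits
// [15:8] hold the number of bits to extract; higher control bits are ignored.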
42560 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
42561 // NOTE: SimplifyDemandedBits won't do this for constants.
42562 uint64_t Val1 = Cst1->getZExtValue();
42563 uint64_t MaskedVal1 = Val1 & 0xFFFF;
42564 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
42565 SDLoc DL(Op);
42566 return TLO.CombineTo(
42567 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
42568 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
42569 }
42570
42571 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
42572 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
42573
42574 // If the length is 0, the result is 0.
42575 if (Length == 0) {
42576 Known.setAllZero();
42577 return false;
42578 }
42579
42580 if ((Shift + Length) <= BitWidth) {
42581 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
42582 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
42583 return true;
42584
42585 Known = Known.extractBits(Length, Shift);
42586 Known = Known.zextOrTrunc(BitWidth);
42587 return false;
42588 }
42589 } else {
42590 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
42591 KnownBits Known1;
42592 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
42593 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
42594 return true;
42595
42596 // If the length is 0, replace with 0.
42597 KnownBits LengthBits = Known1.extractBits(8, 8);
42598 if (LengthBits.isZero())
42599 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42600 }
42601
42602 break;
42603 }
42604 case X86ISD::PDEP: {
42605 SDValue Op0 = Op.getOperand(0);
42606 SDValue Op1 = Op.getOperand(1);
42607
42608 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
42609 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
42610
42611 // If the demanded bits have leading zeroes, we don't demand those from the
42612 // mask.
42613 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
42614 return true;
42615
42616 // The number of possible 1s in the mask determines the number of LSBs of
42617 // operand 0 used. Undemanded bits from the mask don't matter so filter
42618 // them before counting.
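// Illustrative example: with mask 0b1010 only the low two bits of operand 0
// can be deposited (into bit positions 1 and 3), so its higher bits are not
// demanded.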
42619 KnownBits Known2;
42620 uint64_t Count = (~Known.Zero & LoMask).popcount();
42621 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
42622 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
42623 return true;
42624
42625 // Zeroes are retained from the mask, but not ones.
42626 Known.One.clearAllBits();
42627 // The result will have at least as many trailing zeros as the non-mask
42628 // operand since bits can only map to the same or higher bit position.
42629 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
42630 return false;
42631 }
42632 }
42633
42634 return TargetLowering::SimplifyDemandedBitsForTargetNode(
42635 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
42636}
42637
42638 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
42639 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
42640 SelectionDAG &DAG, unsigned Depth) const {
42641 int NumElts = DemandedElts.getBitWidth();
42642 unsigned Opc = Op.getOpcode();
42643 EVT VT = Op.getValueType();
42644
42645 switch (Opc) {
42646 case X86ISD::PINSRB:
42647 case X86ISD::PINSRW: {
42648 // If we don't demand the inserted element, return the base vector.
42649 SDValue Vec = Op.getOperand(0);
42650 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
42651 MVT VecVT = Vec.getSimpleValueType();
42652 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
42653 !DemandedElts[CIdx->getZExtValue()])
42654 return Vec;
42655 break;
42656 }
42657 case X86ISD::VSHLI: {
42658 // If we are only demanding sign bits then we can use the shift source
42659 // directly.
42660 SDValue Op0 = Op.getOperand(0);
42661 unsigned ShAmt = Op.getConstantOperandVal(1);
42662 unsigned BitWidth = DemandedBits.getBitWidth();
42663 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
42664 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
42665 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42666 return Op0;
42667 break;
42668 }
42669 case X86ISD::VSRAI:
42670 // iff we only need the sign bit then we can use the source directly.
42671 // TODO: generalize where we only demand extended signbits.
42672 if (DemandedBits.isSignMask())
42673 return Op.getOperand(0);
42674 break;
42675 case X86ISD::PCMPGT:
42676 // icmp sgt(0, R) == ashr(R, BitWidth-1).
42677 // iff we only need the sign bit then we can use R directly.
42678 if (DemandedBits.isSignMask() &&
42679 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
42680 return Op.getOperand(1);
42681 break;
42682 case X86ISD::BLENDV: {
42683 // BLENDV: Cond (MSB) ? LHS : RHS
42684 SDValue Cond = Op.getOperand(0);
42685 SDValue LHS = Op.getOperand(1);
42686 SDValue RHS = Op.getOperand(2);
42687
42688 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
42689 if (CondKnown.isNegative())
42690 return LHS;
42691 if (CondKnown.isNonNegative())
42692 return RHS;
42693 break;
42694 }
42695 case X86ISD::ANDNP: {
42696 // ANDNP = (~LHS & RHS);
42697 SDValue LHS = Op.getOperand(0);
42698 SDValue RHS = Op.getOperand(1);
42699
42700 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
42701 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
42702
42703 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
42704 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
42705 // this context, so return RHS.
42706 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
42707 return RHS;
42708 break;
42709 }
42710 }
42711
42712 APInt ShuffleUndef, ShuffleZero;
42713 SmallVector<int, 16> ShuffleMask;
42714 SmallVector<SDValue, 16> ShuffleOps;
42715 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
42716 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
42717 // If all the demanded elts are from one operand and are inline,
42718 // then we can use the operand directly.
42719 int NumOps = ShuffleOps.size();
42720 if (ShuffleMask.size() == (unsigned)NumElts &&
42721 llvm::all_of(ShuffleOps, [VT](SDValue V) {
42722 return VT.getSizeInBits() == V.getValueSizeInBits();
42723 })) {
42724
42725 if (DemandedElts.isSubsetOf(ShuffleUndef))
42726 return DAG.getUNDEF(VT);
42727 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
42728 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
42729
42730 // Bitmask that indicates which ops have only been accessed 'inline'.
42731 APInt IdentityOp = APInt::getAllOnes(NumOps);
42732 for (int i = 0; i != NumElts; ++i) {
42733 int M = ShuffleMask[i];
42734 if (!DemandedElts[i] || ShuffleUndef[i])
42735 continue;
42736 int OpIdx = M / NumElts;
42737 int EltIdx = M % NumElts;
42738 if (M < 0 || EltIdx != i) {
42739 IdentityOp.clearAllBits();
42740 break;
42741 }
42742 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
42743 if (IdentityOp == 0)
42744 break;
42745 }
42746 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
42747 "Multiple identity shuffles detected");
42748
42749 if (IdentityOp != 0)
42750 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
42751 }
42752 }
42753
42754 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
42755 Op, DemandedBits, DemandedElts, DAG, Depth);
42756}
42757
42758 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
42759 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
42760 bool PoisonOnly, unsigned Depth) const {
42761 unsigned NumElts = DemandedElts.getBitWidth();
42762
42763 // TODO: Add more target shuffles.
42764 switch (Op.getOpcode()) {
42765 case X86ISD::PSHUFD:
42766 case X86ISD::VPERMILPI: {
42767 SmallVector<int, 8> Mask;
42768 SmallVector<SDValue, 2> Ops;
42769 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
42770 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
42771 APInt::getZero(NumElts));
42772 for (auto M : enumerate(Mask)) {
42773 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
42774 continue;
42775 if (M.value() == SM_SentinelUndef)
42776 return false;
42777 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
42778 "Shuffle mask index out of range");
42779 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
42780 }
42781 for (auto Op : enumerate(Ops))
42782 if (!DemandedSrcElts[Op.index()].isZero() &&
42783 !DAG.isGuaranteedNotToBeUndefOrPoison(
42784 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
42785 return false;
42786 return true;
42787 }
42788 break;
42789 }
42790 }
42791 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
42792 Op, DemandedElts, DAG, PoisonOnly, Depth);
42793}
42794
42795 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
42796 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
42797 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
42798
42799 // TODO: Add more target shuffles.
42800 switch (Op.getOpcode()) {
42801 case X86ISD::PSHUFD:
42802 case X86ISD::VPERMILPI:
42803 case X86ISD::UNPCKH:
42804 case X86ISD::UNPCKL:
42805 return false;
42806 }
42807 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
42808 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
42809}
42810
42811 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
42812 const APInt &DemandedElts,
42813 APInt &UndefElts,
42814 const SelectionDAG &DAG,
42815 unsigned Depth) const {
42816 unsigned NumElts = DemandedElts.getBitWidth();
42817 unsigned Opc = Op.getOpcode();
42818
42819 switch (Opc) {
42820 case X86ISD::VBROADCAST:
42821 case X86ISD::VBROADCAST_LOAD:
42822 UndefElts = APInt::getZero(NumElts);
42823 return true;
42824 }
42825
42826 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
42827 DAG, Depth);
42828}
42829
42830// Helper to peek through bitops/trunc/setcc to determine size of source vector.
42831// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
42832static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
42833 bool AllowTruncate) {
42834 switch (Src.getOpcode()) {
42835 case ISD::TRUNCATE:
42836 if (!AllowTruncate)
42837 return false;
42838 [[fallthrough]];
42839 case ISD::SETCC:
42840 return Src.getOperand(0).getValueSizeInBits() == Size;
42841 case ISD::FREEZE:
42842 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate);
42843 case ISD::AND:
42844 case ISD::XOR:
42845 case ISD::OR:
42846 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
42847 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
42848 case ISD::SELECT:
42849 case ISD::VSELECT:
42850 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
42851 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
42852 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
42853 case ISD::BUILD_VECTOR:
42854 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
42855 ISD::isBuildVectorAllOnes(Src.getNode());
42856 }
42857 return false;
42858}
42859
42860// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
42861static unsigned getAltBitOpcode(unsigned Opcode) {
42862 switch(Opcode) {
42863 // clang-format off
42864 case ISD::AND: return X86ISD::FAND;
42865 case ISD::OR: return X86ISD::FOR;
42866 case ISD::XOR: return X86ISD::FXOR;
42867 case X86ISD::ANDNP: return X86ISD::FANDN;
42868 // clang-format on
42869 }
42870 llvm_unreachable("Unknown bitwise opcode");
42871}
42872
42873// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
42874 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
42875 const SDLoc &DL) {
42876 EVT SrcVT = Src.getValueType();
42877 if (SrcVT != MVT::v4i1)
42878 return SDValue();
42879
42880 switch (Src.getOpcode()) {
42881 case ISD::SETCC:
42882 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
42883 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
42884 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
42885 SDValue Op0 = Src.getOperand(0);
42886 if (ISD::isNormalLoad(Op0.getNode()))
42887 return DAG.getBitcast(MVT::v4f32, Op0);
42888 if (Op0.getOpcode() == ISD::BITCAST &&
42889 Op0.getOperand(0).getValueType() == MVT::v4f32)
42890 return Op0.getOperand(0);
42891 }
42892 break;
42893 case ISD::AND:
42894 case ISD::XOR:
42895 case ISD::OR: {
42896 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
42897 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
42898 if (Op0 && Op1)
42899 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
42900 Op1);
42901 break;
42902 }
42903 }
42904 return SDValue();
42905}
42906
42907// Helper to push sign extension of vXi1 SETCC result through bitops.
42908 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
42909 SDValue Src, const SDLoc &DL) {
42910 switch (Src.getOpcode()) {
42911 case ISD::SETCC:
42912 case ISD::FREEZE:
42913 case ISD::TRUNCATE:
42914 case ISD::BUILD_VECTOR:
42915 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
42916 case ISD::AND:
42917 case ISD::XOR:
42918 case ISD::OR:
42919 return DAG.getNode(
42920 Src.getOpcode(), DL, SExtVT,
42921 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
42922 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
42923 case ISD::SELECT:
42924 case ISD::VSELECT:
42925 return DAG.getSelect(
42926 DL, SExtVT, Src.getOperand(0),
42927 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
42928 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
42929 }
42930 llvm_unreachable("Unexpected node type for vXi1 sign extension");
42931}
42932
42933// Try to match patterns such as
42934// (i16 bitcast (v16i1 x))
42935// ->
42936 // (i16 movmsk (v16i8 sext (v16i1 x)))
42937// before the illegal vector is scalarized on subtargets that don't have legal
42938// vxi1 types.
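// MOVMSK packs the per-element sign bits of the sign-extended vector into a
// scalar mask, which is exactly the value the bitcast produces.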
42939 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
42940 const SDLoc &DL,
42941 const X86Subtarget &Subtarget) {
42942 EVT SrcVT = Src.getValueType();
42943 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
42944 return SDValue();
42945
42946 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
42947 // legalization destroys the v4i32 type.
42948 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
42949 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
42950 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
42951 DAG.getBitcast(MVT::v4f32, V));
42952 return DAG.getZExtOrTrunc(V, DL, VT);
42953 }
42954 }
42955
42956 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
42957 // movmskb even with avx512. This will be better than truncating to vXi1 and
42958 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
42959 // vpcmpeqb/vpcmpgtb.
42960 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
42961 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
42962 Src.getOperand(0).getValueType() == MVT::v32i8 ||
42963 Src.getOperand(0).getValueType() == MVT::v64i8);
42964
42965 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
42966 // directly with vpmovmskb/vmovmskps/vmovmskpd.
42967 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
42968 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
42969 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
42970 EVT CmpVT = Src.getOperand(0).getValueType();
42971 EVT EltVT = CmpVT.getVectorElementType();
42972 if (CmpVT.getSizeInBits() <= 256 &&
42973 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
42974 PreferMovMsk = true;
42975 }
42976
42977 // With AVX512 vxi1 types are legal and we prefer using k-regs.
42978 // MOVMSK is supported in SSE2 or later.
42979 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
42980 return SDValue();
42981
42982 // If the upper ops of a concatenation are undef, then try to bitcast the
42983 // lower op and extend.
42984 SmallVector<SDValue, 4> SubSrcOps;
42985 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
42986 SubSrcOps.size() >= 2) {
42987 SDValue LowerOp = SubSrcOps[0];
42988 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
42989 if (LowerOp.getOpcode() == ISD::SETCC &&
42990 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
42991 EVT SubVT = VT.getIntegerVT(
42992 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
42993 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
42994 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
42995 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
42996 }
42997 }
42998 }
42999
43000 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v2f64 and
43001 // v4f64. So all legal 128-bit and 256-bit vectors are covered except for
43002 // v8i16 and v16i16.
43003 // For these two cases, we can shuffle the upper element bytes to a
43004 // consecutive sequence at the start of the vector and treat the results as
43005 // v16i8 or v32i8, and for v8i16 this is the preferable solution. However,
43006 // for v16i16 this is not the case, because the shuffle is expensive, so we
43007 // avoid sign-extending to this type entirely.
43008 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
43009 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
43010 MVT SExtVT;
43011 bool PropagateSExt = false;
43012 switch (SrcVT.getSimpleVT().SimpleTy) {
43013 default:
43014 return SDValue();
43015 case MVT::v2i1:
43016 SExtVT = MVT::v2i64;
43017 break;
43018 case MVT::v4i1:
43019 SExtVT = MVT::v4i32;
43020 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
43021 // sign-extend to a 256-bit operation to avoid truncation.
43022 if (Subtarget.hasAVX() &&
43023 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
43024 SExtVT = MVT::v4i64;
43025 PropagateSExt = true;
43026 }
43027 break;
43028 case MVT::v8i1:
43029 SExtVT = MVT::v8i16;
43030 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
43031 // sign-extend to a 256-bit operation to match the compare.
43032 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
43033 // 256-bit because the shuffle is cheaper than sign extending the result of
43034 // the compare.
43035 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
43036 checkBitcastSrcVectorSize(Src, 512, true))) {
43037 SExtVT = MVT::v8i32;
43038 PropagateSExt = true;
43039 }
43040 break;
43041 case MVT::v16i1:
43042 SExtVT = MVT::v16i8;
43043 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
43044 // it is not profitable to sign-extend to 256-bit because this will
43045 // require an extra cross-lane shuffle which is more expensive than
43046 // truncating the result of the compare to 128-bits.
43047 break;
43048 case MVT::v32i1:
43049 SExtVT = MVT::v32i8;
43050 break;
43051 case MVT::v64i1:
43052 // If we have AVX512F, but not AVX512BW, and the input is truncated from
43053 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
43054 if (Subtarget.hasAVX512()) {
43055 if (Subtarget.hasBWI())
43056 return SDValue();
43057 SExtVT = MVT::v64i8;
43058 break;
43059 }
43060 // Split if this is a <64 x i8> comparison result.
43061 if (checkBitcastSrcVectorSize(Src, 512, false)) {
43062 SExtVT = MVT::v64i8;
43063 break;
43064 }
43065 return SDValue();
43066 };
43067
43068 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
43069 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
43070
43071 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
43072 V = getPMOVMSKB(DL, V, DAG, Subtarget);
43073 } else {
43074 if (SExtVT == MVT::v8i16) {
43075 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
43076 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
43077 }
43078 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
43079 }
43080
43081 EVT IntVT =
43082 EVT::getIntegerVT(*DAG.getContext(), SExtVT.getVectorNumElements());
43083 V = DAG.getZExtOrTrunc(V, DL, IntVT);
43084 return DAG.getBitcast(VT, V);
43085}
43086
43087// Convert a vXi1 constant build vector to the same width scalar integer.
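// Illustrative example: the v4i1 constant <1,0,1,1> becomes the i4 constant
// 0b1101 (element 0 maps to bit 0).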
43088 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
43089 EVT SrcVT = Op.getValueType();
43090 assert(SrcVT.getVectorElementType() == MVT::i1 &&
43091 "Expected a vXi1 vector");
43093 "Expected a constant build vector");
43094
43095 APInt Imm(SrcVT.getVectorNumElements(), 0);
43096 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
43097 SDValue In = Op.getOperand(Idx);
43098 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
43099 Imm.setBit(Idx);
43100 }
43101 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
43102 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
43103}
43104
43105 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
43106 TargetLowering::DAGCombinerInfo &DCI,
43107 const X86Subtarget &Subtarget) {
43108 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
43109
43110 if (!DCI.isBeforeLegalizeOps())
43111 return SDValue();
43112
43113 // Only do this if we have k-registers.
43114 if (!Subtarget.hasAVX512())
43115 return SDValue();
43116
43117 EVT DstVT = N->getValueType(0);
43118 SDValue Op = N->getOperand(0);
43119 EVT SrcVT = Op.getValueType();
43120
43121 if (!Op.hasOneUse())
43122 return SDValue();
43123
43124 // Look for logic ops.
43125 if (Op.getOpcode() != ISD::AND &&
43126 Op.getOpcode() != ISD::OR &&
43127 Op.getOpcode() != ISD::XOR)
43128 return SDValue();
43129
43130 // Make sure we have a bitcast between mask registers and a scalar type.
43131 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43132 DstVT.isScalarInteger()) &&
43133 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
43134 SrcVT.isScalarInteger()))
43135 return SDValue();
43136
43137 SDValue LHS = Op.getOperand(0);
43138 SDValue RHS = Op.getOperand(1);
43139
43140 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
43141 LHS.getOperand(0).getValueType() == DstVT)
43142 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
43143 DAG.getBitcast(DstVT, RHS));
43144
43145 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
43146 RHS.getOperand(0).getValueType() == DstVT)
43147 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43148 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
43149
43150 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
43151 // Most of these have to move a constant from the scalar domain anyway.
43152 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
43153 RHS = combinevXi1ConstantToInteger(RHS, DAG);
43154 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43155 DAG.getBitcast(DstVT, LHS), RHS);
43156 }
43157
43158 return SDValue();
43159}
43160
43161 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
43162 const X86Subtarget &Subtarget) {
43163 SDLoc DL(BV);
43164 unsigned NumElts = BV->getNumOperands();
43165 SDValue Splat = BV->getSplatValue();
43166
43167 // Build MMX element from integer GPR or SSE float values.
43168 auto CreateMMXElement = [&](SDValue V) {
43169 if (V.isUndef())
43170 return DAG.getUNDEF(MVT::x86mmx);
43171 if (V.getValueType().isFloatingPoint()) {
43172 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
43173 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
43174 V = DAG.getBitcast(MVT::v2i64, V);
43175 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
43176 }
43177 V = DAG.getBitcast(MVT::i32, V);
43178 } else {
43179 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
43180 }
43181 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
43182 };
43183
43184 // Convert build vector ops to MMX data in the bottom elements.
43185 SmallVector<SDValue, 8> Ops;
43186
43187 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43188
43189 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
43190 if (Splat) {
43191 if (Splat.isUndef())
43192 return DAG.getUNDEF(MVT::x86mmx);
43193
43194 Splat = CreateMMXElement(Splat);
43195
43196 if (Subtarget.hasSSE1()) {
43197 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
43198 if (NumElts == 8)
43199 Splat = DAG.getNode(
43200 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43201 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
43202 TLI.getPointerTy(DAG.getDataLayout())),
43203 Splat, Splat);
43204
43205 // Use PSHUFW to repeat 16-bit elements.
43206 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
43207 return DAG.getNode(
43208 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43209 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
43210 TLI.getPointerTy(DAG.getDataLayout())),
43211 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
43212 }
43213 Ops.append(NumElts, Splat);
43214 } else {
43215 for (unsigned i = 0; i != NumElts; ++i)
43216 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
43217 }
43218
43219 // Use tree of PUNPCKLs to build up general MMX vector.
43220 while (Ops.size() > 1) {
43221 unsigned NumOps = Ops.size();
43222 unsigned IntrinOp =
43223 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
43224 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
43225 : Intrinsic::x86_mmx_punpcklbw));
43226 SDValue Intrin = DAG.getTargetConstant(
43227 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
43228 for (unsigned i = 0; i != NumOps; i += 2)
43229 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
43230 Ops[i], Ops[i + 1]);
43231 Ops.resize(NumOps / 2);
43232 }
43233
43234 return Ops[0];
43235}
43236
43237// Recursive function that attempts to find if a bool vector node was originally
43238// a vector/float/double that got truncated/extended/bitcast to/from a scalar
43239// integer. If so, replace the scalar ops with bool vector equivalents back down
43240// the chain.
43241 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
43242 SelectionDAG &DAG,
43243 const X86Subtarget &Subtarget) {
43244 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43245 unsigned Opc = V.getOpcode();
43246 switch (Opc) {
43247 case ISD::BITCAST: {
43248 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
43249 SDValue Src = V.getOperand(0);
43250 EVT SrcVT = Src.getValueType();
43251 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
43252 return DAG.getBitcast(VT, Src);
43253 break;
43254 }
43255 case ISD::TRUNCATE: {
43256 // If we find a suitable source, a truncated scalar becomes a subvector.
43257 SDValue Src = V.getOperand(0);
43258 EVT NewSrcVT =
43259 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
43260 if (TLI.isTypeLegal(NewSrcVT))
43261 if (SDValue N0 =
43262 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
43263 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
43264 DAG.getIntPtrConstant(0, DL));
43265 break;
43266 }
43267 case ISD::ANY_EXTEND:
43268 case ISD::ZERO_EXTEND: {
43269 // If we find a suitable source, an extended scalar becomes a subvector.
43270 SDValue Src = V.getOperand(0);
43271 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
43272 Src.getScalarValueSizeInBits());
43273 if (TLI.isTypeLegal(NewSrcVT))
43274 if (SDValue N0 =
43275 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
43276 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
43277 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
43278 : DAG.getConstant(0, DL, VT),
43279 N0, DAG.getIntPtrConstant(0, DL));
43280 break;
43281 }
43282 case ISD::OR: {
43283 // If we find suitable sources, we can just move an OR to the vector domain.
43284 SDValue Src0 = V.getOperand(0);
43285 SDValue Src1 = V.getOperand(1);
43286 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
43287 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
43288 return DAG.getNode(Opc, DL, VT, N0, N1);
43289 break;
43290 }
43291 case ISD::SHL: {
43292 // If we find a suitable source, a SHL becomes a KSHIFTL.
43293 SDValue Src0 = V.getOperand(0);
43294 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
43295 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
43296 break;
43297
43298 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
43299 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
43300 return DAG.getNode(
43301 X86ISD::KSHIFTL, DL, VT, N0,
43302 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
43303 break;
43304 }
43305 }
43306 return SDValue();
43307}
43308
43309 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
43310 TargetLowering::DAGCombinerInfo &DCI,
43311 const X86Subtarget &Subtarget) {
43312 SDValue N0 = N->getOperand(0);
43313 EVT VT = N->getValueType(0);
43314 EVT SrcVT = N0.getValueType();
43315 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43316
43317 // Try to match patterns such as
43318 // (i16 bitcast (v16i1 x))
43319 // ->
43320 // (i16 movmsk (16i8 sext (v16i1 x)))
43321 // before the setcc result is scalarized on subtargets that don't have legal
43322 // vxi1 types.
43323 if (DCI.isBeforeLegalize()) {
43324 SDLoc dl(N);
43325 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
43326 return V;
43327
43328 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43329 // type, widen both sides to avoid a trip through memory.
43330 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
43331 Subtarget.hasAVX512()) {
43332 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
43333 N0 = DAG.getBitcast(MVT::v8i1, N0);
43334 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
43335 DAG.getIntPtrConstant(0, dl));
43336 }
43337
43338 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43339 // type, widen both sides to avoid a trip through memory.
43340 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
43341 Subtarget.hasAVX512()) {
43342 // Use zeros for the widening if we already have some zeroes. This can
43343 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
43344 // stream of this.
43345 // FIXME: It might make sense to detect a concat_vectors with a mix of
43346 // zeroes and undef and turn it into insert_subvector for i1 vectors as
43347 // a separate combine. What we can't do is canonicalize the operands of
43348 // such a concat or we'll get into a loop with SimplifyDemandedBits.
43349 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
43350 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
43351 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
43352 SrcVT = LastOp.getValueType();
43353 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43354 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
43355 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
43356 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43357 N0 = DAG.getBitcast(MVT::i8, N0);
43358 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43359 }
43360 }
43361
43362 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43363 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
43364 Ops[0] = N0;
43365 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43366 N0 = DAG.getBitcast(MVT::i8, N0);
43367 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43368 }
43369 } else {
43370 // If we're bitcasting from iX to vXi1, see if the integer originally
43371 // began as a vXi1 and whether we can remove the bitcast entirely.
43372 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
43373 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
43374 if (SDValue V =
43375 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
43376 return V;
43377 }
43378 }
43379
43380 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
43381 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
43382 // due to insert_subvector legalization on KNL. By promoting the copy to i16
43383 // we can help with known bits propagation from the vXi1 domain to the
43384 // scalar domain.
43385 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
43386 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43387 N0.getOperand(0).getValueType() == MVT::v16i1 &&
43388 isNullConstant(N0.getOperand(1)))
43389 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
43390 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
43391
43392 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
43393 // and the vbroadcast_load are both integer or both fp. In some cases this
43394 // will remove the bitcast entirely.
43395 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
43396 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
43397 auto *BCast = cast<MemIntrinsicSDNode>(N0);
43398 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
43399 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
43400 // Don't swap i8/i16 since we don't have fp types of that size.
43401 if (MemSize >= 32) {
43402 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
43403 : MVT::getIntegerVT(MemSize);
43404 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
43405 : MVT::getIntegerVT(SrcVTSize);
43406 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
43407
43408 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
43409 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
43410 SDValue ResNode =
43411 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
43412 MemVT, BCast->getMemOperand());
43413 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
43414 return DAG.getBitcast(VT, ResNode);
43415 }
43416 }
43417
43418 // Since MMX types are special and don't usually play with other vector types,
43419 // it's better to handle them early to be sure we emit efficient code by
43420 // avoiding store-load conversions.
43421 if (VT == MVT::x86mmx) {
43422 // Detect MMX constant vectors.
43423 APInt UndefElts;
43424 SmallVector<APInt, 1> EltBits;
43425 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
43426 /*AllowWholeUndefs*/ true,
43427 /*AllowPartialUndefs*/ true)) {
43428 SDLoc DL(N0);
43429 // Handle zero-extension of i32 with MOVD.
43430 if (EltBits[0].countl_zero() >= 32)
43431 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
43432 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
43433 // Else, bitcast to a double.
43434 // TODO - investigate supporting sext 32-bit immediates on x86_64.
43435 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
43436 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
43437 }
43438
43439 // Detect bitcasts to x86mmx low word.
43440 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
43441 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
43442 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
43443 bool LowUndef = true, AllUndefOrZero = true;
43444 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
43445 SDValue Op = N0.getOperand(i);
43446 LowUndef &= Op.isUndef() || (i >= e/2);
43447 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
43448 }
43449 if (AllUndefOrZero) {
43450 SDValue N00 = N0.getOperand(0);
43451 SDLoc dl(N00);
43452 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
43453 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
43454 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
43455 }
43456 }
43457
43458 // Detect bitcasts of 64-bit build vectors and convert to a
43459 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
43460 // lowest element.
43461 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
43462 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
43463 SrcVT == MVT::v8i8))
43464 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
43465
43466 // Detect bitcasts between element or subvector extraction to x86mmx.
43467 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
43468 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
43469 isNullConstant(N0.getOperand(1))) {
43470 SDValue N00 = N0.getOperand(0);
43471 if (N00.getValueType().is128BitVector())
43472 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
43473 DAG.getBitcast(MVT::v2i64, N00));
43474 }
43475
43476 // Detect bitcasts from FP_TO_SINT to x86mmx.
43477 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
43478 SDLoc DL(N0);
43479 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
43480 DAG.getUNDEF(MVT::v2i32));
43481 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
43482 DAG.getBitcast(MVT::v2i64, Res));
43483 }
43484 }
43485
43486 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
43487 // most of these to scalar anyway.
43488 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
43489 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43490 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
43491 return combinevXi1ConstantToInteger(N0, DAG);
43492 }
43493
43494 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
43495 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
43496 isa<ConstantSDNode>(N0)) {
43497 auto *C = cast<ConstantSDNode>(N0);
43498 if (C->isAllOnes())
43499 return DAG.getConstant(1, SDLoc(N0), VT);
43500 if (C->isZero())
43501 return DAG.getConstant(0, SDLoc(N0), VT);
43502 }
43503
43504 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
43505 // Turn it into a sign bit compare that produces a k-register. This avoids
43506 // a trip through a GPR.
43507 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
43508 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
43509 isPowerOf2_32(VT.getVectorNumElements())) {
43510 unsigned NumElts = VT.getVectorNumElements();
43511 SDValue Src = N0;
43512
43513 // Peek through truncate.
43514 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
43515 Src = N0.getOperand(0);
43516
43517 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
43518 SDValue MovmskIn = Src.getOperand(0);
43519 MVT MovmskVT = MovmskIn.getSimpleValueType();
43520 unsigned MovMskElts = MovmskVT.getVectorNumElements();
43521
43522 // We allow extra bits of the movmsk to be used since they are known zero.
43523 // We can't convert a VPMOVMSKB without avx512bw.
43524 if (MovMskElts <= NumElts &&
43525 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
43526 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
43527 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
43528 SDLoc dl(N);
43529 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
43530 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
43531 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
43532 if (EVT(CmpVT) == VT)
43533 return Cmp;
43534
43535 // Pad with zeroes up to original VT to replace the zeroes that were
43536 // being used from the MOVMSK.
43537 unsigned NumConcats = NumElts / MovMskElts;
43538 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
43539 Ops[0] = Cmp;
43540 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
43541 }
43542 }
43543 }
43544
43545 // Try to remove bitcasts from input and output of mask arithmetic to
43546 // remove GPR<->K-register crossings.
43547 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
43548 return V;
43549
43550 // Convert a bitcasted integer logic operation that has one bitcasted
43551 // floating-point operand into a floating-point logic operation. This may
43552 // create a load of a constant, but that is cheaper than materializing the
43553 // constant in an integer register and transferring it to an SSE register or
43554 // transferring the SSE operand to integer register and back.
43555 unsigned FPOpcode;
43556 switch (N0.getOpcode()) {
43557 // clang-format off
43558 case ISD::AND: FPOpcode = X86ISD::FAND; break;
43559 case ISD::OR: FPOpcode = X86ISD::FOR; break;
43560 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
43561 default: return SDValue();
43562 // clang-format on
43563 }
43564
43565 // Check if we have a bitcast from another integer type as well.
43566 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
43567 (Subtarget.hasSSE2() && VT == MVT::f64) ||
43568 (Subtarget.hasFP16() && VT == MVT::f16) ||
43569 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
43570 TLI.isTypeLegal(VT))))
43571 return SDValue();
43572
43573 SDValue LogicOp0 = N0.getOperand(0);
43574 SDValue LogicOp1 = N0.getOperand(1);
43575 SDLoc DL0(N0);
43576
43577 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
43578 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
43579 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
43580 LogicOp0.getOperand(0).getValueType() == VT &&
43581 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
43582 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
43583 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
43584 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
43585 }
43586 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
43587 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
43588 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
43589 LogicOp1.getOperand(0).getValueType() == VT &&
43590 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
43591 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
43592 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
43593 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
43594 }
43595
43596 return SDValue();
43597}
43598
43599 // (mul (zext a), (sext b))
43600static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
43601 SDValue &Op1) {
43602 Op0 = Mul.getOperand(0);
43603 Op1 = Mul.getOperand(1);
43604
43605 // Canonicalize so that the sign-extended operand ends up in Op1.
43606 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
43607 std::swap(Op0, Op1);
43608
43609 auto IsFreeTruncation = [](SDValue &Op) -> bool {
43610 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
43611 Op.getOpcode() == ISD::SIGN_EXTEND) &&
43612 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
43613 return true;
43614
43615 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
43616 return (BV && BV->isConstant());
43617 };
43618
43619 // (dpbusd (zext a), (sext b)). Since the first operand must be an unsigned
43620 // value, we check that Op0 is a zero-extended value. Op1 must be a signed
43621 // value, so we just check that it has no more than 8 significant bits.
43622 if ((IsFreeTruncation(Op0) &&
43623 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
43624 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
43625 return true;
43626
43627 return false;
43628}
43629
43630 // Given an ABS node, detect the following pattern:
43631// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
43632// This is useful as it is the input into a SAD pattern.
43633static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
43634 SDValue AbsOp1 = Abs->getOperand(0);
43635 if (AbsOp1.getOpcode() != ISD::SUB)
43636 return false;
43637
43638 Op0 = AbsOp1.getOperand(0);
43639 Op1 = AbsOp1.getOperand(1);
43640
43641 // Check if the operands of the sub are zero-extended from vectors of i8.
43642 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
43643 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
43644 Op1.getOpcode() != ISD::ZERO_EXTEND ||
43645 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
43646 return false;
43647
43648 return true;
43649}
43650
43651 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
43652 unsigned &LogBias, const SDLoc &DL,
43653 const X86Subtarget &Subtarget) {
43654 // Extend or truncate to MVT::i8 first.
43655 MVT Vi8VT =
43656 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
43657 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
43658 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
43659
43660 // VPDPBUSD(<16 x i32> C, <16 x i8> A, <16 x i8> B): for each dst element
43661 // C[i] = C[i] + A[4i]B[4i] + A[4i+1]B[4i+1] + A[4i+2]B[4i+2] + A[4i+3]B[4i+3].
43662 // The src A, B element type is i8, but the dst C element type is i32.
43663 // When computing the number of reduction stages we use the vXi8 source type,
43664 // so we need a log-bias of 2 to skip the 2 stages VPDPBUSD already performs.
43665 LogBias = 2;
43666
43667 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
43668 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
43669 RegSize = std::max(512u, RegSize);
43670
43671 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
43672 // fill in the missing vector elements with 0.
43673 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
43674 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
43675 Ops[0] = LHS;
43676 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
43677 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43678 Ops[0] = RHS;
43679 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43680
43681 // Actually build the DotProduct, split as 256/512 bits for
43682 // AVXVNNI/AVX512VNNI.
43683 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43684 ArrayRef<SDValue> Ops) {
43685 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
43686 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
43687 };
43688 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
43689 SDValue Zero = DAG.getConstant(0, DL, DpVT);
43690
43691 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
43692 DpBuilder, false);
43693}
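// Note (illustrative): each i32 lane of the VPDPBUSD result already holds the
// sum of 4 byte products, i.e. log2(4) == 2 reduction stages, which is why the
// caller subtracts the LogBias of 2 from its shuffle+add stage count.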
43694
43695// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
43696// to these zexts.
43697static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
43698 const SDValue &Zext1, const SDLoc &DL,
43699 const X86Subtarget &Subtarget) {
43700 // Find the appropriate width for the PSADBW.
43701 EVT InVT = Zext0.getOperand(0).getValueType();
43702 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
43703
43704   // "Zero-extend" the i8 vectors. This is not a per-element zext; rather, we
43705   // fill in the missing vector elements with 0.
43706 unsigned NumConcat = RegSize / InVT.getSizeInBits();
43707 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
43708 Ops[0] = Zext0.getOperand(0);
43709 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
43710 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43711 Ops[0] = Zext1.getOperand(0);
43712 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43713
43714 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
43715 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43716 ArrayRef<SDValue> Ops) {
43717 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
43718 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
43719 };
43720 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
43721 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
43722 PSADBWBuilder);
43723}
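// Illustrative example: for <16 x i8> inputs the PSADBW above yields a
// <2 x i64> result in which each i64 lane holds the sum of the absolute
// differences of its 8 byte pairs.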
43724
43725 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
43726// PHMINPOSUW.
43727 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
43728                                       const X86Subtarget &Subtarget) {
43729 // Bail without SSE41.
43730 if (!Subtarget.hasSSE41())
43731 return SDValue();
43732
43733 EVT ExtractVT = Extract->getValueType(0);
43734 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
43735 return SDValue();
43736
43737 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
43738 ISD::NodeType BinOp;
43739 SDValue Src = DAG.matchBinOpReduction(
43740 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
43741 if (!Src)
43742 return SDValue();
43743
43744 EVT SrcVT = Src.getValueType();
43745 EVT SrcSVT = SrcVT.getScalarType();
43746 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
43747 return SDValue();
43748
43749 SDLoc DL(Extract);
43750 SDValue MinPos = Src;
43751
43752 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
43753 while (SrcVT.getSizeInBits() > 128) {
43754 SDValue Lo, Hi;
43755 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
43756 SrcVT = Lo.getValueType();
43757 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
43758 }
43759 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
43760 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
43761 "Unexpected value type");
43762
43763   // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
43764 // to flip the value accordingly.
43765 SDValue Mask;
43766 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
43767 if (BinOp == ISD::SMAX)
43768 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
43769 else if (BinOp == ISD::SMIN)
43770 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
43771 else if (BinOp == ISD::UMAX)
43772 Mask = DAG.getAllOnesConstant(DL, SrcVT);
43773
43774 if (Mask)
43775 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
43776
43777   // For v16i8 cases we need to perform UMIN on pairs of byte elements,
43778   // shuffling each upper element down and inserting zeros. This means that
43779   // the v16i8 UMIN will leave the upper elements as zero, performing the
43780   // zero-extension ready for the PHMINPOS.
43781 if (ExtractVT == MVT::i8) {
43782     SDValue Upper = DAG.getVectorShuffle(
43783         SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
43784 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
43785 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
43786 }
43787
43788   // Perform the PHMINPOS on a v8i16 vector.
43789 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
43790 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
43791 MinPos = DAG.getBitcast(SrcVT, MinPos);
43792
43793 if (Mask)
43794 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
43795
43796 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
43797 DAG.getIntPtrConstant(0, DL));
43798}
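// Illustrative example of the mask trick above: for an SMAX(v8i16) reduction,
// XORing every element with 0x7FFF maps the largest signed value to the
// smallest unsigned value, so PHMINPOSUW (an unsigned min) finds it; XORing
// the result with 0x7FFF again restores the original value.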
43799
43800// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
43801 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
43802                                          const X86Subtarget &Subtarget) {
43803 // Bail without SSE2.
43804 if (!Subtarget.hasSSE2())
43805 return SDValue();
43806
43807 EVT ExtractVT = Extract->getValueType(0);
43808 unsigned BitWidth = ExtractVT.getSizeInBits();
43809 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
43810 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
43811 return SDValue();
43812
43813 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
43814 ISD::NodeType BinOp;
43815 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
43816 if (!Match && ExtractVT == MVT::i1)
43817 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
43818 if (!Match)
43819 return SDValue();
43820
43821 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
43822 // which we can't support here for now.
43823 if (Match.getScalarValueSizeInBits() != BitWidth)
43824 return SDValue();
43825
43826 SDValue Movmsk;
43827 SDLoc DL(Extract);
43828 EVT MatchVT = Match.getValueType();
43829 unsigned NumElts = MatchVT.getVectorNumElements();
43830 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
43831 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43832 LLVMContext &Ctx = *DAG.getContext();
43833
43834 if (ExtractVT == MVT::i1) {
43835 // Special case for (pre-legalization) vXi1 reductions.
43836 if (NumElts > 64 || !isPowerOf2_32(NumElts))
43837 return SDValue();
43838 if (Match.getOpcode() == ISD::SETCC) {
43839 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
43840 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
43841 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
43842 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
43843 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
43844 X86::CondCode X86CC;
43845 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
43846 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
43847 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
43848 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
43849 DAG, X86CC))
43850 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
43851 getSETCC(X86CC, V, DL, DAG));
43852 }
43853 }
43854 if (TLI.isTypeLegal(MatchVT)) {
43855 // If this is a legal AVX512 predicate type then we can just bitcast.
43856 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
43857 Movmsk = DAG.getBitcast(MovmskVT, Match);
43858 } else {
43859 // Use combineBitcastvxi1 to create the MOVMSK.
43860 while (NumElts > MaxElts) {
43861 SDValue Lo, Hi;
43862 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
43863 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
43864 NumElts /= 2;
43865 }
43866 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
43867 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
43868 }
43869 if (!Movmsk)
43870 return SDValue();
43871 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
43872 } else {
43873 // FIXME: Better handling of k-registers or 512-bit vectors?
43874 unsigned MatchSizeInBits = Match.getValueSizeInBits();
43875 if (!(MatchSizeInBits == 128 ||
43876 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
43877 return SDValue();
43878
43879 // Make sure this isn't a vector of 1 element. The perf win from using
43880   // MOVMSK diminishes with fewer elements in the reduction, but it is
43881 // generally better to get the comparison over to the GPRs as soon as
43882 // possible to reduce the number of vector ops.
43883 if (Match.getValueType().getVectorNumElements() < 2)
43884 return SDValue();
43885
43886 // Check that we are extracting a reduction of all sign bits.
43887 if (DAG.ComputeNumSignBits(Match) != BitWidth)
43888 return SDValue();
43889
43890 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
43891 SDValue Lo, Hi;
43892 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
43893 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
43894 MatchSizeInBits = Match.getValueSizeInBits();
43895 }
43896
43897 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
43898 MVT MaskSrcVT;
43899 if (64 == BitWidth || 32 == BitWidth)
43900       MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
43901                                    MatchSizeInBits / BitWidth);
43902 else
43903 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
43904
43905 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
43906 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
43907 NumElts = MaskSrcVT.getVectorNumElements();
43908 }
43909 assert((NumElts <= 32 || NumElts == 64) &&
43910 "Not expecting more than 64 elements");
43911
43912 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
43913 if (BinOp == ISD::XOR) {
43914 // parity -> (PARITY(MOVMSK X))
43915 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
43916 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
43917 }
43918
43919 SDValue CmpC;
43920 ISD::CondCode CondCode;
43921 if (BinOp == ISD::OR) {
43922 // any_of -> MOVMSK != 0
43923 CmpC = DAG.getConstant(0, DL, CmpVT);
43924 CondCode = ISD::CondCode::SETNE;
43925 } else {
43926 // all_of -> MOVMSK == ((1 << NumElts) - 1)
43927 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
43928 DL, CmpVT);
43929 CondCode = ISD::CondCode::SETEQ;
43930 }
43931
43932 // The setcc produces an i8 of 0/1, so extend that to the result width and
43933 // negate to get the final 0/-1 mask value.
43934 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
43935 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
43936 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
43937 return DAG.getNegative(Zext, DL, ExtractVT);
43938}
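// Illustrative example: for an all_of reduction of a <4 x i32> compare mask,
// the code above emits MOVMSKPS (4 sign bits) and compares the result against
// (1 << 4) - 1 == 0xF with SETEQ; any_of instead compares against 0 with SETNE.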
43939
43940 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
43941                                       const X86Subtarget &Subtarget) {
43942 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
43943 return SDValue();
43944
43945 EVT ExtractVT = Extract->getValueType(0);
43946 // Verify the type we're extracting is i32, as the output element type of
43947 // vpdpbusd is i32.
43948 if (ExtractVT != MVT::i32)
43949 return SDValue();
43950
43951 EVT VT = Extract->getOperand(0).getValueType();
43952   if (!isPowerOf2_32(VT.getVectorNumElements()))
43953     return SDValue();
43954
43955 // Match shuffle + add pyramid.
43956 ISD::NodeType BinOp;
43957 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
43958
43959 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
43960   // done by vpdpbusd computes a signed 16-bit product that will be sign-extended
43961 // before adding into the accumulator.
43962 // TODO:
43963 // We also need to verify that the multiply has at least 2x the number of bits
43964 // of the input. We shouldn't match
43965 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
43966 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
43967 // Root = Root.getOperand(0);
43968
43969 // If there was a match, we want Root to be a mul.
43970 if (!Root || Root.getOpcode() != ISD::MUL)
43971 return SDValue();
43972
43973 // Check whether we have an extend and mul pattern
43974 SDValue LHS, RHS;
43975 if (!detectExtMul(DAG, Root, LHS, RHS))
43976 return SDValue();
43977
43978 // Create the dot product instruction.
43979 SDLoc DL(Extract);
43980 unsigned StageBias;
43981 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
43982
43983 // If the original vector was wider than 4 elements, sum over the results
43984 // in the DP vector.
43985 unsigned Stages = Log2_32(VT.getVectorNumElements());
43986 EVT DpVT = DP.getValueType();
43987
43988 if (Stages > StageBias) {
43989 unsigned DpElems = DpVT.getVectorNumElements();
43990
43991 for (unsigned i = Stages - StageBias; i > 0; --i) {
43992 SmallVector<int, 16> Mask(DpElems, -1);
43993 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
43994 Mask[j] = MaskEnd + j;
43995
43996 SDValue Shuffle =
43997 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
43998 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
43999 }
44000 }
44001
44002 // Return the lowest ExtractSizeInBits bits.
44003 EVT ResVT =
44004 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44005 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
44006 DP = DAG.getBitcast(ResVT, DP);
44007 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
44008 Extract->getOperand(1));
44009}
44010
44011 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
44012                                       const X86Subtarget &Subtarget) {
44013 // PSADBW is only supported on SSE2 and up.
44014 if (!Subtarget.hasSSE2())
44015 return SDValue();
44016
44017 EVT ExtractVT = Extract->getValueType(0);
44018 // Verify the type we're extracting is either i32 or i64.
44019 // FIXME: Could support other types, but this is what we have coverage for.
44020 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
44021 return SDValue();
44022
44023 EVT VT = Extract->getOperand(0).getValueType();
44024   if (!isPowerOf2_32(VT.getVectorNumElements()))
44025     return SDValue();
44026
44027 // Match shuffle + add pyramid.
44028 ISD::NodeType BinOp;
44029 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
44030
44031 // The operand is expected to be zero extended from i8
44032 // (verified in detectZextAbsDiff).
44033   // In order to convert to i64 and above, an additional any/zero/sign
44034   // extend is expected.
44035   // The zero extend from 32 bits has no mathematical effect on the result.
44036   // Also, the sign extend is effectively a zero extend
44037   // (it extends the sign bit, which is zero).
44038 // So it is correct to skip the sign/zero extend instruction.
44039 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
44040 Root.getOpcode() == ISD::ZERO_EXTEND ||
44041 Root.getOpcode() == ISD::ANY_EXTEND))
44042 Root = Root.getOperand(0);
44043
44044   // If there was a match, we want Root to be the ABS node at the root of an
44045   // abs-diff pattern.
44046 if (!Root || Root.getOpcode() != ISD::ABS)
44047 return SDValue();
44048
44049   // Check whether we have an abs-diff pattern feeding into the ABS node.
44050 SDValue Zext0, Zext1;
44051 if (!detectZextAbsDiff(Root, Zext0, Zext1))
44052 return SDValue();
44053
44054 // Create the SAD instruction.
44055 SDLoc DL(Extract);
44056 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
44057
44058 // If the original vector was wider than 8 elements, sum over the results
44059 // in the SAD vector.
44060 unsigned Stages = Log2_32(VT.getVectorNumElements());
44061 EVT SadVT = SAD.getValueType();
44062 if (Stages > 3) {
44063 unsigned SadElems = SadVT.getVectorNumElements();
44064
44065 for(unsigned i = Stages - 3; i > 0; --i) {
44066 SmallVector<int, 16> Mask(SadElems, -1);
44067 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44068 Mask[j] = MaskEnd + j;
44069
44070 SDValue Shuffle =
44071 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
44072 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
44073 }
44074 }
44075
44076 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
44077 // Return the lowest ExtractSizeInBits bits.
44078 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44079 SadVT.getSizeInBits() / ExtractSizeInBits);
44080 SAD = DAG.getBitcast(ResVT, SAD);
44081 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
44082 Extract->getOperand(1));
44083}
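// Illustrative reduction step for the shuffle+add loop above: with SadElems == 4
// and two stages remaining, the masks are {2, 3, -1, -1} and then {1, -1, -1, -1},
// i.e. each step shuffles the upper half of the live elements down and adds it
// onto the lower half.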
44084
44085// If this extract is from a loaded vector value and will be used as an
44086// integer, that requires a potentially expensive XMM -> GPR transfer.
44087// Additionally, if we can convert to a scalar integer load, that will likely
44088// be folded into a subsequent integer op.
44089// Note: SrcVec might not have a VecVT type, but it must be the same size.
44090// Note: Unlike the related fold for this in DAGCombiner, this is not limited
44091// to a single-use of the loaded vector. For the reasons above, we
44092// expect this to be profitable even if it creates an extra load.
44093static SDValue
44094 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
44095                              const SDLoc &dl, SelectionDAG &DAG,
44096                              TargetLowering::DAGCombinerInfo &DCI) {
44097   assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44098 "Only EXTRACT_VECTOR_ELT supported so far");
44099
44100 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44101 EVT VT = N->getValueType(0);
44102
44103 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
44104 return Use->getOpcode() == ISD::STORE ||
44105 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
44106 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
44107 });
44108
44109 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
44110 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
44111 VecVT.getVectorElementType() == VT &&
44112 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
44113 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
44114 SDValue NewPtr = TLI.getVectorElementPointer(
44115 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
44116 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
44117 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
44118 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
44119 SDValue Load =
44120 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
44121 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
44122 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
44123 return Load;
44124 }
44125
44126 return SDValue();
44127}
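// Illustrative example: extracting element 2 of a loaded <4 x i32> becomes a
// scalar i32 load at offset 8 from the original pointer, avoiding an
// XMM -> GPR transfer.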
44128
44129// Attempt to peek through a target shuffle and extract the scalar from the
44130// source.
44131 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
44132                                          TargetLowering::DAGCombinerInfo &DCI,
44133                                          const X86Subtarget &Subtarget) {
44134 if (DCI.isBeforeLegalizeOps())
44135 return SDValue();
44136
44137 SDLoc dl(N);
44138 SDValue Src = N->getOperand(0);
44139 SDValue Idx = N->getOperand(1);
44140
44141 EVT VT = N->getValueType(0);
44142 EVT SrcVT = Src.getValueType();
44143 EVT SrcSVT = SrcVT.getVectorElementType();
44144 unsigned SrcEltBits = SrcSVT.getSizeInBits();
44145 unsigned NumSrcElts = SrcVT.getVectorNumElements();
44146
44147 // Don't attempt this for boolean mask vectors or unknown extraction indices.
44148 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
44149 return SDValue();
44150
44151 const APInt &IdxC = N->getConstantOperandAPInt(1);
44152 if (IdxC.uge(NumSrcElts))
44153 return SDValue();
44154
44155 SDValue SrcBC = peekThroughBitcasts(Src);
44156
44157 // Handle extract(bitcast(broadcast(scalar_value))).
44158 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
44159 SDValue SrcOp = SrcBC.getOperand(0);
44160 EVT SrcOpVT = SrcOp.getValueType();
44161 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
44162 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
44163 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
44164 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
44165 // TODO support non-zero offsets.
44166 if (Offset == 0) {
44167 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
44168 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
44169 return SrcOp;
44170 }
44171 }
44172 }
44173
44174 // If we're extracting a single element from a broadcast load and there are
44175 // no other users, just create a single load.
44176 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
44177 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
44178 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
44179 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
44180 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
44181 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
44182 MemIntr->getBasePtr(),
44183 MemIntr->getPointerInfo(),
44184 MemIntr->getOriginalAlign(),
44185 MemIntr->getMemOperand()->getFlags());
44186 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
44187 return Load;
44188 }
44189 }
44190
44191 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
44192 // TODO: Move to DAGCombine?
44193 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
44194 SrcBC.getValueType().isInteger() &&
44195 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
44196 SrcBC.getScalarValueSizeInBits() ==
44197 SrcBC.getOperand(0).getValueSizeInBits()) {
44198 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
44199 if (IdxC.ult(Scale)) {
44200 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
44201 SDValue Scl = SrcBC.getOperand(0);
44202 EVT SclVT = Scl.getValueType();
44203 if (Offset) {
44204 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
44205 DAG.getShiftAmountConstant(Offset, SclVT, dl));
44206 }
44207 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
44208 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
44209 return Scl;
44210 }
44211 }
44212
44213 // Handle extract(truncate(x)) for 0'th index.
44214 // TODO: Treat this as a faux shuffle?
44215 // TODO: When can we use this for general indices?
44216 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
44217 (SrcVT.getSizeInBits() % 128) == 0) {
44218 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
44219 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
44220 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
44221 Idx);
44222 }
44223
44224 // We can only legally extract other elements from 128-bit vectors and in
44225 // certain circumstances, depending on SSE-level.
44226 // TODO: Investigate float/double extraction if it will be just stored.
44227 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
44228 unsigned Idx) {
44229 EVT VecSVT = VecVT.getScalarType();
44230 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
44231 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
44232 VecSVT == MVT::i64)) {
44233 unsigned EltSizeInBits = VecSVT.getSizeInBits();
44234 unsigned NumEltsPerLane = 128 / EltSizeInBits;
44235 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
44236 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
44237 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
44238 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
44239 Idx &= (NumEltsPerLane - 1);
44240 }
44241 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
44242 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
44243 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
44244 DAG.getBitcast(VecVT, Vec),
44245 DAG.getIntPtrConstant(Idx, dl));
44246 }
44247 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
44248 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
44249 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
44250 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
44251 DAG.getTargetConstant(Idx, dl, MVT::i8));
44252 }
44253 return SDValue();
44254 };
44255
44256 // Resolve the target shuffle inputs and mask.
44257   SmallVector<int, 16> Mask;
44258   SmallVector<SDValue, 2> Ops;
44259   if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
44260 return SDValue();
44261
44262 // Shuffle inputs must be the same size as the result.
44263 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
44264 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
44265 }))
44266 return SDValue();
44267
44268 // Attempt to narrow/widen the shuffle mask to the correct size.
44269 if (Mask.size() != NumSrcElts) {
44270 if ((NumSrcElts % Mask.size()) == 0) {
44271 SmallVector<int, 16> ScaledMask;
44272 int Scale = NumSrcElts / Mask.size();
44273 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
44274 Mask = std::move(ScaledMask);
44275 } else if ((Mask.size() % NumSrcElts) == 0) {
44276 // Simplify Mask based on demanded element.
44277 int ExtractIdx = (int)IdxC.getZExtValue();
44278 int Scale = Mask.size() / NumSrcElts;
44279 int Lo = Scale * ExtractIdx;
44280 int Hi = Scale * (ExtractIdx + 1);
44281 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
44282 if (i < Lo || Hi <= i)
44283 Mask[i] = SM_SentinelUndef;
44284
44285 SmallVector<int, 16> WidenedMask;
44286 while (Mask.size() > NumSrcElts &&
44287 canWidenShuffleElements(Mask, WidenedMask))
44288 Mask = std::move(WidenedMask);
44289 }
44290 }
44291
44292 // If narrowing/widening failed, see if we can extract+zero-extend.
44293 int ExtractIdx;
44294 EVT ExtractVT;
44295 if (Mask.size() == NumSrcElts) {
44296 ExtractIdx = Mask[IdxC.getZExtValue()];
44297 ExtractVT = SrcVT;
44298 } else {
44299 unsigned Scale = Mask.size() / NumSrcElts;
44300 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
44301 return SDValue();
44302 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
44303 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
44304 return SDValue();
44305 ExtractIdx = Mask[ScaledIdx];
44306 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
44307 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
44308 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
44309 "Failed to widen vector type");
44310 }
44311
44312 // If the shuffle source element is undef/zero then we can just accept it.
44313 if (ExtractIdx == SM_SentinelUndef)
44314 return DAG.getUNDEF(VT);
44315
44316 if (ExtractIdx == SM_SentinelZero)
44317 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
44318 : DAG.getConstant(0, dl, VT);
44319
44320 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
44321 ExtractIdx = ExtractIdx % Mask.size();
44322 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
44323 return DAG.getZExtOrTrunc(V, dl, VT);
44324
44325 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
44326     if (SDValue V = combineExtractFromVectorLoad(
44327             N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
44328 return V;
44329
44330 return SDValue();
44331}
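// Illustrative example: extracting element 0 from a shuffle with mask
// <2, 3, 0, 1> resolves Mask[0] == 2, so the code above extracts element 2
// of the corresponding shuffle source instead.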
44332
44333/// Extracting a scalar FP value from vector element 0 is free, so extract each
44334/// operand first, then perform the math as a scalar op.
44335 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
44336                                  const X86Subtarget &Subtarget) {
44337 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
44338 SDValue Vec = ExtElt->getOperand(0);
44339 SDValue Index = ExtElt->getOperand(1);
44340 EVT VT = ExtElt->getValueType(0);
44341 EVT VecVT = Vec.getValueType();
44342
44343 // TODO: If this is a unary/expensive/expand op, allow extraction from a
44344 // non-zero element because the shuffle+scalar op will be cheaper?
44345 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
44346 return SDValue();
44347
44348 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
44349 // extract, the condition code), so deal with those as a special-case.
44350 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
44351 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
44352 if (OpVT != MVT::f32 && OpVT != MVT::f64)
44353 return SDValue();
44354
44355 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
44356 SDLoc DL(ExtElt);
44357 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44358 Vec.getOperand(0), Index);
44359 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44360 Vec.getOperand(1), Index);
44361 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
44362 }
44363
44364 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
44365 VT != MVT::f64)
44366 return SDValue();
44367
44368 // Vector FP selects don't fit the pattern of FP math ops (because the
44369 // condition has a different type and we have to change the opcode), so deal
44370 // with those here.
44371 // FIXME: This is restricted to pre type legalization by ensuring the setcc
44372 // has i1 elements. If we loosen this we need to convert vector bool to a
44373 // scalar bool.
44374 if (Vec.getOpcode() == ISD::VSELECT &&
44375 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
44376 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
44377 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
44378 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
44379 SDLoc DL(ExtElt);
44380     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
44381                                Vec.getOperand(0).getValueType().getScalarType(),
44382                                Vec.getOperand(0), Index);
44383 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44384 Vec.getOperand(1), Index);
44385 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44386 Vec.getOperand(2), Index);
44387 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
44388 }
44389
44390 // TODO: This switch could include FNEG and the x86-specific FP logic ops
44391 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
44392 // missed load folding and fma+fneg combining.
44393 switch (Vec.getOpcode()) {
44394 case ISD::FMA: // Begin 3 operands
44395 case ISD::FMAD:
44396 case ISD::FADD: // Begin 2 operands
44397 case ISD::FSUB:
44398 case ISD::FMUL:
44399 case ISD::FDIV:
44400 case ISD::FREM:
44401 case ISD::FCOPYSIGN:
44402 case ISD::FMINNUM:
44403 case ISD::FMAXNUM:
44404 case ISD::FMINNUM_IEEE:
44405 case ISD::FMAXNUM_IEEE:
44406 case ISD::FMAXIMUM:
44407 case ISD::FMINIMUM:
44408 case X86ISD::FMAX:
44409 case X86ISD::FMIN:
44410 case ISD::FABS: // Begin 1 operand
44411 case ISD::FSQRT:
44412 case ISD::FRINT:
44413 case ISD::FCEIL:
44414 case ISD::FTRUNC:
44415 case ISD::FNEARBYINT:
44416 case ISD::FROUNDEVEN:
44417 case ISD::FROUND:
44418 case ISD::FFLOOR:
44419 case X86ISD::FRCP:
44420 case X86ISD::FRSQRT: {
44421 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
44422 SDLoc DL(ExtElt);
44423     SmallVector<SDValue, 4> ExtOps;
44424     for (SDValue Op : Vec->ops())
44425 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
44426 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
44427 }
44428 default:
44429 return SDValue();
44430 }
44431 llvm_unreachable("All opcodes should return within switch");
44432}
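// Illustrative instance of the rule above: extractelt (fadd <4 x float> X, Y), 0
// becomes fadd (extractelt X, 0), (extractelt Y, 0), since extracting lane 0 of
// a vector FP value is free.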
44433
44434/// Try to convert a vector reduction sequence composed of binops and shuffles
44435/// into horizontal ops.
44436 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
44437                                      const X86Subtarget &Subtarget) {
44438 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
44439
44440   // We need at least SSE2 to do anything here.
44441 if (!Subtarget.hasSSE2())
44442 return SDValue();
44443
44444 ISD::NodeType Opc;
44445 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
44446 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
44447 if (!Rdx)
44448 return SDValue();
44449
44450 SDValue Index = ExtElt->getOperand(1);
44451   assert(isNullConstant(Index) &&
44452          "Reduction doesn't end in an extract from index 0");
44453
44454 EVT VT = ExtElt->getValueType(0);
44455 EVT VecVT = Rdx.getValueType();
44456 if (VecVT.getScalarType() != VT)
44457 return SDValue();
44458
44459 SDLoc DL(ExtElt);
44460 unsigned NumElts = VecVT.getVectorNumElements();
44461 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
44462
44463 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
44464 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
44465 if (V.getValueType() == MVT::v4i8) {
44466 if (ZeroExtend && Subtarget.hasSSE41()) {
44467 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
44468 DAG.getConstant(0, DL, MVT::v4i32),
44469 DAG.getBitcast(MVT::i32, V),
44470 DAG.getIntPtrConstant(0, DL));
44471 return DAG.getBitcast(MVT::v16i8, V);
44472 }
44473 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
44474 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
44475 : DAG.getUNDEF(MVT::v4i8));
44476 }
44477 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
44478 DAG.getUNDEF(MVT::v8i8));
44479 };
44480
44481 // vXi8 mul reduction - promote to vXi16 mul reduction.
44482 if (Opc == ISD::MUL) {
44483 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
44484 return SDValue();
44485 if (VecVT.getSizeInBits() >= 128) {
44486 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
44487 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
44488 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
44489 Lo = DAG.getBitcast(WideVT, Lo);
44490 Hi = DAG.getBitcast(WideVT, Hi);
44491 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
44492 while (Rdx.getValueSizeInBits() > 128) {
44493 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44494 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
44495 }
44496 } else {
44497 Rdx = WidenToV16I8(Rdx, false);
44498 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
44499 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
44500 }
44501 if (NumElts >= 8)
44502 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44503 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44504 {4, 5, 6, 7, -1, -1, -1, -1}));
44505 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44506 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44507 {2, 3, -1, -1, -1, -1, -1, -1}));
44508 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44509 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44510 {1, -1, -1, -1, -1, -1, -1, -1}));
44511 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44512 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44513 }
44514
44515 // vXi8 add reduction - sub 128-bit vector.
44516 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
44517 Rdx = WidenToV16I8(Rdx, true);
44518 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
44519 DAG.getConstant(0, DL, MVT::v16i8));
44520 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44521 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44522 }
44523
44524 // Must be a >=128-bit vector with pow2 elements.
44525 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
44526 return SDValue();
44527
44528 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
44529 if (VT == MVT::i8) {
44530 while (Rdx.getValueSizeInBits() > 128) {
44531 SDValue Lo, Hi;
44532 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44533 VecVT = Lo.getValueType();
44534 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
44535 }
44536 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
44537
44538     SDValue Hi = DAG.getVectorShuffle(
44539         MVT::v16i8, DL, Rdx, Rdx,
44540 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
44541 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
44542 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
44543 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
44544 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44545 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44546 }
44547
44548 // See if we can use vXi8 PSADBW add reduction for larger zext types.
44549 // If the source vector values are 0-255, then we can use PSADBW to
44550 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
44551   // TODO: See if it's worth avoiding vXi16/i32 truncations?
44552 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
44553 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
44554 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
44555 Subtarget.hasAVX512())) {
44556 if (Rdx.getValueType() == MVT::v8i16) {
44557 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
44558 DAG.getUNDEF(MVT::v8i16));
44559 } else {
44560 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
44561 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
44562 if (ByteVT.getSizeInBits() < 128)
44563 Rdx = WidenToV16I8(Rdx, true);
44564 }
44565
44566 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
44567 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44568 ArrayRef<SDValue> Ops) {
44569 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
44570 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
44571 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
44572 };
44573 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
44574 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
44575
44576 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
44577 while (Rdx.getValueSizeInBits() > 128) {
44578 SDValue Lo, Hi;
44579 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44580 VecVT = Lo.getValueType();
44581 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
44582 }
44583 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
44584
44585 if (NumElts > 8) {
44586 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
44587 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
44588 }
44589
44590 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
44591 Rdx = DAG.getBitcast(VecVT, Rdx);
44592 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44593 }
44594
44595   // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
44596 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
44597 return SDValue();
44598
44599 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
44600
44601 // 256-bit horizontal instructions operate on 128-bit chunks rather than
44602 // across the whole vector, so we need an extract + hop preliminary stage.
44603 // This is the only step where the operands of the hop are not the same value.
44604 // TODO: We could extend this to handle 512-bit or even longer vectors.
44605 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
44606 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
44607 unsigned NumElts = VecVT.getVectorNumElements();
44608 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
44609 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
44610 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
44611 VecVT = Rdx.getValueType();
44612 }
44613 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
44614 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
44615 return SDValue();
44616
44617 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
44618 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
44619 for (unsigned i = 0; i != ReductionSteps; ++i)
44620 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
44621
44622 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44623}
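// Illustrative example: the v16i8 add-reduction path above adds the upper 8
// bytes onto the lower 8 and then uses PSADBW against zero, leaving the final
// sum in element 0 of the result.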
44624
44625/// Detect vector gather/scatter index generation and convert it from being a
44626/// bunch of shuffles and extracts into a somewhat faster sequence.
44627/// For i686, the best sequence is apparently storing the value and loading
44628/// scalars back, while for x64 we should use 64-bit extracts and shifts.
44629 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
44630                                        TargetLowering::DAGCombinerInfo &DCI,
44631                                        const X86Subtarget &Subtarget) {
44632 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
44633 return NewOp;
44634
44635 SDValue InputVector = N->getOperand(0);
44636 SDValue EltIdx = N->getOperand(1);
44637 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
44638
44639 EVT SrcVT = InputVector.getValueType();
44640 EVT VT = N->getValueType(0);
44641 SDLoc dl(InputVector);
44642 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
44643 unsigned NumSrcElts = SrcVT.getVectorNumElements();
44644 unsigned NumEltBits = VT.getScalarSizeInBits();
44645 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44646
44647 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
44648 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
44649
44650 // Integer Constant Folding.
44651 if (CIdx && VT.isInteger()) {
44652 APInt UndefVecElts;
44653 SmallVector<APInt, 16> EltBits;
44654 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
44655 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
44656 EltBits, /*AllowWholeUndefs*/ true,
44657 /*AllowPartialUndefs*/ false)) {
44658 uint64_t Idx = CIdx->getZExtValue();
44659 if (UndefVecElts[Idx])
44660 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
44661 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
44662 }
44663
44664     // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
44665     // Improves lowering of bool masks on Rust, which splits them into a byte array.
44666 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
44667 SDValue Src = peekThroughBitcasts(InputVector);
44668 if (Src.getValueType().getScalarType() == MVT::i1 &&
44669 TLI.isTypeLegal(Src.getValueType())) {
44670 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
44671 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
44672 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
44673 return DAG.getBitcast(VT, Sub);
44674 }
44675 }
44676 }
44677
44678 if (IsPextr) {
44679 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
44680 DCI))
44681 return SDValue(N, 0);
44682
44683 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
44684 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
44685 InputVector.getOpcode() == X86ISD::PINSRW) &&
44686 InputVector.getOperand(2) == EltIdx) {
44687 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
44688 "Vector type mismatch");
44689 SDValue Scl = InputVector.getOperand(1);
44690 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
44691 return DAG.getZExtOrTrunc(Scl, dl, VT);
44692 }
44693
44694 // TODO - Remove this once we can handle the implicit zero-extension of
44695 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
44696 // combineBasicSADPattern.
44697 return SDValue();
44698 }
44699
44700   // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
44701 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
44702 InputVector.getOpcode() == ISD::BITCAST &&
44703 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
44704 isNullConstant(EltIdx) && InputVector.hasOneUse())
44705 return DAG.getBitcast(VT, InputVector);
44706
44707 // Detect mmx to i32 conversion through a v2i32 elt extract.
44708 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
44709 InputVector.getOpcode() == ISD::BITCAST &&
44710 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
44711 isNullConstant(EltIdx) && InputVector.hasOneUse())
44712 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
44713 InputVector.getOperand(0));
44714
44715 // Check whether this extract is the root of a sum of absolute differences
44716 // pattern. This has to be done here because we really want it to happen
44717   // pre-legalization.
44718 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
44719 return SAD;
44720
44721 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
44722 return VPDPBUSD;
44723
44724 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
44725 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
44726 return Cmp;
44727
44728 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
44729 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
44730 return MinMax;
44731
44732   // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion, etc.
44733 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
44734 return V;
44735
44736 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
44737 return V;
44738
44739 if (CIdx)
44740     if (SDValue V = combineExtractFromVectorLoad(
44741             N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
44742 dl, DAG, DCI))
44743 return V;
44744
44745 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
44746 // and then testing the relevant element.
44747 //
44748 // Note that we only combine extracts on the *same* result number, i.e.
44749 // t0 = merge_values a0, a1, a2, a3
44750 // i1 = extract_vector_elt t0, Constant:i64<2>
44751 // i1 = extract_vector_elt t0, Constant:i64<3>
44752 // but not
44753 // i1 = extract_vector_elt t0:1, Constant:i64<2>
44754 // since the latter would need its own MOVMSK.
44755 if (SrcVT.getScalarType() == MVT::i1) {
44756 bool IsVar = !CIdx;
44757 SmallVector<SDNode *, 16> BoolExtracts;
44758 unsigned ResNo = InputVector.getResNo();
44759 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
44760 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44761 Use->getOperand(0).getResNo() == ResNo &&
44762 Use->getValueType(0) == MVT::i1) {
44763 BoolExtracts.push_back(Use);
44764 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
44765 return true;
44766 }
44767 return false;
44768 };
44769 // TODO: Can we drop the oneuse check for constant extracts?
44770 if (all_of(InputVector->uses(), IsBoolExtract) &&
44771 (IsVar || BoolExtracts.size() > 1)) {
44772 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
44773 if (SDValue BC =
44774 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
44775 for (SDNode *Use : BoolExtracts) {
44776 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
44777 // Mask = 1 << MaskIdx
44778 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
44779 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
44780 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
44781 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
44782 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
44783 DCI.CombineTo(Use, Res);
44784 }
44785 return SDValue(N, 0);
44786 }
44787 }
44788 }
44789
44790 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
44791 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
44792 SDValue TruncSrc = InputVector.getOperand(0);
44793 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
44794 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
44795 SDValue NewExt =
44796 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
44797 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
44798 }
44799 }
44800
44801 return SDValue();
44802}
44803
44804// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
44805// This is more or less the reverse of combineBitcastvxi1.
44806 static SDValue combineToExtendBoolVectorInReg(
44807     unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
44808 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
44809 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
44810 Opcode != ISD::ANY_EXTEND)
44811 return SDValue();
44812 if (!DCI.isBeforeLegalizeOps())
44813 return SDValue();
44814 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
44815 return SDValue();
44816
44817 EVT SVT = VT.getScalarType();
44818 EVT InSVT = N0.getValueType().getScalarType();
44819 unsigned EltSizeInBits = SVT.getSizeInBits();
44820
44821 // Input type must be extending a bool vector (bit-casted from a scalar
44822 // integer) to legal integer types.
44823 if (!VT.isVector())
44824 return SDValue();
44825 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
44826 return SDValue();
44827 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
44828 return SDValue();
44829
44830 SDValue N00 = N0.getOperand(0);
44831 EVT SclVT = N00.getValueType();
44832 if (!SclVT.isScalarInteger())
44833 return SDValue();
44834
44835 SDValue Vec;
44836 SmallVector<int> ShuffleMask;
44837 unsigned NumElts = VT.getVectorNumElements();
44838 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
44839
44840 // Broadcast the scalar integer to the vector elements.
44841 if (NumElts > EltSizeInBits) {
44842 // If the scalar integer is greater than the vector element size, then we
44843 // must split it down into sub-sections for broadcasting. For example:
44844 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
44845 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
44846 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
44847 unsigned Scale = NumElts / EltSizeInBits;
44848 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
44849 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
44850 Vec = DAG.getBitcast(VT, Vec);
44851
44852 for (unsigned i = 0; i != Scale; ++i)
44853 ShuffleMask.append(EltSizeInBits, i);
44854 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
44855 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
44856 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
44857 // If we have register broadcast instructions, use the scalar size as the
44858 // element type for the shuffle. Then cast to the wider element type. The
44859 // widened bits won't be used, and this might allow the use of a broadcast
44860 // load.
44861 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
44862 unsigned Scale = EltSizeInBits / NumElts;
44863 EVT BroadcastVT =
44864 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
44865 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
44866 ShuffleMask.append(NumElts * Scale, 0);
44867 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
44868 Vec = DAG.getBitcast(VT, Vec);
44869 } else {
44870 // For smaller scalar integers, we can simply any-extend it to the vector
44871 // element size (we don't care about the upper bits) and broadcast it to all
44872 // elements.
44873 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
44874 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
44875 ShuffleMask.append(NumElts, 0);
44876 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
44877 }
44878
44879 // Now, mask the relevant bit in each element.
44880   SmallVector<SDValue, 32> Bits;
44881   for (unsigned i = 0; i != NumElts; ++i) {
44882 int BitIdx = (i % EltSizeInBits);
44883 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
44884 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
44885 }
44886 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
44887 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
44888
44889 // Compare against the bitmask and extend the result.
44890 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
44891 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
44892 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
44893
44894 // For SEXT, this is now done, otherwise shift the result down for
44895 // zero-extension.
44896 if (Opcode == ISD::SIGN_EXTEND)
44897 return Vec;
44898 return DAG.getNode(ISD::SRL, DL, VT, Vec,
44899 DAG.getConstant(EltSizeInBits - 1, DL, VT));
44900}
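// Illustrative example: sign-extending (v8i1 bitcast (i8 %b)) to v8i16
// broadcasts %b, ANDs lane i with (1 << i), and compares the result against
// that same mask, so lane i becomes all-ones iff bit i of %b is set; for
// zero-extension the final shift reduces each lane to 0 or 1.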
44901
44902/// If a vector select has an operand that is -1 or 0, try to simplify the
44903/// select to a bitwise logic operation.
44904/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
44905static SDValue
44906 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
44907                                  TargetLowering::DAGCombinerInfo &DCI,
44908                                  const X86Subtarget &Subtarget) {
44909 SDValue Cond = N->getOperand(0);
44910 SDValue LHS = N->getOperand(1);
44911 SDValue RHS = N->getOperand(2);
44912 EVT VT = LHS.getValueType();
44913 EVT CondVT = Cond.getValueType();
44914 SDLoc DL(N);
44915 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44916
44917 if (N->getOpcode() != ISD::VSELECT)
44918 return SDValue();
44919
44920 assert(CondVT.isVector() && "Vector select expects a vector selector!");
44921
44922 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
44923 // TODO: Can we assert that both operands are not zeros (because that should
44924 // get simplified at node creation time)?
44925 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
44926 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
44927
44928 // If both inputs are 0/undef, create a complete zero vector.
44929 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
44930 if (TValIsAllZeros && FValIsAllZeros) {
44931 if (VT.isFloatingPoint())
44932 return DAG.getConstantFP(0.0, DL, VT);
44933 return DAG.getConstant(0, DL, VT);
44934 }
44935
44936 // To use the condition operand as a bitwise mask, it must have elements that
44937   // are the same size as the select elements. I.e., the condition operand must
44938 // have already been promoted from the IR select condition type <N x i1>.
44939 // Don't check if the types themselves are equal because that excludes
44940 // vector floating-point selects.
44941 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
44942 return SDValue();
44943
44944 // Try to invert the condition if true value is not all 1s and false value is
44945 // not all 0s. Only do this if the condition has one use.
44946 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
44947 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
44948 // Check if the selector will be produced by CMPP*/PCMP*.
44949 Cond.getOpcode() == ISD::SETCC &&
44950 // Check if SETCC has already been promoted.
44951 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
44952 CondVT) {
44953 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
44954
44955 if (TValIsAllZeros || FValIsAllOnes) {
44956 SDValue CC = Cond.getOperand(2);
44957       ISD::CondCode NewCC = ISD::getSetCCInverse(
44958           cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
44959 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
44960 NewCC);
44961 std::swap(LHS, RHS);
44962 TValIsAllOnes = FValIsAllOnes;
44963 FValIsAllZeros = TValIsAllZeros;
44964 }
44965 }
44966
44967 // Cond value must be 'sign splat' to be converted to a logical op.
44968 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
44969 return SDValue();
44970
44971 // vselect Cond, 111..., 000... -> Cond
44972 if (TValIsAllOnes && FValIsAllZeros)
44973 return DAG.getBitcast(VT, Cond);
44974
44975 if (!TLI.isTypeLegal(CondVT))
44976 return SDValue();
44977
44978 // vselect Cond, 111..., X -> or Cond, X
44979 if (TValIsAllOnes) {
44980 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
44981 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
44982 return DAG.getBitcast(VT, Or);
44983 }
44984
44985 // vselect Cond, X, 000... -> and Cond, X
44986 if (FValIsAllZeros) {
44987 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
44988 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
44989 return DAG.getBitcast(VT, And);
44990 }
44991
44992 // vselect Cond, 000..., X -> andn Cond, X
44993 if (TValIsAllZeros) {
44994 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
44995 SDValue AndN;
44996 // The canonical form differs for i1 vectors - x86andnp is not used
44997 if (CondVT.getScalarType() == MVT::i1)
44998 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
44999 CastRHS);
45000 else
45001 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
45002 return DAG.getBitcast(VT, AndN);
45003 }
45004
45005 return SDValue();
45006}
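// Illustrative instance of the rules above: vselect Cond, all-ones, X becomes
// (or Cond, X) and vselect Cond, X, all-zeros becomes (and Cond, X), provided
// Cond is a sign-splat mask of the same element width as X.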
45007
45008/// If both arms of a vector select are concatenated vectors, split the select,
45009/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
45010/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
45011/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
45012 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
45013                                   const X86Subtarget &Subtarget) {
45014 unsigned Opcode = N->getOpcode();
45015 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
45016 return SDValue();
45017
45018 // TODO: Split 512-bit vectors too?
45019 EVT VT = N->getValueType(0);
45020 if (!VT.is256BitVector())
45021 return SDValue();
45022
45023 // TODO: Split as long as any 2 of the 3 operands are concatenated?
45024 SDValue Cond = N->getOperand(0);
45025 SDValue TVal = N->getOperand(1);
45026 SDValue FVal = N->getOperand(2);
45027 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
45028 !isFreeToSplitVector(TVal.getNode(), DAG) ||
45029 !isFreeToSplitVector(FVal.getNode(), DAG))
45030 return SDValue();
45031
45032 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
45033 ArrayRef<SDValue> Ops) {
45034 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
45035 };
45036 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
45037 makeBlend, /*CheckBWI*/ false);
45038}
45039
45040 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
45041   SDValue Cond = N->getOperand(0);
45042 SDValue LHS = N->getOperand(1);
45043 SDValue RHS = N->getOperand(2);
45044 SDLoc DL(N);
45045
45046 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
45047 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
45048 if (!TrueC || !FalseC)
45049 return SDValue();
45050
45051 // Don't do this for crazy integer types.
45052 EVT VT = N->getValueType(0);
45053 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
45054 return SDValue();
45055
45056 // We're going to use the condition bit in math or logic ops. We could allow
45057 // this with a wider condition value (post-legalization it becomes an i8),
45058 // but if nothing is creating selects that late, it doesn't matter.
45059 if (Cond.getValueType() != MVT::i1)
45060 return SDValue();
45061
45062 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
45063 // 3, 5, or 9 with i32/i64, so those get transformed too.
45064 // TODO: For constants that overflow or do not differ by power-of-2 or small
45065 // multiplier, convert to 'and' + 'add'.
45066 const APInt &TrueVal = TrueC->getAPIntValue();
45067 const APInt &FalseVal = FalseC->getAPIntValue();
45068
45069 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
45070 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
45071 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
45072 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45073 if (CC == ISD::SETEQ || CC == ISD::SETNE)
45074 return SDValue();
45075 }
45076
45077 bool OV;
45078 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
45079 if (OV)
45080 return SDValue();
45081
45082 APInt AbsDiff = Diff.abs();
45083 if (AbsDiff.isPowerOf2() ||
45084 ((VT == MVT::i32 || VT == MVT::i64) &&
45085 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
45086
45087 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
45088 // of the condition can usually be folded into a compare predicate, but even
45089 // without that, the sequence should be cheaper than a CMOV alternative.
45090 if (TrueVal.slt(FalseVal)) {
45091 Cond = DAG.getNOT(DL, Cond, MVT::i1);
45092 std::swap(TrueC, FalseC);
45093 }
45094
45095 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
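// e.g. (illustrative) select Cond, 7, 4 --> (zext(Cond) * 3) + 4, which LEA
// can materialize for i32/i64: Cond == 1 gives 1*3+4 == 7, Cond == 0 gives 4.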
45096 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
45097
45098 // Multiply condition by the difference if non-one.
45099 if (!AbsDiff.isOne())
45100 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
45101
45102 // Add the base if non-zero.
45103 if (!FalseC->isZero())
45104 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
45105
45106 return R;
45107 }
45108
45109 return SDValue();
45110}
45111
45112/// If this is a *dynamic* select (non-constant condition) and we can match
45113/// this node with one of the variable blend instructions, restructure the
45114/// condition so that blends can use the high (sign) bit of each element.
45115/// This function will also call SimplifyDemandedBits on already created
45116/// BLENDV to perform additional simplifications.
45117 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
45118 TargetLowering::DAGCombinerInfo &DCI,
45119 const X86Subtarget &Subtarget) {
45120 SDValue Cond = N->getOperand(0);
45121 if ((N->getOpcode() != ISD::VSELECT &&
45122 N->getOpcode() != X86ISD::BLENDV) ||
45123 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
45124 return SDValue();
45125
45126 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45127 unsigned BitWidth = Cond.getScalarValueSizeInBits();
45128 EVT VT = N->getValueType(0);
45129
45130 // We can only handle the cases where VSELECT is directly legal on the
45131 // subtarget. We custom lower VSELECT nodes with constant conditions and
45132 // this makes it hard to see whether a dynamic VSELECT will correctly
45133 // lower, so we both check the operation's status and explicitly handle the
45134 // cases where a *dynamic* blend will fail even though a constant-condition
45135 // blend could be custom lowered.
45136 // FIXME: We should find a better way to handle this class of problems.
45137 // Potentially, we should combine constant-condition vselect nodes
45138 // pre-legalization into shuffles and not mark as many types as custom
45139 // lowered.
45140 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
45141 return SDValue();
45142 // FIXME: We don't support i16-element blends currently. We could and
45143 // should support them by making *all* the bits in the condition be set
45144 // rather than just the high bit and using an i8-element blend.
45145 if (VT.getVectorElementType() == MVT::i16)
45146 return SDValue();
45147 // Dynamic blending was only available from SSE4.1 onward.
45148 if (VT.is128BitVector() && !Subtarget.hasSSE41())
45149 return SDValue();
45150 // Byte blends are only available in AVX2
45151 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
45152 return SDValue();
45153 // There are no 512-bit blend instructions that use sign bits.
45154 if (VT.is512BitVector())
45155 return SDValue();
45156
45157 // Don't optimize before the condition has been transformed to a legal type
45158 // and don't ever optimize vector selects that map to AVX512 mask-registers.
45159 if (BitWidth < 8 || BitWidth > 64)
45160 return SDValue();
45161
45162 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
45163 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
45164 UI != UE; ++UI)
45165 if ((UI->getOpcode() != ISD::VSELECT &&
45166 UI->getOpcode() != X86ISD::BLENDV) ||
45167 UI.getOperandNo() != 0)
45168 return false;
45169
45170 return true;
45171 };
45172
45173 APInt DemandedBits(APInt::getSignMask(BitWidth));
45174
45175 if (OnlyUsedAsSelectCond(Cond)) {
45176 KnownBits Known;
45177 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
45178 !DCI.isBeforeLegalizeOps());
45179 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
45180 return SDValue();
45181
45182 // If we changed the computation somewhere in the DAG, this change will
45183 // affect all users of Cond. Update all the nodes so that we do not use
45184 // the generic VSELECT anymore. Otherwise, we may perform wrong
45185 // optimizations as we messed with the actual expectation for the vector
45186 // boolean values.
45187 for (SDNode *U : Cond->uses()) {
45188 if (U->getOpcode() == X86ISD::BLENDV)
45189 continue;
45190
45191 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
45192 Cond, U->getOperand(1), U->getOperand(2));
45193 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
45194 DCI.AddToWorklist(U);
45195 }
45196 DCI.CommitTargetLoweringOpt(TLO);
45197 return SDValue(N, 0);
45198 }
45199
45200 // Otherwise we can still at least try to simplify multiple use bits.
45201 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
45202 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
45203 N->getOperand(1), N->getOperand(2));
45204
45205 return SDValue();
45206}
45207
45208// Try to match:
45209// (or (and (M, (sub 0, X)), (pandn M, X)))
45210// which is a special case of:
45211// (select M, (sub 0, X), X)
45212// Per:
45213// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
45214// We know that, if fNegate is 0 or 1:
45215// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
45216//
45217// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
45218// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
45219// ( M ? -X : X) == ((X ^ M ) + (M & 1))
45220// This lets us transform our vselect to:
45221// (add (xor X, M), (and M, 1))
45222// And further to:
45223// (sub (xor X, M), M)
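// Quick check of the identity (illustrative): with M == all-ones,
// (xor X, M) == ~X and (sub ~X, -1) == ~X + 1 == -X; with M == 0,
// (xor X, 0) == X and (sub X, 0) == X. So (sub (xor X, M), M) matches
// (select M, (sub 0, X), X) for both mask values.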
45224 static SDValue combineLogicBlendIntoConditionalNegate(
45225 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
45226 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
45227 EVT MaskVT = Mask.getValueType();
45228 assert(MaskVT.isInteger() &&
45229 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
45230 "Mask must be zero/all-bits");
45231
45232 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
45233 return SDValue();
45234 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
45235 return SDValue();
45236
45237 auto IsNegV = [](SDNode *N, SDValue V) {
45238 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
45239 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
45240 };
45241
45242 SDValue V;
45243 if (IsNegV(Y.getNode(), X))
45244 V = X;
45245 else if (IsNegV(X.getNode(), Y))
45246 V = Y;
45247 else
45248 return SDValue();
45249
45250 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
45251 SDValue SubOp2 = Mask;
45252
45253 // If the negate was on the false side of the select, then
45254 // the operands of the SUB need to be swapped. PR 27251.
45255 // This is because the pattern being matched above is
45256 // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
45257 // but if the pattern matched was
45258 // (vselect M, X, (sub 0, X)), that is really negation of the pattern
45259 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
45260 // pattern also needs to be a negation of the replacement pattern above.
45261 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
45262 // sub accomplishes the negation of the replacement pattern.
45263 if (V == Y)
45264 std::swap(SubOp1, SubOp2);
45265
45266 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
45267 return DAG.getBitcast(VT, Res);
45268}
45269
45270 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG,
45271 const X86Subtarget &Subtarget) {
45272 if (!Subtarget.hasAVX512())
45273 return SDValue();
45274 if (N->getOpcode() != ISD::VSELECT)
45275 return SDValue();
45276
45277 SDLoc DL(N);
45278 SDValue Cond = N->getOperand(0);
45279 SDValue LHS = N->getOperand(1);
45280 SDValue RHS = N->getOperand(2);
45281
45282 if (canCombineAsMaskOperation(LHS, Subtarget))
45283 return SDValue();
45284
45285 if (!canCombineAsMaskOperation(RHS, Subtarget))
45286 return SDValue();
45287
45288 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
45289 return SDValue();
45290
45291 // Commute LHS and RHS to create opportunity to select mask instruction.
45292 // (vselect M, L, R) -> (vselect ~M, R, L)
45293 ISD::CondCode NewCC =
45294 ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
45295 Cond.getOperand(0).getValueType());
45296 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0),
45297 Cond.getOperand(1), NewCC);
45298 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
45299}
45300
45301/// Do target-specific dag combines on SELECT and VSELECT nodes.
45302 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
45303 TargetLowering::DAGCombinerInfo &DCI,
45304 const X86Subtarget &Subtarget) {
45305 SDLoc DL(N);
45306 SDValue Cond = N->getOperand(0);
45307 SDValue LHS = N->getOperand(1);
45308 SDValue RHS = N->getOperand(2);
45309
45310 // Try simplification again because we use this function to optimize
45311 // BLENDV nodes that are not handled by the generic combiner.
45312 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
45313 return V;
45314
45315 // When avx512 is available the lhs operand of select instruction can be
45316 // folded with mask instruction, while the rhs operand can't. Commute the
45317 // lhs and rhs of the select instruction to create the opportunity of
45318 // folding.
45319 if (SDValue V = commuteSelect(N, DAG, Subtarget))
45320 return V;
45321
45322 EVT VT = LHS.getValueType();
45323 EVT CondVT = Cond.getValueType();
45324 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45325 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
45326
45327 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
45328 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
45329 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
45330 if (CondVT.isVector() && CondVT.isInteger() &&
45331 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
45332 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
45333 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
45334 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
45335 DL, DAG, Subtarget))
45336 return V;
45337
45338 // Convert vselects with constant condition into shuffles.
45339 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
45340 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
45341 SmallVector<int, 64> Mask;
45342 if (createShuffleMaskFromVSELECT(Mask, Cond,
45343 N->getOpcode() == X86ISD::BLENDV))
45344 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
45345 }
45346
45347 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
45348 // by forcing the unselected elements to zero.
45349 // TODO: Can we handle more shuffles with this?
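// (Illustrative: for each byte lane exactly one of the two rewritten PSHUFB
// masks keeps its original index while the other is forced to 0x80, which
// zeroes that lane, so OR'ing the two results reproduces the blend.)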
45350 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
45351 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
45352 LHS.hasOneUse() && RHS.hasOneUse()) {
45353 MVT SimpleVT = VT.getSimpleVT();
45354 SmallVector<SDValue, 1> LHSOps, RHSOps;
45355 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
45356 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
45357 getTargetShuffleMask(LHS, true, LHSOps, LHSMask) &&
45358 getTargetShuffleMask(RHS, true, RHSOps, RHSMask)) {
45359 int NumElts = VT.getVectorNumElements();
45360 for (int i = 0; i != NumElts; ++i) {
45361 // getConstVector sets negative shuffle mask values as undef, so ensure
45362 // we hardcode SM_SentinelZero values to zero (0x80).
45363 if (CondMask[i] < NumElts) {
45364 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
45365 RHSMask[i] = 0x80;
45366 } else {
45367 LHSMask[i] = 0x80;
45368 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
45369 }
45370 }
45371 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
45372 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
45373 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
45374 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
45375 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
45376 }
45377 }
45378
45379 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
45380 // instructions match the semantics of the common C idiom x<y?x:y but not
45381 // x<=y?x:y, because of how they handle negative zero (which can be
45382 // ignored in unsafe-math mode).
45383 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
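// (Illustrative: MINPS(a, b) computes "a < b ? a : b" and therefore returns
// its second operand when the operands compare equal, so "x <= y ? x : y"
// with x == -0.0, y == +0.0 would yield +0.0 instead of the required -0.0;
// the signed-zero/NaN checks below guard against such cases.)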
45384 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
45385 VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) &&
45386 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
45387 (Subtarget.hasSSE2() ||
45388 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
45389 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45390
45391 unsigned Opcode = 0;
45392 // Check for x CC y ? x : y.
45393 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
45394 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
45395 switch (CC) {
45396 default: break;
45397 case ISD::SETULT:
45398 // Converting this to a min would handle NaNs incorrectly, and swapping
45399 // the operands would cause it to handle comparisons between positive
45400 // and negative zero incorrectly.
45401 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45402 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45403 !(DAG.isKnownNeverZeroFloat(LHS) ||
45404 DAG.isKnownNeverZeroFloat(RHS)))
45405 break;
45406 std::swap(LHS, RHS);
45407 }
45408 Opcode = X86ISD::FMIN;
45409 break;
45410 case ISD::SETOLE:
45411 // Converting this to a min would handle comparisons between positive
45412 // and negative zero incorrectly.
45413 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45414 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45415 break;
45416 Opcode = X86ISD::FMIN;
45417 break;
45418 case ISD::SETULE:
45419 // Converting this to a min would handle both negative zeros and NaNs
45420 // incorrectly, but we can swap the operands to fix both.
45421 std::swap(LHS, RHS);
45422 [[fallthrough]];
45423 case ISD::SETOLT:
45424 case ISD::SETLT:
45425 case ISD::SETLE:
45426 Opcode = X86ISD::FMIN;
45427 break;
45428
45429 case ISD::SETOGE:
45430 // Converting this to a max would handle comparisons between positive
45431 // and negative zero incorrectly.
45432 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45433 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45434 break;
45435 Opcode = X86ISD::FMAX;
45436 break;
45437 case ISD::SETUGT:
45438 // Converting this to a max would handle NaNs incorrectly, and swapping
45439 // the operands would cause it to handle comparisons between positive
45440 // and negative zero incorrectly.
45441 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45442 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45443 !(DAG.isKnownNeverZeroFloat(LHS) ||
45444 DAG.isKnownNeverZeroFloat(RHS)))
45445 break;
45446 std::swap(LHS, RHS);
45447 }
45448 Opcode = X86ISD::FMAX;
45449 break;
45450 case ISD::SETUGE:
45451 // Converting this to a max would handle both negative zeros and NaNs
45452 // incorrectly, but we can swap the operands to fix both.
45453 std::swap(LHS, RHS);
45454 [[fallthrough]];
45455 case ISD::SETOGT:
45456 case ISD::SETGT:
45457 case ISD::SETGE:
45458 Opcode = X86ISD::FMAX;
45459 break;
45460 }
45461 // Check for x CC y ? y : x -- a min/max with reversed arms.
45462 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
45463 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
45464 switch (CC) {
45465 default: break;
45466 case ISD::SETOGE:
45467 // Converting this to a min would handle comparisons between positive
45468 // and negative zero incorrectly, and swapping the operands would
45469 // cause it to handle NaNs incorrectly.
45470 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45471 !(DAG.isKnownNeverZeroFloat(LHS) ||
45472 DAG.isKnownNeverZeroFloat(RHS))) {
45473 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45474 break;
45475 std::swap(LHS, RHS);
45476 }
45477 Opcode = X86ISD::FMIN;
45478 break;
45479 case ISD::SETUGT:
45480 // Converting this to a min would handle NaNs incorrectly.
45481 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45482 break;
45483 Opcode = X86ISD::FMIN;
45484 break;
45485 case ISD::SETUGE:
45486 // Converting this to a min would handle both negative zeros and NaNs
45487 // incorrectly, but we can swap the operands to fix both.
45488 std::swap(LHS, RHS);
45489 [[fallthrough]];
45490 case ISD::SETOGT:
45491 case ISD::SETGT:
45492 case ISD::SETGE:
45493 Opcode = X86ISD::FMIN;
45494 break;
45495
45496 case ISD::SETULT:
45497 // Converting this to a max would handle NaNs incorrectly.
45498 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45499 break;
45500 Opcode = X86ISD::FMAX;
45501 break;
45502 case ISD::SETOLE:
45503 // Converting this to a max would handle comparisons between positive
45504 // and negative zero incorrectly, and swapping the operands would
45505 // cause it to handle NaNs incorrectly.
45506 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45507 !DAG.isKnownNeverZeroFloat(LHS) &&
45508 !DAG.isKnownNeverZeroFloat(RHS)) {
45509 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45510 break;
45511 std::swap(LHS, RHS);
45512 }
45513 Opcode = X86ISD::FMAX;
45514 break;
45515 case ISD::SETULE:
45516 // Converting this to a max would handle both negative zeros and NaNs
45517 // incorrectly, but we can swap the operands to fix both.
45518 std::swap(LHS, RHS);
45519 [[fallthrough]];
45520 case ISD::SETOLT:
45521 case ISD::SETLT:
45522 case ISD::SETLE:
45523 Opcode = X86ISD::FMAX;
45524 break;
45525 }
45526 }
45527
45528 if (Opcode)
45529 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
45530 }
45531
45532 // Some mask scalar intrinsics rely on checking if only one bit is set
45533 // and implement it in C code like this:
45534 // A[0] = (U & 1) ? A[0] : W[0];
45535 // This creates some redundant instructions that break pattern matching.
45536 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
45537 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
45538 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
45539 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45540 SDValue AndNode = Cond.getOperand(0);
45541 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
45542 isNullConstant(Cond.getOperand(1)) &&
45543 isOneConstant(AndNode.getOperand(1))) {
45544 // LHS and RHS swapped due to
45545 // setcc outputting 1 when AND resulted in 0 and vice versa.
45546 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
45547 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
45548 }
45549 }
45550
45551 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
45552 // lowering on KNL. In this case we convert it to
45553 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX blend instruction.
45554 // The same applies to all vectors of i8 and i16 elements without BWI.
45555 // Make sure we extend these even before type legalization gets a chance to
45556 // split wide vectors.
45557 // Since SKX these selects have a proper lowering.
45558 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
45559 CondVT.getVectorElementType() == MVT::i1 &&
45560 (VT.getVectorElementType() == MVT::i8 ||
45561 VT.getVectorElementType() == MVT::i16)) {
45562 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
45563 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
45564 }
45565
45566 // AVX512 - Extend select with zero to merge with target shuffle.
45567 // select(mask, extract_subvector(shuffle(x)), zero) -->
45568 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
45569 // TODO - support non target shuffles as well.
45570 if (Subtarget.hasAVX512() && CondVT.isVector() &&
45571 CondVT.getVectorElementType() == MVT::i1) {
45572 auto SelectableOp = [&TLI](SDValue Op) {
45573 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45574 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
45575 isNullConstant(Op.getOperand(1)) &&
45576 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
45577 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
45578 };
45579
45580 bool SelectableLHS = SelectableOp(LHS);
45581 bool SelectableRHS = SelectableOp(RHS);
45582 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
45583 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
45584
45585 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
45586 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
45587 : RHS.getOperand(0).getValueType();
45588 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
45589 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
45590 VT.getSizeInBits());
45591 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
45592 VT.getSizeInBits());
45593 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
45594 DAG.getUNDEF(SrcCondVT), Cond,
45595 DAG.getIntPtrConstant(0, DL));
45596 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
45597 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
45598 }
45599 }
45600
45601 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
45602 return V;
45603
45604 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
45605 Cond.hasOneUse()) {
45606 EVT CondVT = Cond.getValueType();
45607 SDValue Cond0 = Cond.getOperand(0);
45608 SDValue Cond1 = Cond.getOperand(1);
45609 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45610
45611 // Canonicalize min/max:
45612 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
45613 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
45614 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
45615 // the need for an extra compare against zero. e.g.
45616 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
45617 // subl %esi, %edi
45618 // testl %edi, %edi
45619 // movl $0, %eax
45620 // cmovgl %edi, %eax
45621 // =>
45622 // xorl %eax, %eax
45623 // subl %esi, %edi
45624 // cmovsl %eax, %edi
45625 //
45626 // We can also canonicalize
45627 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
45628 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
45629 // This allows the use of a test instruction for the compare.
45630 if (LHS == Cond0 && RHS == Cond1) {
45631 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
45632 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
45633 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
45634 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
45635 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
45636 }
45637 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
45638 ISD::CondCode NewCC = ISD::SETUGE;
45639 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
45640 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
45641 }
45642 }
45643
45644 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
45645 // fold eq + gt/lt nested selects into ge/le selects
45646 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
45647 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
45648 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
45649 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
45650 // .. etc ..
45651 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
45652 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
45653 SDValue InnerSetCC = RHS.getOperand(0);
45654 ISD::CondCode InnerCC =
45655 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
45656 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
45657 Cond0 == InnerSetCC.getOperand(0) &&
45658 Cond1 == InnerSetCC.getOperand(1)) {
45659 ISD::CondCode NewCC;
45660 switch (CC == ISD::SETEQ ? InnerCC : CC) {
45661 // clang-format off
45662 case ISD::SETGT: NewCC = ISD::SETGE; break;
45663 case ISD::SETLT: NewCC = ISD::SETLE; break;
45664 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
45665 case ISD::SETULT: NewCC = ISD::SETULE; break;
45666 default: NewCC = ISD::SETCC_INVALID; break;
45667 // clang-format on
45668 }
45669 if (NewCC != ISD::SETCC_INVALID) {
45670 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
45671 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
45672 }
45673 }
45674 }
45675 }
45676
45677 // Check if the first operand is all zeros and Cond type is vXi1.
45678 // If this is an AVX512 target we can improve the use of zero masking by
45679 // swapping the operands and inverting the condition.
45680 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
45681 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
45682 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
45683 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
45684 // Invert the cond to not(cond) : xor(op,allones)=not(op)
45685 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
45686 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
45687 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
45688 }
45689
45690 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
45691 // get split by legalization.
45692 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
45693 CondVT.getVectorElementType() == MVT::i1 &&
45694 TLI.isTypeLegal(VT.getScalarType())) {
45695 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
45696 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
45697 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
45698 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
45699 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
45700 }
45701 }
45702
45703 // Early exit check
45704 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
45705 return SDValue();
45706
45707 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
45708 return V;
45709
45710 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
45711 return V;
45712
45713 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
45714 return V;
45715
45716 // select(~Cond, X, Y) -> select(Cond, Y, X)
45717 if (CondVT.getScalarType() != MVT::i1) {
45718 if (SDValue CondNot = IsNOT(Cond, DAG))
45719 return DAG.getNode(N->getOpcode(), DL, VT,
45720 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
45721
45722 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
45723 // signbit.
45724 if (Cond.getOpcode() == X86ISD::PCMPGT &&
45725 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
45726 Cond.hasOneUse()) {
45727 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
45728 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
45729 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
45730 }
45731 }
45732
45733 // Try to optimize vXi1 selects if both operands are either all constants or
45734 // bitcasts from scalar integer type. In that case we can convert the operands
45735 // to integer and use an integer select which will be converted to a CMOV.
45736 // We need to take a little bit of care to avoid creating an i64 type after
45737 // type legalization.
45738 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
45739 VT.getVectorElementType() == MVT::i1 &&
45740 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
45741 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
45742 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
45743 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
45744 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
45745
45746 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
45747 LHS.getOperand(0).getValueType() == IntVT)) &&
45748 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
45749 RHS.getOperand(0).getValueType() == IntVT))) {
45750 if (LHSIsConst)
45751 LHS = combinevXi1ConstantToInteger(LHS, DAG);
45752 else
45753 LHS = LHS.getOperand(0);
45754
45755 if (RHSIsConst)
45756 RHS = combinevXi1ConstantToInteger(RHS, DAG);
45757 else
45758 RHS = RHS.getOperand(0);
45759
45760 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
45761 return DAG.getBitcast(VT, Select);
45762 }
45763 }
45764 }
45765
45766 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
45767 // single bits, then invert the predicate and swap the select operands.
45768 // This can lower using a vector shift bit-hack rather than mask and compare.
45769 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
45770 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
45771 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
45772 Cond.getOperand(0).getOpcode() == ISD::AND &&
45773 isNullOrNullSplat(Cond.getOperand(1)) &&
45774 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
45775 Cond.getOperand(0).getValueType() == VT) {
45776 // The 'and' mask must be composed of power-of-2 constants.
45777 SDValue And = Cond.getOperand(0);
45778 auto *C = isConstOrConstSplat(And.getOperand(1));
45779 if (C && C->getAPIntValue().isPowerOf2()) {
45780 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
45781 SDValue NotCond =
45782 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
45783 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
45784 }
45785
45786 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
45787 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
45788 // 16-bit lacks a proper blendv.
45789 unsigned EltBitWidth = VT.getScalarSizeInBits();
45790 bool CanShiftBlend =
45791 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
45792 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
45793 (Subtarget.hasXOP()));
45794 if (CanShiftBlend &&
45795 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
45796 return C->getAPIntValue().isPowerOf2();
45797 })) {
45798 // Create a left-shift constant to get the mask bits over to the sign-bit.
45799 SDValue Mask = And.getOperand(1);
45800 SmallVector<int, 32> ShlVals;
45801 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
45802 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
45803 ShlVals.push_back(EltBitWidth - 1 -
45804 MaskVal->getAPIntValue().exactLogBase2());
45805 }
45806 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
45807 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
45808 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
45809 SDValue NewCond =
45810 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
45811 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
45812 }
45813 }
45814
45815 return SDValue();
45816}
45817
45818/// Combine:
45819/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
45820/// to:
45821/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
45822/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
45823/// Note that this is only legal for some op/cc combinations.
45824 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
45825 SelectionDAG &DAG,
45826 const X86Subtarget &Subtarget) {
45827 // This combine only operates on CMP-like nodes.
45828 if (!(Cmp.getOpcode() == X86ISD::CMP ||
45829 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
45830 return SDValue();
45831
45832 // Can't replace the cmp if it has more uses than the one we're looking at.
45833 // FIXME: We would like to be able to handle this, but would need to make sure
45834 // all uses were updated.
45835 if (!Cmp.hasOneUse())
45836 return SDValue();
45837
45838 // This only applies to variations of the common case:
45839 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
45840 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
45841 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
45842 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
45843 // Using the proper condcodes (see below), overflow is checked for.
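// Illustrative example: "if (atomic_fetch_add(&x, 1) < 0)" tests the *old*
// value (COND_S on the CMP), but the EFLAGS of the LOCK ADD describe the
// *new* value, and "old < 0" corresponds to the overflow-aware COND_LE on
// new == old + 1, so the branch can consume the LOCK ADD's flags directly
// and the separate CMP disappears.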
45844
45845 // FIXME: We can generalize both constraints:
45846 // - XOR/OR/AND (if they were made to survive AtomicExpand)
45847 // - LHS != 1
45848 // if the result is compared.
45849
45850 SDValue CmpLHS = Cmp.getOperand(0);
45851 SDValue CmpRHS = Cmp.getOperand(1);
45852 EVT CmpVT = CmpLHS.getValueType();
45853
45854 if (!CmpLHS.hasOneUse())
45855 return SDValue();
45856
45857 unsigned Opc = CmpLHS.getOpcode();
45858 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
45859 return SDValue();
45860
45861 SDValue OpRHS = CmpLHS.getOperand(2);
45862 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
45863 if (!OpRHSC)
45864 return SDValue();
45865
45866 APInt Addend = OpRHSC->getAPIntValue();
45867 if (Opc == ISD::ATOMIC_LOAD_SUB)
45868 Addend = -Addend;
45869
45870 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
45871 if (!CmpRHSC)
45872 return SDValue();
45873
45874 APInt Comparison = CmpRHSC->getAPIntValue();
45875 APInt NegAddend = -Addend;
45876
45877 // See if we can adjust the CC to make the comparison match the negated
45878 // addend.
45879 if (Comparison != NegAddend) {
45880 APInt IncComparison = Comparison + 1;
45881 if (IncComparison == NegAddend) {
45882 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
45883 Comparison = IncComparison;
45884 CC = X86::COND_AE;
45885 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
45886 Comparison = IncComparison;
45887 CC = X86::COND_L;
45888 }
45889 }
45890 APInt DecComparison = Comparison - 1;
45891 if (DecComparison == NegAddend) {
45892 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
45893 Comparison = DecComparison;
45894 CC = X86::COND_A;
45895 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
45896 Comparison = DecComparison;
45897 CC = X86::COND_LE;
45898 }
45899 }
45900 }
45901
45902 // If the addend is the negation of the comparison value, then we can do
45903 // a full comparison by emitting the atomic arithmetic as a locked sub.
45904 if (Comparison == NegAddend) {
45905 // The CC is fine, but we need to rewrite the LHS of the comparison as an
45906 // atomic sub.
45907 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
45908 auto AtomicSub = DAG.getAtomic(
45909 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
45910 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
45911 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
45912 AN->getMemOperand());
45913 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
45914 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
45915 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
45916 return LockOp;
45917 }
45918
45919 // We can handle comparisons with zero in a number of cases by manipulating
45920 // the CC used.
45921 if (!Comparison.isZero())
45922 return SDValue();
45923
45924 if (CC == X86::COND_S && Addend == 1)
45925 CC = X86::COND_LE;
45926 else if (CC == X86::COND_NS && Addend == 1)
45927 CC = X86::COND_G;
45928 else if (CC == X86::COND_G && Addend == -1)
45929 CC = X86::COND_GE;
45930 else if (CC == X86::COND_LE && Addend == -1)
45931 CC = X86::COND_L;
45932 else
45933 return SDValue();
45934
45935 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
45936 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
45937 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
45938 return LockOp;
45939}
45940
45941// Check whether a boolean test is testing a boolean value generated by
45942// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
45943// code.
45944//
45945// Simplify the following patterns:
45946// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
45947// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
45948// to (Op EFLAGS Cond)
45949//
45950// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
45951// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
45952// to (Op EFLAGS !Cond)
45953//
45954// where Op could be BRCOND or CMOV.
45955//
45956 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
45957 // This combine only operates on CMP-like nodes.
45958 if (!(Cmp.getOpcode() == X86ISD::CMP ||
45959 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
45960 return SDValue();
45961
45962 // Quit if not used as a boolean value.
45963 if (CC != X86::COND_E && CC != X86::COND_NE)
45964 return SDValue();
45965
45966 // Check CMP operands. One of them should be 0 or 1 and the other should be
45967 // a SETCC node or a value extended from it.
45968 SDValue Op1 = Cmp.getOperand(0);
45969 SDValue Op2 = Cmp.getOperand(1);
45970
45971 SDValue SetCC;
45972 const ConstantSDNode* C = nullptr;
45973 bool needOppositeCond = (CC == X86::COND_E);
45974 bool checkAgainstTrue = false; // Is it a comparison against 1?
45975
45976 if ((C = dyn_cast<ConstantSDNode>(Op1)))
45977 SetCC = Op2;
45978 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
45979 SetCC = Op1;
45980 else // Quit if neither operand is a constant.
45981 return SDValue();
45982
45983 if (C->getZExtValue() == 1) {
45984 needOppositeCond = !needOppositeCond;
45985 checkAgainstTrue = true;
45986 } else if (C->getZExtValue() != 0)
45987 // Quit if the constant is neither 0 nor 1.
45988 return SDValue();
45989
45990 bool truncatedToBoolWithAnd = false;
45991 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
45992 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
45993 SetCC.getOpcode() == ISD::TRUNCATE ||
45994 SetCC.getOpcode() == ISD::AND) {
45995 if (SetCC.getOpcode() == ISD::AND) {
45996 int OpIdx = -1;
45997 if (isOneConstant(SetCC.getOperand(0)))
45998 OpIdx = 1;
45999 if (isOneConstant(SetCC.getOperand(1)))
46000 OpIdx = 0;
46001 if (OpIdx < 0)
46002 break;
46003 SetCC = SetCC.getOperand(OpIdx);
46004 truncatedToBoolWithAnd = true;
46005 } else
46006 SetCC = SetCC.getOperand(0);
46007 }
46008
46009 switch (SetCC.getOpcode()) {
46010 case X86ISD::SETCC_CARRY:
46011 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
46012 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
46013 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
46014 // truncated to i1 using 'and'.
46015 if (checkAgainstTrue && !truncatedToBoolWithAnd)
46016 break;
46017 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
46018 "Invalid use of SETCC_CARRY!");
46019 [[fallthrough]];
46020 case X86ISD::SETCC:
46021 // Set the condition code or opposite one if necessary.
46022 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
46023 if (needOppositeCond)
46024 CC = X86::GetOppositeBranchCondition(CC);
46025 return SetCC.getOperand(1);
46026 case X86ISD::CMOV: {
46027 // Check whether false/true value has canonical one, i.e. 0 or 1.
46028 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
46029 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
46030 // Quit if true value is not a constant.
46031 if (!TVal)
46032 return SDValue();
46033 // Quit if false value is not a constant.
46034 if (!FVal) {
46035 SDValue Op = SetCC.getOperand(0);
46036 // Skip 'zext' or 'trunc' node.
46037 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
46038 Op.getOpcode() == ISD::TRUNCATE)
46039 Op = Op.getOperand(0);
46040 // A special case for rdrand/rdseed, where 0 is set if false cond is
46041 // found.
46042 if ((Op.getOpcode() != X86ISD::RDRAND &&
46043 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
46044 return SDValue();
46045 }
46046 // Quit if false value is not the constant 0 or 1.
46047 bool FValIsFalse = true;
46048 if (FVal && FVal->getZExtValue() != 0) {
46049 if (FVal->getZExtValue() != 1)
46050 return SDValue();
46051 // If FVal is 1, opposite cond is needed.
46052 needOppositeCond = !needOppositeCond;
46053 FValIsFalse = false;
46054 }
46055 // Quit if TVal is not the constant opposite of FVal.
46056 if (FValIsFalse && TVal->getZExtValue() != 1)
46057 return SDValue();
46058 if (!FValIsFalse && TVal->getZExtValue() != 0)
46059 return SDValue();
46060 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
46061 if (needOppositeCond)
46062 CC = X86::GetOppositeBranchCondition(CC);
46063 return SetCC.getOperand(3);
46064 }
46065 }
46066
46067 return SDValue();
46068}
46069
46070/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
46071/// Match:
46072/// (X86or (X86setcc) (X86setcc))
46073/// (X86cmp (and (X86setcc) (X86setcc)), 0)
46074 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
46075 X86::CondCode &CC1, SDValue &Flags,
46076 bool &isAnd) {
46077 if (Cond->getOpcode() == X86ISD::CMP) {
46078 if (!isNullConstant(Cond->getOperand(1)))
46079 return false;
46080
46081 Cond = Cond->getOperand(0);
46082 }
46083
46084 isAnd = false;
46085
46086 SDValue SetCC0, SetCC1;
46087 switch (Cond->getOpcode()) {
46088 default: return false;
46089 case ISD::AND:
46090 case X86ISD::AND:
46091 isAnd = true;
46092 [[fallthrough]];
46093 case ISD::OR:
46094 case X86ISD::OR:
46095 SetCC0 = Cond->getOperand(0);
46096 SetCC1 = Cond->getOperand(1);
46097 break;
46098 };
46099
46100 // Make sure we have SETCC nodes, using the same flags value.
46101 if (SetCC0.getOpcode() != X86ISD::SETCC ||
46102 SetCC1.getOpcode() != X86ISD::SETCC ||
46103 SetCC0->getOperand(1) != SetCC1->getOperand(1))
46104 return false;
46105
46106 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
46107 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
46108 Flags = SetCC0->getOperand(1);
46109 return true;
46110}
46111
46112// When legalizing carry, we create carries via add X, -1
46113// If that comes from an actual carry, via setcc, we use the
46114// carry directly.
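// (Illustrative: if Carry is 0 or 1, then "add Carry, -1" sets CF to 1 exactly
// when Carry == 1, since 1 + 0xFF...F wraps and 0 + 0xFF...F does not, so a
// COND_B user of those flags reads the original carry unchanged.)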
46115 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
46116 if (EFLAGS.getOpcode() == X86ISD::ADD) {
46117 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
46118 bool FoundAndLSB = false;
46119 SDValue Carry = EFLAGS.getOperand(0);
46120 while (Carry.getOpcode() == ISD::TRUNCATE ||
46121 Carry.getOpcode() == ISD::ZERO_EXTEND ||
46122 (Carry.getOpcode() == ISD::AND &&
46123 isOneConstant(Carry.getOperand(1)))) {
46124 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
46125 Carry = Carry.getOperand(0);
46126 }
46127 if (Carry.getOpcode() == X86ISD::SETCC ||
46128 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
46129 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
46130 uint64_t CarryCC = Carry.getConstantOperandVal(0);
46131 SDValue CarryOp1 = Carry.getOperand(1);
46132 if (CarryCC == X86::COND_B)
46133 return CarryOp1;
46134 if (CarryCC == X86::COND_A) {
46135 // Try to convert COND_A into COND_B in an attempt to facilitate
46136 // materializing "setb reg".
46137 //
46138 // Do not flip "e > c", where "c" is a constant, because Cmp
46139 // instruction cannot take an immediate as its first operand.
46140 //
46141 if (CarryOp1.getOpcode() == X86ISD::SUB &&
46142 CarryOp1.getNode()->hasOneUse() &&
46143 CarryOp1.getValueType().isInteger() &&
46144 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
46145 SDValue SubCommute =
46146 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
46147 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
46148 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
46149 }
46150 }
46151 // If this is a check of the z flag of an add with 1, switch to the
46152 // C flag.
46153 if (CarryCC == X86::COND_E &&
46154 CarryOp1.getOpcode() == X86ISD::ADD &&
46155 isOneConstant(CarryOp1.getOperand(1)))
46156 return CarryOp1;
46157 } else if (FoundAndLSB) {
46158 SDLoc DL(Carry);
46159 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
46160 if (Carry.getOpcode() == ISD::SRL) {
46161 BitNo = Carry.getOperand(1);
46162 Carry = Carry.getOperand(0);
46163 }
46164 return getBT(Carry, BitNo, DL, DAG);
46165 }
46166 }
46167 }
46168
46169 return SDValue();
46170}
46171
46172 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
46173/// to avoid the inversion.
46174 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
46175 SelectionDAG &DAG,
46176 const X86Subtarget &Subtarget) {
46177 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
46178 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
46179 EFLAGS.getOpcode() != X86ISD::TESTP)
46180 return SDValue();
46181
46182 // PTEST/TESTP sets EFLAGS as:
46183 // TESTZ: ZF = (Op0 & Op1) == 0
46184 // TESTC: CF = (~Op0 & Op1) == 0
46185 // TESTNZC: ZF == 0 && CF == 0
46186 MVT VT = EFLAGS.getSimpleValueType();
46187 SDValue Op0 = EFLAGS.getOperand(0);
46188 SDValue Op1 = EFLAGS.getOperand(1);
46189 MVT OpVT = Op0.getSimpleValueType();
46190 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46191
46192 // TEST*(~X,Y) == TEST*(X,Y)
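// (with ZF and CF swapping roles: ZF(~X,Y) = ((~X) & Y) == 0 = CF(X,Y), and
// CF(~X,Y) = (X & Y) == 0 = ZF(X,Y), hence the testz <-> testc remapping of
// the condition code below.)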
46193 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
46194 X86::CondCode InvCC;
46195 switch (CC) {
46196 case X86::COND_B:
46197 // testc -> testz.
46198 InvCC = X86::COND_E;
46199 break;
46200 case X86::COND_AE:
46201 // !testc -> !testz.
46202 InvCC = X86::COND_NE;
46203 break;
46204 case X86::COND_E:
46205 // testz -> testc.
46206 InvCC = X86::COND_B;
46207 break;
46208 case X86::COND_NE:
46209 // !testz -> !testc.
46210 InvCC = X86::COND_AE;
46211 break;
46212 case X86::COND_A:
46213 case X86::COND_BE:
46214 // testnzc -> testnzc (no change).
46215 InvCC = CC;
46216 break;
46217 default:
46218 InvCC = X86::COND_INVALID;
46219 break;
46220 }
46221
46222 if (InvCC != X86::COND_INVALID) {
46223 CC = InvCC;
46224 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46225 DAG.getBitcast(OpVT, NotOp0), Op1);
46226 }
46227 }
46228
46229 if (CC == X86::COND_B || CC == X86::COND_AE) {
46230 // TESTC(X,~X) == TESTC(X,-1)
46231 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
46232 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
46233 SDLoc DL(EFLAGS);
46234 return DAG.getNode(
46235 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
46236 DAG.getBitcast(OpVT,
46237 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
46238 }
46239 }
46240 }
46241
46242 if (CC == X86::COND_E || CC == X86::COND_NE) {
46243 // TESTZ(X,~Y) == TESTC(Y,X)
46244 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
46245 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46246 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46247 DAG.getBitcast(OpVT, NotOp1), Op0);
46248 }
46249
46250 if (Op0 == Op1) {
46251 SDValue BC = peekThroughBitcasts(Op0);
46252 EVT BCVT = BC.getValueType();
46253
46254 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
46255 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
46256 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46257 DAG.getBitcast(OpVT, BC.getOperand(0)),
46258 DAG.getBitcast(OpVT, BC.getOperand(1)));
46259 }
46260
46261 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
46262 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
46263 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46264 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46265 DAG.getBitcast(OpVT, BC.getOperand(0)),
46266 DAG.getBitcast(OpVT, BC.getOperand(1)));
46267 }
46268
46269 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
46270 // to more efficiently extract the sign bits and compare that.
46271 // TODO: Handle TESTC with comparison inversion.
46272 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
46273 // TESTP/MOVMSK combines to make sure it's never worse than PTEST?
46274 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
46275 unsigned EltBits = BCVT.getScalarSizeInBits();
46276 if (DAG.ComputeNumSignBits(BC) == EltBits) {
46277 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
46278 APInt SignMask = APInt::getSignMask(EltBits);
46279 if (SDValue Res =
46280 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
46281 // For vXi16 cases we need to use pmovmskb and extract every other
46282 // sign bit.
46283 SDLoc DL(EFLAGS);
46284 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
46285 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
46286 MVT FloatVT =
46287 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
46288 Res = DAG.getBitcast(FloatVT, Res);
46289 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
46290 } else if (EltBits == 16) {
46291 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
46292 Res = DAG.getBitcast(MovmskVT, Res);
46293 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46294 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
46295 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46296 } else {
46297 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46298 }
46299 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
46300 DAG.getConstant(0, DL, MVT::i32));
46301 }
46302 }
46303 }
46304 }
46305
46306 // TESTZ(-1,X) == TESTZ(X,X)
46307 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
46308 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
46309
46310 // TESTZ(X,-1) == TESTZ(X,X)
46311 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
46312 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
46313
46314 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
46315 // TODO: Add COND_NE handling?
46316 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
46317 SDValue Src0 = peekThroughBitcasts(Op0);
46318 SDValue Src1 = peekThroughBitcasts(Op1);
46319 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
46320 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
46321 peekThroughBitcasts(Src0.getOperand(1)), true);
46322 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
46323 peekThroughBitcasts(Src1.getOperand(1)), true);
46324 if (Src0 && Src1) {
46325 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
46326 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46327 DAG.getBitcast(OpVT2, Src0),
46328 DAG.getBitcast(OpVT2, Src1));
46329 }
46330 }
46331 }
46332 }
46333
46334 return SDValue();
46335}
46336
46337// Attempt to simplify the MOVMSK input based on the comparison type.
46338 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
46339 SelectionDAG &DAG,
46340 const X86Subtarget &Subtarget) {
46341 // Handle eq/ne against zero (any_of).
46342 // Handle eq/ne against -1 (all_of).
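// (Illustrative: MOVMSK packs one sign bit per element, so comparing the
// result against 0 asks "is any sign bit set?" while comparing it against the
// all-ones mask for the element count asks "are all sign bits set?".)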
46343 if (!(CC == X86::COND_E || CC == X86::COND_NE))
46344 return SDValue();
46345 if (EFLAGS.getValueType() != MVT::i32)
46346 return SDValue();
46347 unsigned CmpOpcode = EFLAGS.getOpcode();
46348 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
46349 return SDValue();
46350 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
46351 if (!CmpConstant)
46352 return SDValue();
46353 const APInt &CmpVal = CmpConstant->getAPIntValue();
46354
46355 SDValue CmpOp = EFLAGS.getOperand(0);
46356 unsigned CmpBits = CmpOp.getValueSizeInBits();
46357 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
46358
46359 // Peek through any truncate.
46360 if (CmpOp.getOpcode() == ISD::TRUNCATE)
46361 CmpOp = CmpOp.getOperand(0);
46362
46363 // Bail if we don't find a MOVMSK.
46364 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
46365 return SDValue();
46366
46367 SDValue Vec = CmpOp.getOperand(0);
46368 MVT VecVT = Vec.getSimpleValueType();
46369 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
46370 "Unexpected MOVMSK operand");
46371 unsigned NumElts = VecVT.getVectorNumElements();
46372 unsigned NumEltBits = VecVT.getScalarSizeInBits();
46373
46374 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
46375 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
46376 NumElts <= CmpBits && CmpVal.isMask(NumElts);
46377 if (!IsAnyOf && !IsAllOf)
46378 return SDValue();
46379
46380 // TODO: Check more combining cases.
46381 // Here we check the number of uses of the CMP to decide whether to combine.
46382 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))"
46383 // combines below are restricted by this one-use constraint.
46384 bool IsOneUse = CmpOp.getNode()->hasOneUse();
46385
46386 // See if we can peek through to a vector with a wider element type, if the
46387 // signbits extend down to all the sub-elements as well.
46388 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
46389 // potential SimplifyDemandedBits/Elts cases.
46390 // If we looked through a truncate that discarded bits, we can't do this
46391 // transform.
46392 // FIXME: We could do this transform for truncates that discarded bits by
46393 // inserting an AND mask between the new MOVMSK and the CMP.
46394 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
46395 SDValue BC = peekThroughBitcasts(Vec);
46396 MVT BCVT = BC.getSimpleValueType();
46397 unsigned BCNumElts = BCVT.getVectorNumElements();
46398 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
46399 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
46400 BCNumEltBits > NumEltBits &&
46401 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
46402 SDLoc DL(EFLAGS);
46403 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
46404 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46405 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
46406 DAG.getConstant(CmpMask, DL, MVT::i32));
46407 }
46408 }
46409
46410 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
46411 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
46412 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
46413 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
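// (These hold because the sign bit of each lane of OR(X,Y) is the OR of the
// corresponding sign bits, and likewise for AND, so the narrower MOVMSK sees
// the same any_of / all_of answer. Illustrative note.)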
46414 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
46415 SmallVector<SDValue> Ops;
46416 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
46417 Ops.size() == 2) {
46418 SDLoc DL(EFLAGS);
46419 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
46420 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
46421 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
46422 DAG.getBitcast(SubVT, Ops[0]),
46423 DAG.getBitcast(SubVT, Ops[1]));
46424 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
46425 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46426 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
46427 DAG.getConstant(CmpMask, DL, MVT::i32));
46428 }
46429 }
46430
46431 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
46432 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
46433 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
46434 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
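// (Illustrative: every lane of PCMPEQ(X,Y) is all-ones exactly when X == Y,
// i.e. when XOR(X,Y) is zero, which is precisely what PTEST's ZF reports for
// PTESTZ(XOR(X,Y),XOR(X,Y)).)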
46435 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
46436 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
46437 SDValue BC = peekThroughBitcasts(Vec);
46438 // Ensure MOVMSK was testing every signbit of BC.
46439 if (BC.getValueType().getVectorNumElements() <= NumElts) {
46440 if (BC.getOpcode() == X86ISD::PCMPEQ) {
46441 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
46442 BC.getOperand(0), BC.getOperand(1));
46443 V = DAG.getBitcast(TestVT, V);
46444 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46445 }
46446 // Check for 256-bit split vector cases.
46447 if (BC.getOpcode() == ISD::AND &&
46448 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
46449 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
46450 SDValue LHS = BC.getOperand(0);
46451 SDValue RHS = BC.getOperand(1);
46452 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
46453 LHS.getOperand(0), LHS.getOperand(1));
46454 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
46455 RHS.getOperand(0), RHS.getOperand(1));
46456 LHS = DAG.getBitcast(TestVT, LHS);
46457 RHS = DAG.getBitcast(TestVT, RHS);
46458 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
46459 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46460 }
46461 }
46462 }
46463
46464 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
46465 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
46466 // sign bits prior to the comparison with zero unless we know that
46467 // the vXi16 splats the sign bit down to the lower i8 half.
46468 // TODO: Handle all_of patterns.
46469 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
46470 SDValue VecOp0 = Vec.getOperand(0);
46471 SDValue VecOp1 = Vec.getOperand(1);
46472 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
46473 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
46474 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
46475 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
46476 SDLoc DL(EFLAGS);
46477 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
46478 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46479 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
46480 if (!SignExt0) {
46481 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
46482 DAG.getConstant(0xAAAA, DL, MVT::i16));
46483 }
46484 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46485 DAG.getConstant(0, DL, MVT::i16));
46486 }
46487 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
46488 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
46489 if (CmpBits >= 16 && Subtarget.hasInt256() &&
46490 (IsAnyOf || (SignExt0 && SignExt1))) {
46491 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
46492 SDLoc DL(EFLAGS);
46493 SDValue Result = peekThroughBitcasts(Src);
46494 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
46495 Result.getValueType().getVectorNumElements() <= NumElts) {
46496 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
46497 Result.getOperand(0), Result.getOperand(1));
46498 V = DAG.getBitcast(MVT::v4i64, V);
46499 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46500 }
46501 Result = DAG.getBitcast(MVT::v32i8, Result);
46502 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46503 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
46504 if (!SignExt0 || !SignExt1) {
46505 assert(IsAnyOf &&
46506 "Only perform v16i16 signmasks for any_of patterns");
46507 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
46508 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46509 }
46510 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46511 DAG.getConstant(CmpMask, DL, MVT::i32));
46512 }
46513 }
46514 }
46515
46516 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
46517 // Since we peek through a bitcast, we need to be careful if the base vector
46518 // type has smaller elements than the MOVMSK type. In that case, even if
46519 // all the elements are demanded by the shuffle mask, only the "high"
46520 // elements which have highbits that align with highbits in the MOVMSK vec
46521 // elements are actually demanded. A simplification of spurious operations
46522   // on the "low" elements takes place during other simplifications.
46523 //
46524 // For example:
46525 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))) even though all the elements are
46526   // demanded, the result can change because we are swapping the elements around.
46527 //
46528 // To address this, we check that we can scale the shuffle mask to MOVMSK
46529   // element width (this will ensure the "high" elements match). It's slightly
46530   // over-conservative, but fine for an edge case fold.
46531 SmallVector<int, 32> ShuffleMask, ScaledMaskUnused;
46532 SmallVector<SDValue, 2> ShuffleInputs;
46533 if (NumElts <= CmpBits &&
46534 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
46535 ShuffleMask, DAG) &&
46536 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
46537 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
46538 scaleShuffleElements(ShuffleMask, NumElts, ScaledMaskUnused)) {
46539 SDLoc DL(EFLAGS);
46540 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
46541 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46542 Result =
46543 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
46544 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
46545 }
46546
46547 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
46548 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
46549 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
46550 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
46551 // iff every element is referenced.
46552 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
46553 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
46554 (NumEltBits == 32 || NumEltBits == 64)) {
46555 SDLoc DL(EFLAGS);
46556 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
46557 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
46558 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
46559 SDValue LHS = Vec;
46560 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
46561 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46562 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
46563 DAG.getBitcast(FloatVT, LHS),
46564 DAG.getBitcast(FloatVT, RHS));
46565 }
46566
46567 return SDValue();
46568}
46569
46570/// Optimize an EFLAGS definition used according to the condition code \p CC
46571/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
46572/// uses of chain values.
46573 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
46574                                   SelectionDAG &DAG,
46575 const X86Subtarget &Subtarget) {
46576 if (CC == X86::COND_B)
46577 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
46578 return Flags;
46579
46580 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
46581 return R;
46582
46583 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
46584 return R;
46585
46586 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
46587 return R;
46588
46589 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
46590}
46591
46592/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
46593 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
46594                            TargetLowering::DAGCombinerInfo &DCI,
46595                            const X86Subtarget &Subtarget) {
46596 SDLoc DL(N);
46597
46598 SDValue FalseOp = N->getOperand(0);
46599 SDValue TrueOp = N->getOperand(1);
46600 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
46601 SDValue Cond = N->getOperand(3);
46602
46603 // cmov X, X, ?, ? --> X
46604 if (TrueOp == FalseOp)
46605 return TrueOp;
46606
46607 // Try to simplify the EFLAGS and condition code operands.
46608 // We can't always do this as FCMOV only supports a subset of X86 cond.
46609 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
46610 if (!(FalseOp.getValueType() == MVT::f80 ||
46611 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
46612 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
46613 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
46614 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
46615 Flags};
46616 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46617 }
46618 }
46619
46620 // If this is a select between two integer constants, try to do some
46621 // optimizations. Note that the operands are ordered the opposite of SELECT
46622 // operands.
46623 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
46624 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
46625 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
46626 // larger than FalseC (the false value).
46627 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
46628         CC = X86::GetOppositeBranchCondition(CC);
46629         std::swap(TrueC, FalseC);
46630 std::swap(TrueOp, FalseOp);
46631 }
46632
46633 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
46634 // This is efficient for any integer data type (including i8/i16) and
46635 // shift amount.
46636 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
46637 Cond = getSETCC(CC, Cond, DL, DAG);
46638
46639 // Zero extend the condition if needed.
46640 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
46641
46642 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
46643 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
46644 DAG.getConstant(ShAmt, DL, MVT::i8));
46645 return Cond;
46646 }
46647
46648       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
46649 // for any integer data type, including i8/i16.
46650 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
46651 Cond = getSETCC(CC, Cond, DL, DAG);
46652
46653 // Zero extend the condition if needed.
46654         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
46655                            FalseC->getValueType(0), Cond);
46656 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
46657 SDValue(FalseC, 0));
46658 return Cond;
46659 }
46660
46661 // Optimize cases that will turn into an LEA instruction. This requires
46662 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
46663 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
46664 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
46665 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
46666 "Implicit constant truncation");
46667
46668 bool isFastMultiplier = false;
46669 if (Diff.ult(10)) {
46670 switch (Diff.getZExtValue()) {
46671 default: break;
46672 case 1: // result = add base, cond
46673 case 2: // result = lea base( , cond*2)
46674 case 3: // result = lea base(cond, cond*2)
46675 case 4: // result = lea base( , cond*4)
46676 case 5: // result = lea base(cond, cond*4)
46677 case 8: // result = lea base( , cond*8)
46678 case 9: // result = lea base(cond, cond*8)
46679 isFastMultiplier = true;
46680 break;
46681 }
46682 }
46683
46684 if (isFastMultiplier) {
46685 Cond = getSETCC(CC, Cond, DL ,DAG);
46686 // Zero extend the condition if needed.
46687 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
46688 Cond);
46689 // Scale the condition by the difference.
46690 if (Diff != 1)
46691 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
46692 DAG.getConstant(Diff, DL, Cond.getValueType()));
46693
46694 // Add the base if non-zero.
46695 if (FalseC->getAPIntValue() != 0)
46696 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
46697 SDValue(FalseC, 0));
46698 return Cond;
46699 }
46700 }
46701 }
46702 }
46703
46704 // Handle these cases:
46705 // (select (x != c), e, c) -> select (x != c), e, x),
46706 // (select (x == c), c, e) -> select (x == c), x, e)
46707 // where the c is an integer constant, and the "select" is the combination
46708 // of CMOV and CMP.
46709 //
46710 // The rationale for this change is that the conditional-move from a constant
46711 // needs two instructions, however, conditional-move from a register needs
46712 // only one instruction.
46713 //
46714 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
46715 // some instruction-combining opportunities. This opt needs to be
46716 // postponed as late as possible.
46717 //
46718 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
46719 // the DCI.xxxx conditions are provided to postpone the optimization as
46720 // late as possible.
46721
46722 ConstantSDNode *CmpAgainst = nullptr;
46723 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
46724 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
46725 !isa<ConstantSDNode>(Cond.getOperand(0))) {
46726
46727 if (CC == X86::COND_NE &&
46728 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
46729         CC = X86::COND_E;
46730         std::swap(TrueOp, FalseOp);
46731 }
46732
46733 if (CC == X86::COND_E &&
46734 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
46735 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
46736 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
46737 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46738 }
46739 }
46740 }
46741
46742 // Transform:
46743 //
46744 // (cmov 1 T (uge T 2))
46745 //
46746 // to:
46747 //
46748 // (adc T 0 (sub T 1))
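  // Case check: the cmov yields T when T >= 2 (unsigned) and 1 otherwise, while
  // (sub T 1) sets CF only for T == 0, so ADC computes T + 0 + CF:
  //   T == 0 -> 0 + 1 = 1,  T == 1 -> 1 + 0 = 1,  T >= 2 -> T + 0 = T.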
46749 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
46750 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
46751 SDValue Cond0 = Cond.getOperand(0);
46752 if (Cond0.getOpcode() == ISD::TRUNCATE)
46753 Cond0 = Cond0.getOperand(0);
46754 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
46755 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
46756 EVT CondVT = Cond->getValueType(0);
46757 EVT OuterVT = N->getValueType(0);
46758 // Subtract 1 and generate a carry.
46759 SDValue NewSub =
46760 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
46761 DAG.getConstant(1, DL, CondVT));
46762 SDValue EFLAGS(NewSub.getNode(), 1);
46763 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
46764 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
46765 }
46766 }
46767
46768 // Fold and/or of setcc's to double CMOV:
46769 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
46770 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
46771 //
46772 // This combine lets us generate:
46773 // cmovcc1 (jcc1 if we don't have CMOV)
46774 // cmovcc2 (same)
46775 // instead of:
46776 // setcc1
46777 // setcc2
46778 // and/or
46779 // cmovne (jne if we don't have CMOV)
46780 // When we can't use the CMOV instruction, it might increase branch
46781 // mispredicts.
46782 // When we can use CMOV, or when there is no mispredict, this improves
46783 // throughput and reduces register pressure.
46784 //
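  // e.g. when both conditions read the same CMP(x, y): '(x < y) || (x == y)'
  // with cc1 = COND_B and cc2 = COND_E becomes cmovb followed by cmove on the
  // same flags, rather than setb + sete + or + cmovne.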
46785 if (CC == X86::COND_NE) {
46786 SDValue Flags;
46787 X86::CondCode CC0, CC1;
46788 bool isAndSetCC;
46789 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
46790 if (isAndSetCC) {
46791 std::swap(FalseOp, TrueOp);
46792         CC0 = X86::GetOppositeBranchCondition(CC0);
46793         CC1 = X86::GetOppositeBranchCondition(CC1);
46794       }
46795
46796 SDValue LOps[] = {FalseOp, TrueOp,
46797 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
46798 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
46799 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
46800 Flags};
46801 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46802 return CMOV;
46803 }
46804 }
46805
46806 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
46807 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
46808 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
46809 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
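  // For instance, with C1 = 37 and C2 = 5:
  //   (CMOV 37, (ADD (CTTZ X), 5), (X != 0))
  //     -> (ADD (CMOV 32, (CTTZ X), (X != 0)), 5)
  // which keeps the constant add outside of the conditional move.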
46810 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
46811 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
46812 SDValue Add = TrueOp;
46813 SDValue Const = FalseOp;
46814 // Canonicalize the condition code for easier matching and output.
46815 if (CC == X86::COND_E)
46816 std::swap(Add, Const);
46817
46818 // We might have replaced the constant in the cmov with the LHS of the
46819 // compare. If so change it to the RHS of the compare.
46820 if (Const == Cond.getOperand(0))
46821 Const = Cond.getOperand(1);
46822
46823 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
46824 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
46825 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
46826 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
46827 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
46828 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
46829 EVT VT = N->getValueType(0);
46830 // This should constant fold.
46831 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
46832 SDValue CMov =
46833 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
46834 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
46835 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
46836 }
46837 }
46838
46839 return SDValue();
46840}
46841
46842/// Different mul shrinking modes.
46843 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
46844 
46845 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
46846 EVT VT = N->getOperand(0).getValueType();
46847 if (VT.getScalarSizeInBits() != 32)
46848 return false;
46849
46850 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
46851 unsigned SignBits[2] = {1, 1};
46852 bool IsPositive[2] = {false, false};
46853 for (unsigned i = 0; i < 2; i++) {
46854 SDValue Opd = N->getOperand(i);
46855
46856 SignBits[i] = DAG.ComputeNumSignBits(Opd);
46857 IsPositive[i] = DAG.SignBitIsZero(Opd);
46858 }
46859
46860 bool AllPositive = IsPositive[0] && IsPositive[1];
46861 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
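  // With 32-bit elements, N sign bits means the value is a sign-extension of a
  // (33 - N)-bit value: MinSignBits >= 25 bounds both operands to [-128, 127],
  // and AllPositive with >= 24 sign bits bounds them to [0, 255] (similarly for
  // the 16-bit thresholds below).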
46862 // When ranges are from -128 ~ 127, use MULS8 mode.
46863 if (MinSignBits >= 25)
46864 Mode = ShrinkMode::MULS8;
46865 // When ranges are from 0 ~ 255, use MULU8 mode.
46866 else if (AllPositive && MinSignBits >= 24)
46867 Mode = ShrinkMode::MULU8;
46868 // When ranges are from -32768 ~ 32767, use MULS16 mode.
46869 else if (MinSignBits >= 17)
46870 Mode = ShrinkMode::MULS16;
46871 // When ranges are from 0 ~ 65535, use MULU16 mode.
46872 else if (AllPositive && MinSignBits >= 16)
46873 Mode = ShrinkMode::MULU16;
46874 else
46875 return false;
46876 return true;
46877}
46878
46879/// When the operands of vector mul are extended from smaller size values,
46880 /// like i8 and i16, the type of the mul may be shrunk to generate more
46881/// efficient code. Two typical patterns are handled:
46882/// Pattern1:
46883/// %2 = sext/zext <N x i8> %1 to <N x i32>
46884/// %4 = sext/zext <N x i8> %3 to <N x i32>
46885 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
46886/// %5 = mul <N x i32> %2, %4
46887///
46888/// Pattern2:
46889/// %2 = zext/sext <N x i16> %1 to <N x i32>
46890/// %4 = zext/sext <N x i16> %3 to <N x i32>
46891/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
46892/// %5 = mul <N x i32> %2, %4
46893///
46894/// There are four mul shrinking modes:
46895/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
46896 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
46897/// generate pmullw+sext32 for it (MULS8 mode).
46898/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
46899/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
46900/// generate pmullw+zext32 for it (MULU8 mode).
46901/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
46902/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
46903/// generate pmullw+pmulhw for it (MULS16 mode).
46904/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
46905/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
46906/// generate pmullw+pmulhuw for it (MULU16 mode).
46907 static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
46908                                const X86Subtarget &Subtarget) {
46909 // Check for legality
46910 // pmullw/pmulhw are not supported by SSE.
46911 if (!Subtarget.hasSSE2())
46912 return SDValue();
46913
46914 // Check for profitability
46915 // pmulld is supported since SSE41. It is better to use pmulld
46916 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
46917 // the expansion.
46918 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
46919 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
46920 return SDValue();
46921
46922 ShrinkMode Mode;
46923 if (!canReduceVMulWidth(N, DAG, Mode))
46924 return SDValue();
46925
46926 SDValue N0 = N->getOperand(0);
46927 SDValue N1 = N->getOperand(1);
46928 EVT VT = N->getOperand(0).getValueType();
46929 unsigned NumElts = VT.getVectorNumElements();
46930 if ((NumElts % 2) != 0)
46931 return SDValue();
46932
46933 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
46934
46935 // Shrink the operands of mul.
46936 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
46937 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
46938
46939 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
46940 // lower part is needed.
46941 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
46942 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
46943 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
46944                                                : ISD::SIGN_EXTEND,
46945                        DL, VT, MulLo);
46946
46947 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
46948 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
46949 // the higher part is also needed.
46950 SDValue MulHi =
46951 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
46952 ReducedVT, NewN0, NewN1);
46953
46954   // Repack the lower and higher halves of the multiply into a wider
46955   // result.
46956 // Generate shuffle functioning as punpcklwd.
46957 SmallVector<int, 16> ShuffleMask(NumElts);
46958 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
46959 ShuffleMask[2 * i] = i;
46960 ShuffleMask[2 * i + 1] = i + NumElts;
46961 }
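  // e.g. for NumElts == 8 this mask is {0, 8, 1, 9, 2, 10, 3, 11}, interleaving
  // the low halves of MulLo/MulHi so each i32 lane becomes (lo16 | hi16 << 16).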
46962 SDValue ResLo =
46963 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
46964 ResLo = DAG.getBitcast(ResVT, ResLo);
46965 // Generate shuffle functioning as punpckhwd.
46966 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
46967 ShuffleMask[2 * i] = i + NumElts / 2;
46968 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
46969 }
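  // e.g. for NumElts == 8 this mask is {4, 12, 5, 13, 6, 14, 7, 15}, matching
  // punpckhwd on the upper halves.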
46970 SDValue ResHi =
46971 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
46972 ResHi = DAG.getBitcast(ResVT, ResHi);
46973 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
46974}
46975
46976 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
46977                                  EVT VT, const SDLoc &DL) {
46978
46979 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
46980 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
46981 DAG.getConstant(Mult, DL, VT));
46982 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
46983 DAG.getConstant(Shift, DL, MVT::i8));
46984 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
46985 N->getOperand(0));
46986 return Result;
46987 };
46988
46989 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
46990 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
46991 DAG.getConstant(Mul1, DL, VT));
46992 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
46993 DAG.getConstant(Mul2, DL, VT));
46994 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
46995 N->getOperand(0));
46996 return Result;
46997 };
46998
46999 switch (MulAmt) {
47000 default:
47001 break;
47002 case 11:
47003 // mul x, 11 => add ((shl (mul x, 5), 1), x)
47004 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
47005 case 21:
47006 // mul x, 21 => add ((shl (mul x, 5), 2), x)
47007 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
47008 case 41:
47009 // mul x, 41 => add ((shl (mul x, 5), 3), x)
47010 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
47011 case 22:
47012 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
47013 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47014 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
47015 case 19:
47016 // mul x, 19 => add ((shl (mul x, 9), 1), x)
47017 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
47018 case 37:
47019 // mul x, 37 => add ((shl (mul x, 9), 2), x)
47020 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
47021 case 73:
47022 // mul x, 73 => add ((shl (mul x, 9), 3), x)
47023 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
47024 case 13:
47025 // mul x, 13 => add ((shl (mul x, 3), 2), x)
47026 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
47027 case 23:
47028 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
47029 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
47030 case 26:
47031 // mul x, 26 => add ((mul (mul x, 5), 5), x)
47032 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
47033 case 28:
47034 // mul x, 28 => add ((mul (mul x, 9), 3), x)
47035 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
47036 case 29:
47037 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
47038 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47039 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
47040 }
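  // e.g. for 'mul x, 11' the DAG built above can select to two LEAs, roughly:
  //   lea t, [x + x*4]   ; t = x*5
  //   lea r, [x + t*2]   ; r = x + 10*x = 11*x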
47041
47042   // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
47043   // by a single LEA.
47044   // First check if this is a sum of two powers of 2 because that's easy. Then
47045   // count the trailing zeros to find the smaller power of 2.
47046 // TODO: We can do this even without LEA at a cost of two shifts and an add.
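  // e.g. MulAmt == 20 (16 + 4): ScaleShift = 2 and ShiftAmt = 4, so we emit
  // (x << 4) + (x << 2), where the scaled term can fold into an LEA.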
47047 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
47048 unsigned ScaleShift = llvm::countr_zero(MulAmt);
47049 if (ScaleShift >= 1 && ScaleShift < 4) {
47050 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
47051 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47052 DAG.getConstant(ShiftAmt, DL, MVT::i8));
47053 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47054 DAG.getConstant(ScaleShift, DL, MVT::i8));
47055 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
47056 }
47057 }
47058
47059 return SDValue();
47060}
47061
47062 // If the upper 17 bits of either operand are zero and the upper bits of the
47063 // other operand are all zero/sign bits, then we can use PMADDWD, which is
47064 // always at least as quick as PMULLD, except on KNL.
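// PMADDWD computes, per i32 lane, (lo16(a) * lo16(b)) + (hi16(a) * hi16(b)) on
// signed i16 inputs with an i32 result. If the upper 17 bits of one operand are
// known zero, the hi16 product term vanishes, so the result equals a plain i32
// multiply whenever both operands also sign-fit in 16 bits (checked below).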
47065 static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
47066                                    SelectionDAG &DAG,
47067 const X86Subtarget &Subtarget) {
47068 if (!Subtarget.hasSSE2())
47069 return SDValue();
47070
47071 if (Subtarget.isPMADDWDSlow())
47072 return SDValue();
47073
47074 EVT VT = N->getValueType(0);
47075
47076 // Only support vXi32 vectors.
47077 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
47078 return SDValue();
47079
47080 // Make sure the type is legal or can split/widen to a legal type.
47081 // With AVX512 but without BWI, we would need to split v32i16.
47082 unsigned NumElts = VT.getVectorNumElements();
47083 if (NumElts == 1 || !isPowerOf2_32(NumElts))
47084 return SDValue();
47085
47086 // With AVX512 but without BWI, we would need to split v32i16.
47087 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
47088 return SDValue();
47089
47090 SDValue N0 = N->getOperand(0);
47091 SDValue N1 = N->getOperand(1);
47092
47093   // If we are zero/sign extending two steps without SSE4.1, it's better to
47094 // reduce the vmul width instead.
47095 if (!Subtarget.hasSSE41() &&
47096 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
47097 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47098 (N1.getOpcode() == ISD::ZERO_EXTEND &&
47099 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
47100 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
47101 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47102 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47103 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
47104 return SDValue();
47105
47106   // If we are sign extending a wide vector without SSE4.1, it's better to reduce
47107 // the vmul width instead.
47108 if (!Subtarget.hasSSE41() &&
47109 (N0.getOpcode() == ISD::SIGN_EXTEND &&
47110 N0.getOperand(0).getValueSizeInBits() > 128) &&
47111 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47112 N1.getOperand(0).getValueSizeInBits() > 128))
47113 return SDValue();
47114
47115 // Sign bits must extend down to the lowest i16.
47116 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
47117 DAG.ComputeMaxSignificantBits(N0) > 16)
47118 return SDValue();
47119
47120 // At least one of the elements must be zero in the upper 17 bits, or can be
47121 // safely made zero without altering the final result.
47122 auto GetZeroableOp = [&](SDValue Op) {
47123 APInt Mask17 = APInt::getHighBitsSet(32, 17);
47124 if (DAG.MaskedValueIsZero(Op, Mask17))
47125 return Op;
47126 // Mask off upper 16-bits of sign-extended constants.
47127     if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
47128       return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
47129 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
47130 SDValue Src = Op.getOperand(0);
47131 // Convert sext(vXi16) to zext(vXi16).
47132 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
47133 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
47134 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
47135 // which will expand the extension.
47136 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
47137 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
47138 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
47139 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
47140 }
47141 }
47142     // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
47143 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
47144 N->isOnlyUserOf(Op.getNode())) {
47145 SDValue Src = Op.getOperand(0);
47146 if (Src.getScalarValueSizeInBits() == 16)
47147 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
47148 }
47149 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
47150 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
47151 N->isOnlyUserOf(Op.getNode())) {
47152 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
47153 Op.getOperand(1));
47154 }
47155 return SDValue();
47156 };
47157 SDValue ZeroN0 = GetZeroableOp(N0);
47158 SDValue ZeroN1 = GetZeroableOp(N1);
47159 if (!ZeroN0 && !ZeroN1)
47160 return SDValue();
47161 N0 = ZeroN0 ? ZeroN0 : N0;
47162 N1 = ZeroN1 ? ZeroN1 : N1;
47163
47164 // Use SplitOpsAndApply to handle AVX splitting.
47165 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47166 ArrayRef<SDValue> Ops) {
47167 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
47168 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
47169 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
47170 DAG.getBitcast(OpVT, Ops[0]),
47171 DAG.getBitcast(OpVT, Ops[1]));
47172 };
47173 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
47174}
47175
47176 static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
47177                                   const X86Subtarget &Subtarget) {
47178 if (!Subtarget.hasSSE2())
47179 return SDValue();
47180
47181 EVT VT = N->getValueType(0);
47182
47183 // Only support vXi64 vectors.
47184 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
47185 VT.getVectorNumElements() < 2 ||
47186       !isPowerOf2_32(VT.getVectorNumElements()))
47187     return SDValue();
47188
47189 SDValue N0 = N->getOperand(0);
47190 SDValue N1 = N->getOperand(1);
47191
47192   // PMULDQ returns the 64-bit result of the signed multiplication of the lower
47193   // 32 bits. We can lower with this if the sign bits stretch that far.
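  // e.g. if both v2i64 operands are sign-extensions of i32 values, each exact
  // product lies within (-2^62, 2^62), so the 64-bit PMULDQ result (multiplying
  // the sign-extended low 32 bits) equals the full i64 multiply.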
47194 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
47195 DAG.ComputeNumSignBits(N1) > 32) {
47196 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47197 ArrayRef<SDValue> Ops) {
47198 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
47199 };
47200 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
47201 /*CheckBWI*/ false);
47202 }
47203
47204 // If the upper bits are zero we can use a single pmuludq.
47205 APInt Mask = APInt::getHighBitsSet(64, 32);
47206 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
47207 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47208 ArrayRef<SDValue> Ops) {
47209 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
47210 };
47211 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
47212 /*CheckBWI*/ false);
47213 }
47214
47215 return SDValue();
47216}
47217
47218 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
47219                           TargetLowering::DAGCombinerInfo &DCI,
47220                           const X86Subtarget &Subtarget) {
47221 EVT VT = N->getValueType(0);
47222 SDLoc DL(N);
47223
47224 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
47225 return V;
47226
47227 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
47228 return V;
47229
47230 if (DCI.isBeforeLegalize() && VT.isVector())
47231 return reduceVMULWidth(N, DL, DAG, Subtarget);
47232
47233 // Optimize a single multiply with constant into two operations in order to
47234 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
47235   if (!MulConstantOptimization)
47236     return SDValue();
47237
47238 // An imul is usually smaller than the alternative sequence.
47239   if (DAG.getMachineFunction().getFunction().hasMinSize())
47240     return SDValue();
47241
47242 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
47243 return SDValue();
47244
47245 if (VT != MVT::i64 && VT != MVT::i32 &&
47246 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
47247 return SDValue();
47248
47249   ConstantSDNode *CNode = isConstOrConstSplat(
47250       N->getOperand(1), /*AllowUndefs*/ true, /*AllowTrunc*/ false);
47251 const APInt *C = nullptr;
47252 if (!CNode) {
47253 if (VT.isVector())
47254 if (auto *RawC = getTargetConstantFromNode(N->getOperand(1)))
47255 if (auto *SplatC = RawC->getSplatValue())
47256 C = &(SplatC->getUniqueInteger());
47257
47258 if (!C || C->getBitWidth() != VT.getScalarSizeInBits())
47259 return SDValue();
47260 } else {
47261 C = &(CNode->getAPIntValue());
47262 }
47263
47264 if (isPowerOf2_64(C->getZExtValue()))
47265 return SDValue();
47266
47267 int64_t SignMulAmt = C->getSExtValue();
47268 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
47269 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
47270
47271 SDValue NewMul = SDValue();
47272 if (VT == MVT::i64 || VT == MVT::i32) {
47273 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
47274 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47275 DAG.getConstant(AbsMulAmt, DL, VT));
47276 if (SignMulAmt < 0)
47277 NewMul = DAG.getNegative(NewMul, DL, VT);
47278
47279 return NewMul;
47280 }
47281
47282 uint64_t MulAmt1 = 0;
47283 uint64_t MulAmt2 = 0;
47284 if ((AbsMulAmt % 9) == 0) {
47285 MulAmt1 = 9;
47286 MulAmt2 = AbsMulAmt / 9;
47287 } else if ((AbsMulAmt % 5) == 0) {
47288 MulAmt1 = 5;
47289 MulAmt2 = AbsMulAmt / 5;
47290 } else if ((AbsMulAmt % 3) == 0) {
47291 MulAmt1 = 3;
47292 MulAmt2 = AbsMulAmt / 3;
47293 }
47294
47295 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
47296 if (MulAmt2 &&
47297 (isPowerOf2_64(MulAmt2) ||
47298 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
47299
47300 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
47301 N->use_begin()->getOpcode() == ISD::ADD))
47302         // If the second multiplier is pow2, issue it first. We want the multiply
47303 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
47304 // use is an add. Only do this for positive multiply amounts since the
47305 // negate would prevent it from being used as an address mode anyway.
47306 std::swap(MulAmt1, MulAmt2);
47307
47308 if (isPowerOf2_64(MulAmt1))
47309 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47310 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
47311 else
47312 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47313 DAG.getConstant(MulAmt1, DL, VT));
47314
47315 if (isPowerOf2_64(MulAmt2))
47316 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
47317 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
47318 else
47319 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
47320 DAG.getConstant(MulAmt2, DL, VT));
47321
47322 // Negate the result.
47323 if (SignMulAmt < 0)
47324 NewMul = DAG.getNegative(NewMul, DL, VT);
47325 } else if (!Subtarget.slowLEA())
47326 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
47327 }
47328 if (!NewMul) {
47329 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
47330 assert(C->getZExtValue() != 0 &&
47331 C->getZExtValue() != maxUIntN(VT.getScalarSizeInBits()) &&
47332 "Both cases that could cause potential overflows should have "
47333 "already been handled.");
47334 if (isPowerOf2_64(AbsMulAmt - 1)) {
47335 // (mul x, 2^N + 1) => (add (shl x, N), x)
47336 NewMul = DAG.getNode(
47337 ISD::ADD, DL, VT, N->getOperand(0),
47338 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47339 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
47340 if (SignMulAmt < 0)
47341 NewMul = DAG.getNegative(NewMul, DL, VT);
47342 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
47343 // (mul x, 2^N - 1) => (sub (shl x, N), x)
47344 NewMul =
47345 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47346 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
47347 // To negate, reverse the operands of the subtract.
47348 if (SignMulAmt < 0)
47349 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
47350 else
47351 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
47352 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
47353 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
47354 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
47355 NewMul =
47356 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47357 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
47358 NewMul = DAG.getNode(
47359 ISD::ADD, DL, VT, NewMul,
47360 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47361 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
47362 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
47363 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
47364 NewMul =
47365 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47366 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
47367 NewMul = DAG.getNode(
47368 ISD::SUB, DL, VT, NewMul,
47369 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47370 } else if (SignMulAmt >= 0 && VT.isVector() &&
47371 Subtarget.fastImmVectorShift()) {
47372 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
47373 uint64_t ShiftAmt1;
47374 std::optional<unsigned> Opc;
47375 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
47376 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
47377 Opc = ISD::ADD;
47378 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
47379 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
47380 Opc = ISD::SUB;
47381 }
47382
47383 if (Opc) {
47384 SDValue Shift1 =
47385 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47386 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
47387 SDValue Shift2 =
47388 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47389 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
47390 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
47391 }
47392 }
47393 }
47394
47395 return NewMul;
47396}
47397
47398// Try to form a MULHU or MULHS node by looking for
47399// (srl (mul ext, ext), 16)
47400// TODO: This is X86 specific because we want to be able to handle wide types
47401// before type legalization. But we can only do it if the vector will be
47402// legalized via widening/splitting. Type legalization can't handle promotion
47403// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
47404// combiner.
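// e.g. (srl (mul (zext v8i16 X to v8i32), (zext v8i16 Y to v8i32)), splat 16)
// becomes (zext (mulhu X, Y) to v8i32), which selects to PMULHUW.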
47405 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
47406                                    const X86Subtarget &Subtarget) {
47407 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
47408 "SRL or SRA node is required here!");
47409 SDLoc DL(N);
47410
47411 if (!Subtarget.hasSSE2())
47412 return SDValue();
47413
47414 // The operation feeding into the shift must be a multiply.
47415 SDValue ShiftOperand = N->getOperand(0);
47416 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
47417 return SDValue();
47418
47419 // Input type should be at least vXi32.
47420 EVT VT = N->getValueType(0);
47421 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
47422 return SDValue();
47423
47424 // Need a shift by 16.
47425 APInt ShiftAmt;
47426 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
47427 ShiftAmt != 16)
47428 return SDValue();
47429
47430 SDValue LHS = ShiftOperand.getOperand(0);
47431 SDValue RHS = ShiftOperand.getOperand(1);
47432
47433 unsigned ExtOpc = LHS.getOpcode();
47434 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
47435 RHS.getOpcode() != ExtOpc)
47436 return SDValue();
47437
47438 // Peek through the extends.
47439 LHS = LHS.getOperand(0);
47440 RHS = RHS.getOperand(0);
47441
47442 // Ensure the input types match.
47443 EVT MulVT = LHS.getValueType();
47444 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
47445 return SDValue();
47446
47447 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
47448 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
47449
47450 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
47451 return DAG.getNode(ExtOpc, DL, VT, Mulh);
47452}
47453
47454 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
47455   SDValue N0 = N->getOperand(0);
47456 SDValue N1 = N->getOperand(1);
47457 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
47458 EVT VT = N0.getValueType();
47459
47460 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
47461 // since the result of setcc_c is all zero's or all ones.
47462 if (VT.isInteger() && !VT.isVector() &&
47463 N1C && N0.getOpcode() == ISD::AND &&
47464 N0.getOperand(1).getOpcode() == ISD::Constant) {
47465 SDValue N00 = N0.getOperand(0);
47466 APInt Mask = N0.getConstantOperandAPInt(1);
47467 Mask <<= N1C->getAPIntValue();
47468 bool MaskOK = false;
47469 // We can handle cases concerning bit-widening nodes containing setcc_c if
47470     // we carefully interrogate the mask to make sure we are preserving the
47471     // semantics.
47472 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
47473 // of the underlying setcc_c operation if the setcc_c was zero extended.
47474 // Consider the following example:
47475 // zext(setcc_c) -> i32 0x0000FFFF
47476 // c1 -> i32 0x0000FFFF
47477 // c2 -> i32 0x00000001
47478 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
47479 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
47480 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
47481 MaskOK = true;
47482 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
47483                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47484       MaskOK = true;
47485 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
47486 N00.getOpcode() == ISD::ANY_EXTEND) &&
47487                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47488       MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
47489 }
47490 if (MaskOK && Mask != 0) {
47491 SDLoc DL(N);
47492 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
47493 }
47494 }
47495
47496 return SDValue();
47497}
47498
47499 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
47500                                            const X86Subtarget &Subtarget) {
47501 SDValue N0 = N->getOperand(0);
47502 SDValue N1 = N->getOperand(1);
47503 EVT VT = N0.getValueType();
47504 unsigned Size = VT.getSizeInBits();
47505
47506 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
47507 return V;
47508
47509 APInt ShiftAmt;
47510 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA) &&
47511 N1.getOpcode() == ISD::UMIN &&
47512 ISD::isConstantSplatVector(N1.getOperand(1).getNode(), ShiftAmt) &&
47513 ShiftAmt == VT.getScalarSizeInBits() - 1) {
47514 SDValue ShrAmtVal = N1.getOperand(0);
47515 SDLoc DL(N);
47516 return DAG.getNode(X86ISD::VSRAV, DL, N->getVTList(), N0, ShrAmtVal);
47517 }
47518
47519 // fold (SRA (SHL X, ShlConst), SraConst)
47520 // into (SHL (sext_in_reg X), ShlConst - SraConst)
47521 // or (sext_in_reg X)
47522 // or (SRA (sext_in_reg X), SraConst - ShlConst)
47523 // depending on relation between SraConst and ShlConst.
47524 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
47525   // us to do the sext_in_reg from the corresponding bit.
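  // e.g. on i32, (sra (shl X, 24), 28) becomes (sra (sext_in_reg X, i8), 4):
  // a movsx of the low byte followed by a shift right by SraConst - ShlConst.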
47526
47527   // sexts in X86 are MOVs. The MOVs have the same code size
47528   // as the SHIFTs above (only a shift by 1 has a smaller encoding).
47529   // However, the MOVs have two advantages over a SHIFT:
47530   // 1. MOVs can write to a register that differs from the source.
47531   // 2. MOVs accept memory operands.
47532
47533 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
47534 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
47535       N0.getOperand(1).getOpcode() != ISD::Constant)
47536     return SDValue();
47537
47538 SDValue N00 = N0.getOperand(0);
47539 SDValue N01 = N0.getOperand(1);
47540 APInt ShlConst = N01->getAsAPIntVal();
47541 APInt SraConst = N1->getAsAPIntVal();
47542 EVT CVT = N1.getValueType();
47543
47544 if (CVT != N01.getValueType())
47545 return SDValue();
47546 if (SraConst.isNegative())
47547 return SDValue();
47548
47549 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
47550 unsigned ShiftSize = SVT.getSizeInBits();
47551 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
47552 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
47553 continue;
47554 SDLoc DL(N);
47555 SDValue NN =
47556 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
47557 if (SraConst.eq(ShlConst))
47558 return NN;
47559 if (SraConst.ult(ShlConst))
47560 return DAG.getNode(ISD::SHL, DL, VT, NN,
47561 DAG.getConstant(ShlConst - SraConst, DL, CVT));
47562 return DAG.getNode(ISD::SRA, DL, VT, NN,
47563 DAG.getConstant(SraConst - ShlConst, DL, CVT));
47564 }
47565 return SDValue();
47566}
47567
47568 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
47569                                         TargetLowering::DAGCombinerInfo &DCI,
47570                                         const X86Subtarget &Subtarget) {
47571 SDValue N0 = N->getOperand(0);
47572 SDValue N1 = N->getOperand(1);
47573 EVT VT = N0.getValueType();
47574
47575 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
47576 return V;
47577
47578 // Only do this on the last DAG combine as it can interfere with other
47579 // combines.
47580 if (!DCI.isAfterLegalizeDAG())
47581 return SDValue();
47582
47583 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
47584 // TODO: This is a generic DAG combine that became an x86-only combine to
47585 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
47586 // and-not ('andn').
47587 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
47588 return SDValue();
47589
47590 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
47591 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
47592 if (!ShiftC || !AndC)
47593 return SDValue();
47594
47595 // If we can shrink the constant mask below 8-bits or 32-bits, then this
47596 // transform should reduce code size. It may also enable secondary transforms
47597 // from improved known-bits analysis or instruction selection.
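  // e.g. (srl (and X, 0xFF0), 4) --> (and (srl X, 4), 0xFF): the 12-bit mask
  // shrinks to an 8-bit mask that fits in an imm8.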
47598 APInt MaskVal = AndC->getAPIntValue();
47599
47600 // If this can be matched by a zero extend, don't optimize.
47601 if (MaskVal.isMask()) {
47602 unsigned TO = MaskVal.countr_one();
47603 if (TO >= 8 && isPowerOf2_32(TO))
47604 return SDValue();
47605 }
47606
47607 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
47608 unsigned OldMaskSize = MaskVal.getSignificantBits();
47609 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
47610 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
47611 (OldMaskSize > 32 && NewMaskSize <= 32)) {
47612 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
47613 SDLoc DL(N);
47614 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
47615 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
47616 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
47617 }
47618 return SDValue();
47619}
47620
47621 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
47622                                          const X86Subtarget &Subtarget) {
47623 unsigned Opcode = N->getOpcode();
47624 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
47625
47626 SDLoc DL(N);
47627 EVT VT = N->getValueType(0);
47628 SDValue N0 = N->getOperand(0);
47629 SDValue N1 = N->getOperand(1);
47630 EVT SrcVT = N0.getValueType();
47631
47632 SDValue BC0 =
47633 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
47634 SDValue BC1 =
47635 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
47636
47637 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
47638 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
47639 // truncation trees that help us avoid lane crossing shuffles.
47640 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
47641 // TODO: We don't handle vXf64 shuffles yet.
47642 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
47643 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
47644       SmallVector<SDValue> ShuffleOps;
47645       SmallVector<int> ShuffleMask, ScaledMask;
47646 SDValue Vec = peekThroughBitcasts(BCSrc);
47647 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
47648         resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
47649         // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
47650 // shuffle to a v4X64 width - we can probably relax this in the future.
47651 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
47652 ShuffleOps[0].getValueType().is256BitVector() &&
47653 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
47654 SDValue Lo, Hi;
47655 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
47656 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
47657 Lo = DAG.getBitcast(SrcVT, Lo);
47658 Hi = DAG.getBitcast(SrcVT, Hi);
47659 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
47660 Res = DAG.getBitcast(ShufVT, Res);
47661 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
47662 return DAG.getBitcast(VT, Res);
47663 }
47664 }
47665 }
47666 }
47667
47668 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
47669 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
47670 // If either/both ops are a shuffle that can scale to v2x64,
47671 // then see if we can perform this as a v4x32 post shuffle.
47672 SmallVector<SDValue> Ops0, Ops1;
47673 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
47674 bool IsShuf0 =
47675 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
47676 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
47677 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
47678 bool IsShuf1 =
47679 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
47680 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
47681 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
47682 if (IsShuf0 || IsShuf1) {
47683 if (!IsShuf0) {
47684 Ops0.assign({BC0});
47685 ScaledMask0.assign({0, 1});
47686 }
47687 if (!IsShuf1) {
47688 Ops1.assign({BC1});
47689 ScaledMask1.assign({0, 1});
47690 }
47691
47692 SDValue LHS, RHS;
47693 int PostShuffle[4] = {-1, -1, -1, -1};
47694 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
47695 if (M < 0)
47696 return true;
47697 Idx = M % 2;
47698 SDValue Src = Ops[M / 2];
47699 if (!LHS || LHS == Src) {
47700 LHS = Src;
47701 return true;
47702 }
47703 if (!RHS || RHS == Src) {
47704 Idx += 2;
47705 RHS = Src;
47706 return true;
47707 }
47708 return false;
47709 };
47710 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
47711 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
47712 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
47713 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
47714 LHS = DAG.getBitcast(SrcVT, LHS);
47715 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
47716 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
47717 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
47718 Res = DAG.getBitcast(ShufVT, Res);
47719 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
47720 return DAG.getBitcast(VT, Res);
47721 }
47722 }
47723 }
47724
47725 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
47726 if (VT.is256BitVector() && Subtarget.hasInt256()) {
47727 SmallVector<int> Mask0, Mask1;
47728 SmallVector<SDValue> Ops0, Ops1;
47729 SmallVector<int, 2> ScaledMask0, ScaledMask1;
47730 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
47731 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
47732 !Ops0.empty() && !Ops1.empty() &&
47733 all_of(Ops0,
47734 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47735 all_of(Ops1,
47736 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47737 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
47738 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
47739 SDValue Op00 = peekThroughBitcasts(Ops0.front());
47740 SDValue Op10 = peekThroughBitcasts(Ops1.front());
47741 SDValue Op01 = peekThroughBitcasts(Ops0.back());
47742 SDValue Op11 = peekThroughBitcasts(Ops1.back());
47743 if ((Op00 == Op11) && (Op01 == Op10)) {
47744 std::swap(Op10, Op11);
47745         ShuffleVectorSDNode::commuteMask(ScaledMask1);
47746       }
47747 if ((Op00 == Op10) && (Op01 == Op11)) {
47748 const int Map[4] = {0, 2, 1, 3};
47749 SmallVector<int, 4> ShuffleMask(
47750 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
47751 Map[ScaledMask1[1]]});
47752 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
47753 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
47754 DAG.getBitcast(SrcVT, Op01));
47755 Res = DAG.getBitcast(ShufVT, Res);
47756 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
47757 return DAG.getBitcast(VT, Res);
47758 }
47759 }
47760 }
47761
47762 return SDValue();
47763}
47764
47765 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
47766                                  TargetLowering::DAGCombinerInfo &DCI,
47767                                  const X86Subtarget &Subtarget) {
47768 unsigned Opcode = N->getOpcode();
47769 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
47770 "Unexpected pack opcode");
47771
47772 EVT VT = N->getValueType(0);
47773 SDValue N0 = N->getOperand(0);
47774 SDValue N1 = N->getOperand(1);
47775 unsigned NumDstElts = VT.getVectorNumElements();
47776 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
47777 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
47778 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
47779 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
47780 "Unexpected PACKSS/PACKUS input type");
47781
47782 bool IsSigned = (X86ISD::PACKSS == Opcode);
47783
47784 // Constant Folding.
47785 APInt UndefElts0, UndefElts1;
47786 SmallVector<APInt, 32> EltBits0, EltBits1;
47787 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
47788 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
47789 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
47790 /*AllowWholeUndefs*/ true,
47791 /*AllowPartialUndefs*/ true) &&
47792 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
47793 /*AllowWholeUndefs*/ true,
47794 /*AllowPartialUndefs*/ true)) {
47795 unsigned NumLanes = VT.getSizeInBits() / 128;
47796 unsigned NumSrcElts = NumDstElts / 2;
47797 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
47798 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
47799
47800 APInt Undefs(NumDstElts, 0);
47801 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
47802 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
47803 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
47804 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
47805 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
47806 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
47807
47808 if (UndefElts[SrcIdx]) {
47809 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
47810 continue;
47811 }
47812
47813 APInt &Val = EltBits[SrcIdx];
47814 if (IsSigned) {
47815 // PACKSS: Truncate signed value with signed saturation.
47816 // Source values less than dst minint are saturated to minint.
47817 // Source values greater than dst maxint are saturated to maxint.
47818 Val = Val.truncSSat(DstBitsPerElt);
47819 } else {
47820 // PACKUS: Truncate signed value with unsigned saturation.
47821 // Source values less than zero are saturated to zero.
47822 // Source values greater than dst maxuint are saturated to maxuint.
47823 // NOTE: This is different from APInt::truncUSat.
47824 if (Val.isIntN(DstBitsPerElt))
47825 Val = Val.trunc(DstBitsPerElt);
47826 else if (Val.isNegative())
47827 Val = APInt::getZero(DstBitsPerElt);
47828 else
47829 Val = APInt::getAllOnes(DstBitsPerElt);
47830 }
47831 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
47832 }
47833 }
47834
47835 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
47836 }
47837
47838 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
47839 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
47840 return V;
47841
47842 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
47843 // Currently limit this to allsignbits cases only.
47844 if (IsSigned &&
47845 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
47846 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
47847 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
47848 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
47849 if (Not0 && Not1) {
47850 SDLoc DL(N);
47851 MVT SrcVT = N0.getSimpleValueType();
47852 SDValue Pack =
47853 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
47854 DAG.getBitcast(SrcVT, Not1));
47855 return DAG.getNOT(DL, Pack, VT);
47856 }
47857 }
47858
47859 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
47860 // truncate to create a larger truncate.
47861 if (Subtarget.hasAVX512() &&
47862 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
47863 N0.getOperand(0).getValueType() == MVT::v8i32) {
47864 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
47865 (!IsSigned &&
47866 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
47867 if (Subtarget.hasVLX())
47868 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
47869
47870 // Widen input to v16i32 so we can truncate that.
47871 SDLoc dl(N);
47872 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
47873 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
47874 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
47875 }
47876 }
47877
47878 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
47879 if (VT.is128BitVector()) {
47880 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
47881 SDValue Src0, Src1;
47882 if (N0.getOpcode() == ExtOpc &&
47883         N0.getOperand(0).getValueType().is64BitVector() &&
47884         N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
47885 Src0 = N0.getOperand(0);
47886 }
47887 if (N1.getOpcode() == ExtOpc &&
47888         N1.getOperand(0).getValueType().is64BitVector() &&
47889         N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
47890 Src1 = N1.getOperand(0);
47891 }
47892 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
47893 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
47894 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
47895 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
47896 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
47897 }
47898
47899 // Try again with pack(*_extend_vector_inreg, undef).
47900 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
47901                                     : ISD::ZERO_EXTEND_VECTOR_INREG;
47902     if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
47903 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
47904 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
47905 DAG);
47906 }
47907
47908 // Attempt to combine as shuffle.
47909 SDValue Op(N, 0);
47910 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47911 return Res;
47912
47913 return SDValue();
47914}
47915
47916 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
47917                                     TargetLowering::DAGCombinerInfo &DCI,
47918                                     const X86Subtarget &Subtarget) {
47919 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
47920 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
47921 "Unexpected horizontal add/sub opcode");
47922
47923 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
47924 MVT VT = N->getSimpleValueType(0);
47925 SDValue LHS = N->getOperand(0);
47926 SDValue RHS = N->getOperand(1);
47927
47928     // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
47929 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
47930 LHS.getOpcode() == RHS.getOpcode() &&
47931 LHS.getValueType() == RHS.getValueType() &&
47932 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
47933 SDValue LHS0 = LHS.getOperand(0);
47934 SDValue LHS1 = LHS.getOperand(1);
47935 SDValue RHS0 = RHS.getOperand(0);
47936 SDValue RHS1 = RHS.getOperand(1);
47937 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
47938 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
47939 SDLoc DL(N);
47940 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
47941 LHS0.isUndef() ? LHS1 : LHS0,
47942 RHS0.isUndef() ? RHS1 : RHS0);
47943 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
47944 Res = DAG.getBitcast(ShufVT, Res);
47945 SDValue NewLHS =
47946 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
47947 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
47948 SDValue NewRHS =
47949 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
47950 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
47951 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
47952 DAG.getBitcast(VT, NewRHS));
47953 }
47954 }
47955 }
47956
47957 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
47958 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
47959 return V;
47960
47961 return SDValue();
47962}
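
// Illustrative sketch, not part of X86ISelLowering.cpp: a v4i32 model of the
// HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))) fold
// above, checked at compile time. The helpers below are hypothetical and only
// model the lane arithmetic, not the DAG nodes:
struct V4 { int E[4]; };
constexpr V4 hadd4(V4 A, V4 B) {
  return {{A.E[0] + A.E[1], A.E[2] + A.E[3], B.E[0] + B.E[1], B.E[2] + B.E[3]}};
}
constexpr V4 pshufd(V4 A, int M0, int M1, int M2, int M3) {
  return {{A.E[M0], A.E[M1], A.E[M2], A.E[M3]}};
}
constexpr bool sameV4(V4 A, V4 B) {
  return A.E[0] == B.E[0] && A.E[1] == B.E[1] && A.E[2] == B.E[2] &&
         A.E[3] == B.E[3];
}
constexpr V4 VX{{1, 2, 3, 4}}, VY{{5, 6, 7, 8}};
constexpr V4 TwoHops = hadd4(hadd4(VX, VX), hadd4(VY, VY));
constexpr V4 OneHop = hadd4(VX, VY);
constexpr V4 Permuted =
    hadd4(pshufd(OneHop, 0, 1, 0, 1), pshufd(OneHop, 2, 3, 2, 3));
static_assert(sameV4(TwoHops, Permuted), "single HADD + PSHUFD matches");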
47963
47964 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
47965 TargetLowering::DAGCombinerInfo &DCI,
47966 const X86Subtarget &Subtarget) {
47967 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
47968 X86ISD::VSRL == N->getOpcode()) &&
47969 "Unexpected shift opcode");
47970 EVT VT = N->getValueType(0);
47971 SDValue N0 = N->getOperand(0);
47972 SDValue N1 = N->getOperand(1);
47973
47974 // Shift zero -> zero.
47975 if (ISD::isBuildVectorAllZeros(N0.getNode()))
47976 return DAG.getConstant(0, SDLoc(N), VT);
47977
47978 // Detect constant shift amounts.
47979 APInt UndefElts;
47980 SmallVector<APInt, 32> EltBits;
47981 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
47982 /*AllowWholeUndefs*/ true,
47983 /*AllowPartialUndefs*/ false)) {
47984 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
47985 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
47986 EltBits[0].getZExtValue(), DAG);
47987 }
47988
47989 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47990 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
47991 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
47992 return SDValue(N, 0);
47993
47994 return SDValue();
47995}
47996
47997 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
47998 TargetLowering::DAGCombinerInfo &DCI,
47999 const X86Subtarget &Subtarget) {
48000 unsigned Opcode = N->getOpcode();
48001 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
48002 X86ISD::VSRLI == Opcode) &&
48003 "Unexpected shift opcode");
48004 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
48005 EVT VT = N->getValueType(0);
48006 SDValue N0 = N->getOperand(0);
48007 SDValue N1 = N->getOperand(1);
48008 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48009 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
48010 "Unexpected value type");
48011 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
48012
48013 // (shift undef, X) -> 0
48014 if (N0.isUndef())
48015 return DAG.getConstant(0, SDLoc(N), VT);
48016
48017 // Out of range logical bit shifts are guaranteed to be zero.
48018 // Out of range arithmetic bit shifts splat the sign bit.
48019 unsigned ShiftVal = N->getConstantOperandVal(1);
48020 if (ShiftVal >= NumBitsPerElt) {
48021 if (LogicalShift)
48022 return DAG.getConstant(0, SDLoc(N), VT);
48023 ShiftVal = NumBitsPerElt - 1;
48024 }
48025
48026 // (shift X, 0) -> X
48027 if (!ShiftVal)
48028 return N0;
48029
48030 // (shift 0, C) -> 0
48031 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48032 // N0 is all zeros or undef. We guarantee that the bits shifted into the
48033 // result are all zeros, not undef.
48034 return DAG.getConstant(0, SDLoc(N), VT);
48035
48036 // (VSRAI -1, C) -> -1
48037 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
48038 // N0 is all ones or undef. We guarantee that the bits shifted into the
48039 // result are all ones, not undef.
48040 return DAG.getConstant(-1, SDLoc(N), VT);
48041
48042 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
48043 unsigned NewShiftVal = Amt0 + Amt1;
48044 if (NewShiftVal >= NumBitsPerElt) {
48045 // Out of range logical bit shifts are guaranteed to be zero.
48046 // Out of range arithmetic bit shifts splat the sign bit.
48047 if (LogicalShift)
48048 return DAG.getConstant(0, SDLoc(N), VT);
48049 NewShiftVal = NumBitsPerElt - 1;
48050 }
48051 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
48052 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
48053 };
48054
48055 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
48056 if (Opcode == N0.getOpcode())
48057 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
48058
48059 // (shl (add X, X), C) -> (shl X, (C + 1))
48060 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
48061 N0.getOperand(0) == N0.getOperand(1))
48062 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
48063
48064 // We can decode 'whole byte' logical bit shifts as shuffles.
48065 if (LogicalShift && (ShiftVal % 8) == 0) {
48066 SDValue Op(N, 0);
48067 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48068 return Res;
48069 }
48070
48071 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
48072 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
48073 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
48074 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
48075 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
48076 N0.getOpcode() == X86ISD::PSHUFD &&
48077 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
48078 N0->hasOneUse()) {
48079 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
48080 if (BC.getOpcode() == X86ISD::VSHLI &&
48081 BC.getScalarValueSizeInBits() == 64 &&
48082 BC.getConstantOperandVal(1) == 63) {
48083 SDLoc DL(N);
48084 SDValue Src = BC.getOperand(0);
48085 Src = DAG.getBitcast(VT, Src);
48086 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
48087 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
48088 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
48089 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
48090 return Src;
48091 }
48092 }
48093
48094 auto TryConstantFold = [&](SDValue V) {
48095 APInt UndefElts;
48096 SmallVector<APInt, 32> EltBits;
48097 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
48098 /*AllowWholeUndefs*/ true,
48099 /*AllowPartialUndefs*/ true))
48100 return SDValue();
48101 assert(EltBits.size() == VT.getVectorNumElements() &&
48102 "Unexpected shift value type");
48103 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
48104 // created an undef input due to no input bits being demanded, but user
48105 // still expects 0 in other bits.
48106 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
48107 APInt &Elt = EltBits[i];
48108 if (UndefElts[i])
48109 Elt = 0;
48110 else if (X86ISD::VSHLI == Opcode)
48111 Elt <<= ShiftVal;
48112 else if (X86ISD::VSRAI == Opcode)
48113 Elt.ashrInPlace(ShiftVal);
48114 else
48115 Elt.lshrInPlace(ShiftVal);
48116 }
48117 // Reset undef elements since they were zeroed above.
48118 UndefElts = 0;
48119 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
48120 };
48121
48122 // Constant Folding.
48123 if (N->isOnlyUserOf(N0.getNode())) {
48124 if (SDValue C = TryConstantFold(N0))
48125 return C;
48126
48127 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
48128 // Don't break NOT patterns.
48129 SDValue BC = peekThroughOneUseBitcasts(N0);
48130 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
48131 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
48132 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
48133 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
48134 SDLoc DL(N);
48135 SDValue LHS = DAG.getNode(Opcode, DL, VT,
48136 DAG.getBitcast(VT, BC.getOperand(0)), N1);
48137 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
48138 }
48139 }
48140 }
48141
48142 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48143 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
48144 DCI))
48145 return SDValue(N, 0);
48146
48147 return SDValue();
48148}
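
// Illustrative sketch, not part of X86ISelLowering.cpp: the immediate-shift
// combines above rely on x86 vector-shift semantics for out-of-range amounts:
// logical shifts of at least the element width give zero, arithmetic shifts
// splat the sign bit. A minimal i8 model (assumes the usual arithmetic right
// shift for negative values, which C++20 guarantees):
constexpr unsigned char vsrliScalar(unsigned char X, unsigned Amt) {
  return Amt >= 8 ? 0 : (unsigned char)(X >> Amt);
}
constexpr unsigned char vsraiScalar(signed char X, unsigned Amt) {
  return (unsigned char)(X >> (Amt >= 8 ? 7 : Amt));
}
static_assert(vsrliScalar(0x80, 9) == 0x00, "out-of-range logical shift -> 0");
static_assert(vsraiScalar(-128, 9) == 0xFF, "out-of-range arith shift -> sign");
// Merging (shift (shift X, C2), C1) into (shift X, C1 + C2) is safe because
// the summed amount is clamped the same way:
static_assert(vsrliScalar(vsrliScalar(0xFF, 5), 5) == vsrliScalar(0xFF, 10),
              "merged shift amount");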
48149
48150 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
48151 TargetLowering::DAGCombinerInfo &DCI,
48152 const X86Subtarget &Subtarget) {
48153 EVT VT = N->getValueType(0);
48154 unsigned Opcode = N->getOpcode();
48155 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
48156 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
48157 Opcode == ISD::INSERT_VECTOR_ELT) &&
48158 "Unexpected vector insertion");
48159
48160 SDValue Vec = N->getOperand(0);
48161 SDValue Scl = N->getOperand(1);
48162 SDValue Idx = N->getOperand(2);
48163
48164 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
48165 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
48166 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
48167
48168 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
48169 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48170 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48171 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
48172 APInt::getAllOnes(NumBitsPerElt), DCI))
48173 return SDValue(N, 0);
48174 }
48175
48176 // Attempt to combine insertion patterns to a shuffle.
48177 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
48178 SDValue Op(N, 0);
48179 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48180 return Res;
48181 }
48182
48183 return SDValue();
48184}
48185
48186/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
48187/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
48188/// OR -> CMPNEQSS.
48189 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
48190 TargetLowering::DAGCombinerInfo &DCI,
48191 const X86Subtarget &Subtarget) {
48192 unsigned opcode;
48193
48194 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
48195 // we're requiring SSE2 for both.
48196 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
48197 SDValue N0 = N->getOperand(0);
48198 SDValue N1 = N->getOperand(1);
48199 SDValue CMP0 = N0.getOperand(1);
48200 SDValue CMP1 = N1.getOperand(1);
48201 SDLoc DL(N);
48202
48203 // The SETCCs should both refer to the same CMP.
48204 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
48205 return SDValue();
48206
48207 SDValue CMP00 = CMP0->getOperand(0);
48208 SDValue CMP01 = CMP0->getOperand(1);
48209 EVT VT = CMP00.getValueType();
48210
48211 if (VT == MVT::f32 || VT == MVT::f64 ||
48212 (VT == MVT::f16 && Subtarget.hasFP16())) {
48213 bool ExpectingFlags = false;
48214 // Check for any users that want flags:
48215 for (const SDNode *U : N->uses()) {
48216 if (ExpectingFlags)
48217 break;
48218
48219 switch (U->getOpcode()) {
48220 default:
48221 case ISD::BR_CC:
48222 case ISD::BRCOND:
48223 case ISD::SELECT:
48224 ExpectingFlags = true;
48225 break;
48226 case ISD::CopyToReg:
48227 case ISD::SIGN_EXTEND:
48228 case ISD::ZERO_EXTEND:
48229 case ISD::ANY_EXTEND:
48230 break;
48231 }
48232 }
48233
48234 if (!ExpectingFlags) {
48235 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
48236 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
48237
48238 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
48239 X86::CondCode tmp = cc0;
48240 cc0 = cc1;
48241 cc1 = tmp;
48242 }
48243
48244 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
48245 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
48246 // FIXME: need symbolic constants for these magic numbers.
48247 // See X86ATTInstPrinter.cpp:printSSECC().
48248 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
48249 if (Subtarget.hasAVX512()) {
48250 SDValue FSetCC =
48251 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
48252 DAG.getTargetConstant(x86cc, DL, MVT::i8));
48253 // Need to fill with zeros to ensure the bitcast will produce zeroes
48254 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
48255 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
48256 DAG.getConstant(0, DL, MVT::v16i1),
48257 FSetCC, DAG.getIntPtrConstant(0, DL));
48258 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
48259 N->getSimpleValueType(0));
48260 }
48261 SDValue OnesOrZeroesF =
48262 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
48263 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
48264
48265 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
48266 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
48267
48268 if (is64BitFP && !Subtarget.is64Bit()) {
48269 // On a 32-bit target, we cannot bitcast the 64-bit float to a
48270 // 64-bit integer, since that's not a legal type. Since
48271 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
48272 // bits, but can do this little dance to extract the lowest 32 bits
48273 // and work with those going forward.
48274 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
48275 OnesOrZeroesF);
48276 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
48277 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
48278 Vector32, DAG.getIntPtrConstant(0, DL));
48279 IntVT = MVT::i32;
48280 }
48281
48282 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
48283 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
48284 DAG.getConstant(1, DL, IntVT));
48285 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
48286 ANDed);
48287 return OneBitOfTruth;
48288 }
48289 }
48290 }
48291 }
48292 return SDValue();
48293}
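
// Illustrative sketch, not part of X86ISelLowering.cpp: the COND_E/COND_NP and
// COND_NE/COND_P pairings above mirror how UCOMISS sets EFLAGS -- PF flags an
// unordered compare and ZF is set for equal-or-unordered -- so "E && NP" is
// exactly the ordered-equal predicate a single CMPEQSS computes. A scalar
// model where NaN operands are represented by an explicit Unordered flag:
constexpr bool zfUcomiss(float A, float B, bool Unordered) {
  return Unordered || A == B;
}
constexpr bool pfUcomiss(bool Unordered) { return Unordered; }
constexpr bool cmpeqssModel(float A, float B, bool Unordered) {
  return !Unordered && A == B;
}
static_assert(cmpeqssModel(1.0f, 1.0f, false) ==
                  (zfUcomiss(1.0f, 1.0f, false) && !pfUcomiss(false)),
              "ordered equal");
static_assert(cmpeqssModel(1.0f, 2.0f, true) ==
                  (zfUcomiss(1.0f, 2.0f, true) && !pfUcomiss(true)),
              "unordered compare is not equal");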
48294
48295/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
48296 static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
48297 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48298
48299 MVT VT = N->getSimpleValueType(0);
48300 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
48301 return SDValue();
48302
48303 SDValue X, Y;
48304 SDValue N0 = N->getOperand(0);
48305 SDValue N1 = N->getOperand(1);
48306
48307 if (SDValue Not = IsNOT(N0, DAG)) {
48308 X = Not;
48309 Y = N1;
48310 } else if (SDValue Not = IsNOT(N1, DAG)) {
48311 X = Not;
48312 Y = N0;
48313 } else
48314 return SDValue();
48315
48316 X = DAG.getBitcast(VT, X);
48317 Y = DAG.getBitcast(VT, Y);
48318 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
48319}
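
// Illustrative sketch, not part of X86ISelLowering.cpp: the fold above is just
// the bitwise identity (~X) & Y, which is what ANDNP computes per element:
constexpr unsigned andnpScalar(unsigned X, unsigned Y) { return ~X & Y; }
static_assert(((0x0Fu ^ ~0u) & 0x33u) == andnpScalar(0x0Fu, 0x33u),
              "(and (xor X, -1), Y) == andnp(X, Y)");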
48320
48321/// Try to fold:
48322/// and (vector_shuffle<Z,...,Z>
48323/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
48324/// ->
48325/// andnp (vector_shuffle<Z,...,Z>
48326/// (insert_vector_elt undef, X, Z), undef), Y
48327 static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
48328 const X86Subtarget &Subtarget) {
48329 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48330
48331 EVT VT = N->getValueType(0);
48332 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
48333 // value and require extra moves.
48334 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
48335 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
48336 return SDValue();
48337
48338 auto GetNot = [&DAG](SDValue V) {
48339 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
48340 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
48341 // end-users are ISD::AND including cases
48342 // (and(extract_vector_element(SVN), Y)).
48343 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
48344 !SVN->getOperand(1).isUndef()) {
48345 return SDValue();
48346 }
48347 SDValue IVEN = SVN->getOperand(0);
48348 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
48349 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
48350 return SDValue();
48351 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
48352 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
48353 return SDValue();
48354 SDValue Src = IVEN.getOperand(1);
48355 if (SDValue Not = IsNOT(Src, DAG)) {
48356 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
48357 SDValue NotIVEN =
48358 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
48359 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
48360 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
48361 SVN->getOperand(1), SVN->getMask());
48362 }
48363 return SDValue();
48364 };
48365
48366 SDValue X, Y;
48367 SDValue N0 = N->getOperand(0);
48368 SDValue N1 = N->getOperand(1);
48369 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48370
48371 if (SDValue Not = GetNot(N0)) {
48372 X = Not;
48373 Y = N1;
48374 } else if (SDValue Not = GetNot(N1)) {
48375 X = Not;
48376 Y = N0;
48377 } else
48378 return SDValue();
48379
48380 X = DAG.getBitcast(VT, X);
48381 Y = DAG.getBitcast(VT, Y);
48382 SDLoc DL(N);
48383
48384 // We do not split for SSE at all, but we need to split vectors for AVX1 and
48385 // AVX2.
48386 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
48387 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
48388 SDValue LoX, HiX;
48389 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
48390 SDValue LoY, HiY;
48391 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
48392 EVT SplitVT = LoX.getValueType();
48393 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
48394 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
48395 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
48396 }
48397
48398 if (TLI.isTypeLegal(VT))
48399 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
48400
48401 return SDValue();
48402}
48403
48404// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
48405// logical operations, like in the example below.
48406// or (and (truncate x, truncate y)),
48407// (xor (truncate z, build_vector (constants)))
48408// Given a target type \p VT, we generate
48409// or (and x, y), (xor z, zext(build_vector (constants)))
48410// given x, y and z are of type \p VT. We can do so, if operands are either
48411// truncates from VT types, the second operand is a vector of constants or can
48412// be recursively promoted.
48413 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
48414 SelectionDAG &DAG, unsigned Depth) {
48415 // Limit recursion to avoid excessive compile times.
48416 if (Depth >= SelectionDAG::MaxRecursionDepth)
48417 return SDValue();
48418
48419 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
48420 return SDValue();
48421
48422 SDValue N0 = N.getOperand(0);
48423 SDValue N1 = N.getOperand(1);
48424
48425 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48426 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
48427 return SDValue();
48428
48429 if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
48430 N0 = NN0;
48431 else {
48432 // The left side has to be a trunc.
48433 if (N0.getOpcode() != ISD::TRUNCATE)
48434 return SDValue();
48435
48436 // The type of the truncated inputs.
48437 if (N0.getOperand(0).getValueType() != VT)
48438 return SDValue();
48439
48440 N0 = N0.getOperand(0);
48441 }
48442
48443 if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
48444 N1 = NN1;
48445 else {
48446 // The right side has to be a 'trunc' or a (foldable) constant.
48447 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
48448 N1.getOperand(0).getValueType() == VT;
48449 if (RHSTrunc)
48450 N1 = N1.getOperand(0);
48451 else if (SDValue Cst =
48452 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
48453 N1 = Cst;
48454 else
48455 return SDValue();
48456 }
48457
48458 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
48459}
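
// Illustrative sketch, not part of X86ISelLowering.cpp: promoting the logic op
// above is legal because bitwise operations commute with truncation, e.g.
// trunc(x) & trunc(y) == trunc(x & y):
static_assert((unsigned short)(0x12345u & 0xFF0F0u) ==
                  ((unsigned short)0x12345u & (unsigned short)0xFF0F0u),
              "bitwise ops commute with truncation");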
48460
48461// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
48462// register. In most cases we actually compare or select YMM-sized registers
48463// and mixing the two types creates horrible code. This method optimizes
48464// some of the transition sequences.
48465// Even with AVX-512 this is still useful for removing casts around logical
48466// operations on vXi1 mask types.
48467 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
48468 SelectionDAG &DAG,
48469 const X86Subtarget &Subtarget) {
48470 EVT VT = N.getValueType();
48471 assert(VT.isVector() && "Expected vector type");
48472 assert((N.getOpcode() == ISD::ANY_EXTEND ||
48473 N.getOpcode() == ISD::ZERO_EXTEND ||
48474 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
48475
48476 SDValue Narrow = N.getOperand(0);
48477 EVT NarrowVT = Narrow.getValueType();
48478
48479 // Generate the wide operation.
48480 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0);
48481 if (!Op)
48482 return SDValue();
48483 switch (N.getOpcode()) {
48484 default: llvm_unreachable("Unexpected opcode");
48485 case ISD::ANY_EXTEND:
48486 return Op;
48487 case ISD::ZERO_EXTEND:
48488 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
48489 case ISD::SIGN_EXTEND:
48490 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
48491 Op, DAG.getValueType(NarrowVT));
48492 }
48493}
48494
48495static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
48496 unsigned FPOpcode;
48497 switch (Opcode) {
48498 // clang-format off
48499 default: llvm_unreachable("Unexpected input node for FP logic conversion");
48500 case ISD::AND: FPOpcode = X86ISD::FAND; break;
48501 case ISD::OR: FPOpcode = X86ISD::FOR; break;
48502 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
48503 // clang-format on
48504 }
48505 return FPOpcode;
48506}
48507
48508/// If both input operands of a logic op are being cast from floating-point
48509/// types or FP compares, try to convert this into a floating-point logic node
48510/// to avoid unnecessary moves from SSE to integer registers.
48511 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
48512 TargetLowering::DAGCombinerInfo &DCI,
48513 const X86Subtarget &Subtarget) {
48514 EVT VT = N->getValueType(0);
48515 SDValue N0 = N->getOperand(0);
48516 SDValue N1 = N->getOperand(1);
48517 SDLoc DL(N);
48518
48519 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
48520 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
48521 return SDValue();
48522
48523 SDValue N00 = N0.getOperand(0);
48524 SDValue N10 = N1.getOperand(0);
48525 EVT N00Type = N00.getValueType();
48526 EVT N10Type = N10.getValueType();
48527
48528 // Ensure that both types are the same and are legal scalar fp types.
48529 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
48530 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
48531 (Subtarget.hasFP16() && N00Type == MVT::f16)))
48532 return SDValue();
48533
48534 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
48535 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
48536 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
48537 return DAG.getBitcast(VT, FPLogic);
48538 }
48539
48540 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
48541 !N1.hasOneUse())
48542 return SDValue();
48543
48544 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
48545 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
48546
48547 // The vector ISA for FP predicates is incomplete before AVX, so converting
48548 // COMIS* to CMPS* may not be a win before AVX.
48549 if (!Subtarget.hasAVX() &&
48550 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
48551 return SDValue();
48552
48553 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
48554 // and vector logic:
48555 // logic (setcc N00, N01), (setcc N10, N11) -->
48556 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
48557 unsigned NumElts = 128 / N00Type.getSizeInBits();
48558 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
48559 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
48560 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
48561 SDValue N01 = N0.getOperand(1);
48562 SDValue N11 = N1.getOperand(1);
48563 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
48564 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
48565 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
48566 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
48567 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
48568 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
48569 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
48570 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
48571}
48572
48573// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
48574// to reduce XMM->GPR traffic.
48575 static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
48576 unsigned Opc = N->getOpcode();
48577 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48578 "Unexpected bit opcode");
48579
48580 SDValue N0 = N->getOperand(0);
48581 SDValue N1 = N->getOperand(1);
48582
48583 // Both operands must be single use MOVMSK.
48584 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
48585 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
48586 return SDValue();
48587
48588 SDValue Vec0 = N0.getOperand(0);
48589 SDValue Vec1 = N1.getOperand(0);
48590 EVT VecVT0 = Vec0.getValueType();
48591 EVT VecVT1 = Vec1.getValueType();
48592
48593 // Both MOVMSK operands must be from vectors of the same size and same element
48594 // size, but it's OK for an fp/int diff.
48595 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
48596 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
48597 return SDValue();
48598
48599 SDLoc DL(N);
48600 unsigned VecOpc =
48601 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
48602 SDValue Result =
48603 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
48604 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48605}
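
// Illustrative sketch, not part of X86ISelLowering.cpp: the MOVMSK fold above
// works because the sign bit of (X op Y) is the same op applied to the sign
// bits of X and Y, so the bitop can be done before extracting the mask:
constexpr unsigned signBit32(unsigned V) { return V >> 31; }
static_assert(signBit32(0x80000000u & 0xC0000000u) ==
                  (signBit32(0x80000000u) & signBit32(0xC0000000u)),
              "sign bit distributes over AND");
static_assert(signBit32(0x80000000u & 0x40000000u) ==
                  (signBit32(0x80000000u) & signBit32(0x40000000u)),
              "sign bit distributes over AND");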
48606
48607// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
48608// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
48609// handles in InstCombine.
48610 static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
48611 unsigned Opc = N->getOpcode();
48612 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48613 "Unexpected bit opcode");
48614
48615 SDValue N0 = N->getOperand(0);
48616 SDValue N1 = N->getOperand(1);
48617 EVT VT = N->getValueType(0);
48618
48619 // Both operands must be single use.
48620 if (!N0.hasOneUse() || !N1.hasOneUse())
48621 return SDValue();
48622
48623 // Search for matching shifts.
48624 SDValue BC0 = peekThroughOneUseBitcasts(N0);
48625 SDValue BC1 = peekThroughOneUseBitcasts(N1);
48626
48627 unsigned BCOpc = BC0.getOpcode();
48628 EVT BCVT = BC0.getValueType();
48629 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
48630 return SDValue();
48631
48632 switch (BCOpc) {
48633 case X86ISD::VSHLI:
48634 case X86ISD::VSRLI:
48635 case X86ISD::VSRAI: {
48636 if (BC0.getOperand(1) != BC1.getOperand(1))
48637 return SDValue();
48638
48639 SDLoc DL(N);
48640 SDValue BitOp =
48641 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
48642 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
48643 return DAG.getBitcast(VT, Shift);
48644 }
48645 }
48646
48647 return SDValue();
48648}
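
// Illustrative sketch, not part of X86ISelLowering.cpp: the shift fold above
// uses the identity (x >> c) op (y >> c) == (x op y) >> c, which holds for any
// bitwise op and any common shift amount:
static_assert(((0xF0u >> 4) | (0x0Fu >> 4)) == ((0xF0u | 0x0Fu) >> 4),
              "OR distributes over a common shift");
static_assert(((0xF0u >> 4) & (0xFFu >> 4)) == ((0xF0u & 0xFFu) >> 4),
              "AND distributes over a common shift");
static_assert(((0xF0u >> 4) ^ (0xFFu >> 4)) == ((0xF0u ^ 0xFFu) >> 4),
              "XOR distributes over a common shift");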
48649
48650// Attempt to fold:
48651// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
48652 // TODO: Add PACKUS handling.
48653 static SDValue combineBitOpWithPACK(SDNode *N, SelectionDAG &DAG) {
48654 unsigned Opc = N->getOpcode();
48655 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48656 "Unexpected bit opcode");
48657
48658 SDValue N0 = N->getOperand(0);
48659 SDValue N1 = N->getOperand(1);
48660 EVT VT = N->getValueType(0);
48661
48662 // Both operands must be single use.
48663 if (!N0.hasOneUse() || !N1.hasOneUse())
48664 return SDValue();
48665
48666 // Search for matching packs.
48667 N0 = peekThroughOneUseBitcasts(N0);
48668 N1 = peekThroughOneUseBitcasts(N1);
48669
48670 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
48671 return SDValue();
48672
48673 MVT DstVT = N0.getSimpleValueType();
48674 if (DstVT != N1.getSimpleValueType())
48675 return SDValue();
48676
48677 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
48678 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
48679
48680 // Limit to allsignbits packing.
48681 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
48682 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
48683 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
48684 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
48685 return SDValue();
48686
48687 SDLoc DL(N);
48688 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
48689 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
48690 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
48691}
48692
48693/// If this is a zero/all-bits result that is bitwise-anded with a low bits
48694/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
48695/// with a shift-right to eliminate loading the vector constant mask value.
48696 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
48697 const X86Subtarget &Subtarget) {
48698 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
48699 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
48700 EVT VT = Op0.getValueType();
48701 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
48702 return SDValue();
48703
48704 // Try to convert an "is positive" signbit masking operation into arithmetic
48705 // shift and "andn". This saves a materialization of a -1 vector constant.
48706 // The "is negative" variant should be handled more generally because it only
48707 // requires "and" rather than "andn":
48708 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
48709 //
48710 // This is limited to the original type to avoid producing even more bitcasts.
48711 // If the bitcasts can't be eliminated, then it is unlikely that this fold
48712 // will be profitable.
48713 if (N->getValueType(0) == VT &&
48714 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
48715 SDValue X, Y;
48716 if (Op1.getOpcode() == X86ISD::PCMPGT &&
48717 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
48718 X = Op1.getOperand(0);
48719 Y = Op0;
48720 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
48721 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
48722 X = Op0.getOperand(0);
48723 Y = Op1;
48724 }
48725 if (X && Y) {
48726 SDLoc DL(N);
48727 SDValue Sra =
48728 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
48729 VT.getScalarSizeInBits() - 1, DAG);
48730 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
48731 }
48732 }
48733
48734 APInt SplatVal;
48735 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
48736 return SDValue();
48737
48738 // Don't prevent creation of ANDN.
48739 if (isBitwiseNot(Op0))
48740 return SDValue();
48741
48742 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
48743 return SDValue();
48744
48745 unsigned EltBitWidth = VT.getScalarSizeInBits();
48746 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
48747 return SDValue();
48748
48749 SDLoc DL(N);
48750 unsigned ShiftVal = SplatVal.countr_one();
48751 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
48752 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
48753 return DAG.getBitcast(N->getValueType(0), Shift);
48754}
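
// Illustrative sketch, not part of X86ISelLowering.cpp: when every element is
// known to be all-ones or all-zeros (ComputeNumSignBits == element width),
// masking the low bits is the same as a logical shift right, which avoids
// materialising the mask constant:
constexpr unsigned kUnsignedBits = sizeof(unsigned) * 8;
static_assert((~0u & 0xFFu) == (~0u >> (kUnsignedBits - 8)),
              "all-ones & low-mask == srl");
static_assert((0u & 0xFFu) == (0u >> (kUnsignedBits - 8)),
              "zero & low-mask == srl");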
48755
48756// Get the index node from the lowered DAG of a GEP IR instruction with one
48757// indexing dimension.
48758 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
48759 if (Ld->isIndexed())
48760 return SDValue();
48761
48762 SDValue Base = Ld->getBasePtr();
48763
48764 if (Base.getOpcode() != ISD::ADD)
48765 return SDValue();
48766
48767 SDValue ShiftedIndex = Base.getOperand(0);
48768
48769 if (ShiftedIndex.getOpcode() != ISD::SHL)
48770 return SDValue();
48771
48772 return ShiftedIndex.getOperand(0);
48773
48774}
48775
48776static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
48777 return Subtarget.hasBMI2() &&
48778 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
48779}
48780
48781 // This function recognizes cases where the X86 bzhi instruction can replace an
48782// 'and-load' sequence.
48783// In case of loading integer value from an array of constants which is defined
48784// as follows:
48785//
48786// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
48787//
48788// then applying a bitwise and on the result with another input.
48789// It's equivalent to performing bzhi (zero high bits) on the input, with the
48790// same index of the load.
48791 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
48792 const X86Subtarget &Subtarget) {
48793 MVT VT = Node->getSimpleValueType(0);
48794 SDLoc dl(Node);
48795
48796 // Check if subtarget has BZHI instruction for the node's type
48797 if (!hasBZHI(Subtarget, VT))
48798 return SDValue();
48799
48800 // Try matching the pattern for both operands.
48801 for (unsigned i = 0; i < 2; i++) {
48802 SDValue N = Node->getOperand(i);
48803 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
48804
48805 // continue if the operand is not a load instruction
48806 if (!Ld)
48807 return SDValue();
48808
48809 const Value *MemOp = Ld->getMemOperand()->getValue();
48810
48811 if (!MemOp)
48812 return SDValue();
48813
48814 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
48815 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
48816 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
48817
48818 Constant *Init = GV->getInitializer();
48819 Type *Ty = Init->getType();
48820 if (!isa<ConstantDataArray>(Init) ||
48821 !Ty->getArrayElementType()->isIntegerTy() ||
48822 Ty->getArrayElementType()->getScalarSizeInBits() !=
48823 VT.getSizeInBits() ||
48824 Ty->getArrayNumElements() >
48825 Ty->getArrayElementType()->getScalarSizeInBits())
48826 continue;
48827
48828 // Check if the array's constant elements are suitable to our case.
48829 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
48830 bool ConstantsMatch = true;
48831 for (uint64_t j = 0; j < ArrayElementCount; j++) {
48832 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
48833 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
48834 ConstantsMatch = false;
48835 break;
48836 }
48837 }
48838 if (!ConstantsMatch)
48839 continue;
48840
48841 // Do the transformation (For 32-bit type):
48842 // -> (and (load arr[idx]), inp)
48843 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
48844 // that will be replaced with one bzhi instruction.
48845 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
48846 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
48847
48848 // Get the Node which indexes into the array.
48849 SDValue Index = getIndexFromUnindexedLoad(Ld);
48850 if (!Index)
48851 return SDValue();
48852 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
48853
48854 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
48855 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
48856
48857 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
48858 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
48859
48860 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
48861 }
48862 }
48863 }
48864 }
48865 return SDValue();
48866}
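
// Illustrative sketch, not part of X86ISelLowering.cpp: the table recognised
// above stores arr[j] == (1 << j) - 1, so "load arr[idx]; and inp" keeps
// exactly the low idx bits of inp, which is what BZHI computes directly from
// idx (helper names are hypothetical; Idx is assumed to be < 32):
constexpr unsigned maskTableEntry(unsigned J) { return (1u << J) - 1u; }
constexpr unsigned bzhiModel(unsigned Src, unsigned Idx) {
  return Src & maskTableEntry(Idx);
}
static_assert(maskTableEntry(4) == 0xFu, "arr[4] == 0xF");
static_assert(bzhiModel(0xDEADBEEFu, 12) == 0xEEFu, "keep the low 12 bits");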
48867
48868// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
48869// Where C is a mask containing the same number of bits as the setcc and
48870// where the setcc will freely 0 upper bits of k-register. We can replace the
48871// undef in the concat with 0s and remove the AND. This mainly helps with
48872// v2i1/v4i1 setcc being casted to scalar.
48873 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
48874 const X86Subtarget &Subtarget) {
48875 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
48876
48877 EVT VT = N->getValueType(0);
48878
48879 // Make sure this is an AND with constant. We will check the value of the
48880 // constant later.
48881 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
48882 if (!C1)
48883 return SDValue();
48884
48885 // This is implied by the ConstantSDNode.
48886 assert(!VT.isVector() && "Expected scalar VT!");
48887
48888 SDValue Src = N->getOperand(0);
48889 if (!Src.hasOneUse())
48890 return SDValue();
48891
48892 // (Optionally) peek through any_extend().
48893 if (Src.getOpcode() == ISD::ANY_EXTEND) {
48894 if (!Src.getOperand(0).hasOneUse())
48895 return SDValue();
48896 Src = Src.getOperand(0);
48897 }
48898
48899 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
48900 return SDValue();
48901
48902 Src = Src.getOperand(0);
48903 EVT SrcVT = Src.getValueType();
48904
48905 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48906 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
48907 !TLI.isTypeLegal(SrcVT))
48908 return SDValue();
48909
48910 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
48911 return SDValue();
48912
48913 // We only care about the first subvector of the concat, we expect the
48914 // other subvectors to be ignored due to the AND if we make the change.
48915 SDValue SubVec = Src.getOperand(0);
48916 EVT SubVecVT = SubVec.getValueType();
48917
48918 // The RHS of the AND should be a mask with as many bits as SubVec.
48919 if (!TLI.isTypeLegal(SubVecVT) ||
48920 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
48921 return SDValue();
48922
48923 // First subvector should be a setcc with a legal result type or a
48924 // AND containing at least one setcc with a legal result type.
48925 auto IsLegalSetCC = [&](SDValue V) {
48926 if (V.getOpcode() != ISD::SETCC)
48927 return false;
48928 EVT SetccVT = V.getOperand(0).getValueType();
48929 if (!TLI.isTypeLegal(SetccVT) ||
48930 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
48931 return false;
48932 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
48933 return false;
48934 return true;
48935 };
48936 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
48937 (IsLegalSetCC(SubVec.getOperand(0)) ||
48938 IsLegalSetCC(SubVec.getOperand(1))))))
48939 return SDValue();
48940
48941 // We passed all the checks. Rebuild the concat_vectors with zeroes
48942 // and cast it back to VT.
48943 SDLoc dl(N);
48944 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
48945 DAG.getConstant(0, dl, SubVecVT));
48946 Ops[0] = SubVec;
48947 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
48948 Ops);
48949 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
48950 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
48951}
48952
48953static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
48954 SDValue OpMustEq, SDValue Op, unsigned Depth) {
48955 // We don't want to go crazy with the recursion here. This isn't a super
48956 // important optimization.
48957 static constexpr unsigned kMaxDepth = 2;
48958
48959 // Only do this re-ordering if op has one use.
48960 if (!Op.hasOneUse())
48961 return SDValue();
48962
48963 SDLoc DL(Op);
48964 // If we hit another associative op, recurse further.
48965 if (Op.getOpcode() == Opc) {
48966 // Done recursing.
48967 if (Depth++ >= kMaxDepth)
48968 return SDValue();
48969
48970 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
48971 if (SDValue R =
48972 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
48973 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
48974 Op.getOperand(1 - OpIdx));
48975
48976 } else if (Op.getOpcode() == ISD::SUB) {
48977 if (Opc == ISD::AND) {
48978 // BLSI: (and x, (sub 0, x))
48979 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
48980 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
48981 }
48982 // Opc must be ISD::AND or ISD::XOR
48983 // BLSR: (and x, (sub x, 1))
48984 // BLSMSK: (xor x, (sub x, 1))
48985 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
48986 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
48987
48988 } else if (Op.getOpcode() == ISD::ADD) {
48989 // Opc must be ISD::AND or ISD::XOR
48990 // BLSR: (and x, (add x, -1))
48991 // BLSMSK: (xor x, (add x, -1))
48992 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
48993 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
48994 }
48995 return SDValue();
48996}
48997
48998 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
48999 const X86Subtarget &Subtarget) {
49000 EVT VT = N->getValueType(0);
49001 // Make sure this node is a candidate for BMI instructions.
49002 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
49003 (VT != MVT::i32 && VT != MVT::i64))
49004 return SDValue();
49005
49006 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
49007
49008 // Try and match LHS and RHS.
49009 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49010 if (SDValue OpMatch =
49011 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
49012 N->getOperand(1 - OpIdx), 0))
49013 return OpMatch;
49014 return SDValue();
49015}
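
// Illustrative sketch, not part of X86ISelLowering.cpp: the BMI patterns that
// getBMIMatchingOp reassociates towards, checked on 12 == 0b1100:
static_assert((12u & (0u - 12u)) == 4u, "BLSI isolates the lowest set bit");
static_assert((12u & (12u - 1u)) == 8u, "BLSR clears the lowest set bit");
static_assert((12u ^ (12u - 1u)) == 7u, "BLSMSK masks up to the lowest set bit");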
49016
49017 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
49018 TargetLowering::DAGCombinerInfo &DCI,
49019 const X86Subtarget &Subtarget) {
49020 SDValue N0 = N->getOperand(0);
49021 SDValue N1 = N->getOperand(1);
49022 EVT VT = N->getValueType(0);
49023 SDLoc dl(N);
49024 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49025
49026 // If this is SSE1 only convert to FAND to avoid scalarization.
49027 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49028 return DAG.getBitcast(MVT::v4i32,
49029 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
49030 DAG.getBitcast(MVT::v4f32, N0),
49031 DAG.getBitcast(MVT::v4f32, N1)));
49032 }
49033
49034 // Use a 32-bit and+zext if upper bits known zero.
49035 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
49036 APInt HiMask = APInt::getHighBitsSet(64, 32);
49037 if (DAG.MaskedValueIsZero(N1, HiMask) ||
49038 DAG.MaskedValueIsZero(N0, HiMask)) {
49039 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
49040 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
49041 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
49042 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
49043 }
49044 }
49045
49046 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
49047 // TODO: Support multiple SrcOps.
49048 if (VT == MVT::i1) {
49049 SmallVector<SDValue, 2> SrcOps;
49050 SmallVector<APInt, 2> SrcPartials;
49051 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
49052 SrcOps.size() == 1) {
49053 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49054 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49055 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49056 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49057 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49058 if (Mask) {
49059 assert(SrcPartials[0].getBitWidth() == NumElts &&
49060 "Unexpected partial reduction mask");
49061 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49062 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49063 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
49064 }
49065 }
49066 }
49067
49068 // InstCombine converts:
49069 // `(-x << C0) & C1`
49070 // to
49071 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
49072 // This saves an IR instruction but on x86 the neg/shift version is preferable
49073 // so undo the transform.
49074
49075 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
49076 // TODO: We don't actually need a splat for this, we just need the checks to
49077 // hold for each element.
49078 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
49079 /*AllowTruncation*/ false);
49080 ConstantSDNode *N01C =
49081 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
49082 /*AllowTruncation*/ false);
49083 if (N1C && N01C) {
49084 const APInt &MulC = N01C->getAPIntValue();
49085 const APInt &AndC = N1C->getAPIntValue();
49086 APInt MulCLowBit = MulC & (-MulC);
49087 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
49088 (MulCLowBit + MulC).isPowerOf2()) {
49089 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
49090 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
49091 assert(MulCLowBitLog != -1 &&
49092 "Isolated lowbit is somehow not a power of 2!");
49093 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
49094 DAG.getConstant(MulCLowBitLog, dl, VT));
49095 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
49096 }
49097 }
49098 }
49099
49100 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
49101 return V;
49102
49103 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49104 return R;
49105
49106 if (SDValue R = combineBitOpWithShift(N, DAG))
49107 return R;
49108
49109 if (SDValue R = combineBitOpWithPACK(N, DAG))
49110 return R;
49111
49112 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49113 return FPLogic;
49114
49115 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
49116 return R;
49117
49118 if (DCI.isBeforeLegalizeOps())
49119 return SDValue();
49120
49121 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49122 return R;
49123
49124 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
49125 return R;
49126
49127 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
49128 return ShiftRight;
49129
49130 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
49131 return R;
49132
49133 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
49134 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
49135 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
49136 if (VT.isVector() && getTargetConstantFromNode(N1)) {
49137 unsigned Opc0 = N0.getOpcode();
49138 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
49139 getTargetConstantFromNode(N0.getOperand(1)) &&
49140 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
49141 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
49142 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
49143 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
49144 }
49145 }
49146
49147 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
49148 // avoids slow variable shift (moving shift amount to ECX etc.)
49149 if (isOneConstant(N1) && N0->hasOneUse()) {
49150 SDValue Src = N0;
49151 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
49152 Src.getOpcode() == ISD::TRUNCATE) &&
49153 Src.getOperand(0)->hasOneUse())
49154 Src = Src.getOperand(0);
49155 bool ContainsNOT = false;
49156 X86::CondCode X86CC = X86::COND_B;
49157 // Peek through AND(NOT(SRL(X,Y)),1).
49158 if (isBitwiseNot(Src)) {
49159 Src = Src.getOperand(0);
49160 X86CC = X86::COND_AE;
49161 ContainsNOT = true;
49162 }
49163 if (Src.getOpcode() == ISD::SRL &&
49164 !isa<ConstantSDNode>(Src.getOperand(1))) {
49165 SDValue BitNo = Src.getOperand(1);
49166 Src = Src.getOperand(0);
49167 // Peek through AND(SRL(NOT(X),Y),1).
49168 if (isBitwiseNot(Src)) {
49169 Src = Src.getOperand(0);
49170 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
49171 ContainsNOT = true;
49172 }
49173 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
49174 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
49175 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
49176 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
49177 }
49178 }
49179
49180 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
49181 // Attempt to recursively combine a bitmask AND with shuffles.
49182 SDValue Op(N, 0);
49183 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49184 return Res;
49185
49186 // If either operand is a constant mask, then only the elements that aren't
49187 // zero are actually demanded by the other operand.
49188 auto GetDemandedMasks = [&](SDValue Op) {
49189 APInt UndefElts;
49190 SmallVector<APInt> EltBits;
49191 int NumElts = VT.getVectorNumElements();
49192 int EltSizeInBits = VT.getScalarSizeInBits();
49193 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
49194 APInt DemandedElts = APInt::getAllOnes(NumElts);
49195 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
49196 EltBits)) {
49197 DemandedBits.clearAllBits();
49198 DemandedElts.clearAllBits();
49199 for (int I = 0; I != NumElts; ++I) {
49200 if (UndefElts[I]) {
49201 // We can't assume an undef src element gives an undef dst - the
49202 // other src might be zero.
49203 DemandedBits.setAllBits();
49204 DemandedElts.setBit(I);
49205 } else if (!EltBits[I].isZero()) {
49206 DemandedBits |= EltBits[I];
49207 DemandedElts.setBit(I);
49208 }
49209 }
49210 }
49211 return std::make_pair(DemandedBits, DemandedElts);
49212 };
49213 APInt Bits0, Elts0;
49214 APInt Bits1, Elts1;
49215 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
49216 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
49217
49218 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
49219 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
49220 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
49221 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
49222 if (N->getOpcode() != ISD::DELETED_NODE)
49223 DCI.AddToWorklist(N);
49224 return SDValue(N, 0);
49225 }
49226
49227 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
49228 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
49229 if (NewN0 || NewN1)
49230 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
49231 NewN1 ? NewN1 : N1);
49232 }
49233
49234 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
49235 if ((VT.getScalarSizeInBits() % 8) == 0 &&
49236 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
49237 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
49238 SDValue BitMask = N1;
49239 SDValue SrcVec = N0.getOperand(0);
49240 EVT SrcVecVT = SrcVec.getValueType();
49241
49242 // Check that the constant bitmask masks whole bytes.
49243 APInt UndefElts;
49244 SmallVector<APInt, 64> EltBits;
49245 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
49246 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
49247 llvm::all_of(EltBits, [](const APInt &M) {
49248 return M.isZero() || M.isAllOnes();
49249 })) {
49250 unsigned NumElts = SrcVecVT.getVectorNumElements();
49251 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
49252 unsigned Idx = N0.getConstantOperandVal(1);
49253
49254 // Create a root shuffle mask from the byte mask and the extracted index.
49255 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
49256 for (unsigned i = 0; i != Scale; ++i) {
49257 if (UndefElts[i])
49258 continue;
49259 int VecIdx = Scale * Idx + i;
49260 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
49261 }
49262
49263 if (SDValue Shuffle = combineX86ShufflesRecursively(
49264 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
49265 X86::MaxShuffleCombineDepth,
49266 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
49267 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
49268 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
49269 N0.getOperand(1));
49270 }
49271 }
49272
49273 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
49274 return R;
49275
49276 return SDValue();
49277}
49278
49279// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
49280 static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
49281 const X86Subtarget &Subtarget) {
49282 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
49283
49284 MVT VT = N->getSimpleValueType(0);
49285 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49286 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
49287 return SDValue();
49288
49289 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
49290 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
49291 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
49292 return SDValue();
49293
49294 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
49295 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
49296 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
49297 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
49298 return SDValue();
49299
49300 // Attempt to extract constant byte masks.
49301 APInt UndefElts0, UndefElts1;
49302 SmallVector<APInt, 32> EltBits0, EltBits1;
49303 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
49304 /*AllowWholeUndefs*/ false,
49305 /*AllowPartialUndefs*/ false))
49306 return SDValue();
49307 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
49308 /*AllowWholeUndefs*/ false,
49309 /*AllowPartialUndefs*/ false))
49310 return SDValue();
49311
49312 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
49313 // TODO - add UNDEF elts support.
49314 if (UndefElts0[i] || UndefElts1[i])
49315 return SDValue();
49316 if (EltBits0[i] != ~EltBits1[i])
49317 return SDValue();
49318 }
49319
49320 SDLoc DL(N);
49321
49322 if (useVPTERNLOG(Subtarget, VT)) {
49323 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
49324 // VPTERNLOG is only available as vXi32/64-bit types.
49325 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
49326 MVT OpVT =
49327 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
49328 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
49329 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
49330 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
49331 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
49332 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
49333 DAG, Subtarget);
49334 return DAG.getBitcast(VT, Res);
49335 }
49336
49337 SDValue X = N->getOperand(0);
49338 SDValue Y =
49339 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
49340 DAG.getBitcast(VT, N1.getOperand(0)));
49341 return DAG.getNode(ISD::OR, DL, VT, X, Y);
49342}
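
// Illustrative sketch, not part of X86ISelLowering.cpp: VPTERNLOG evaluates
// its immediate as a 3-input truth table indexed by (a<<2)|(b<<1)|c per bit,
// and 0xCA is the table for the bitwise select A ? B : C used above:
constexpr unsigned ternlogBit(unsigned Imm, unsigned A, unsigned B, unsigned C) {
  return (Imm >> ((A << 2) | (B << 1) | C)) & 1;
}
static_assert(ternlogBit(0xCA, 1, 1, 0) == 1, "A=1 selects B");
static_assert(ternlogBit(0xCA, 1, 0, 1) == 0, "A=1 ignores C");
static_assert(ternlogBit(0xCA, 0, 1, 0) == 0, "A=0 ignores B");
static_assert(ternlogBit(0xCA, 0, 0, 1) == 1, "A=0 selects C");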
49343
49344// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
49345static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
49346 if (N->getOpcode() != ISD::OR)
49347 return false;
49348
49349 SDValue N0 = N->getOperand(0);
49350 SDValue N1 = N->getOperand(1);
49351
49352 // Canonicalize AND to LHS.
49353 if (N1.getOpcode() == ISD::AND)
49354 std::swap(N0, N1);
49355
49356 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
49357 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
49358 return false;
49359
49360 Mask = N1.getOperand(0);
49361 X = N1.getOperand(1);
49362
49363 // Check to see if the mask appeared in both the AND and ANDNP.
49364 if (N0.getOperand(0) == Mask)
49365 Y = N0.getOperand(1);
49366 else if (N0.getOperand(1) == Mask)
49367 Y = N0.getOperand(0);
49368 else
49369 return false;
49370
49371 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
49372 // ANDNP combine allows other combines to happen that prevent matching.
49373 return true;
49374}
49375
49376// Try to fold:
49377// (or (and (m, y), (pandn m, x)))
49378// into:
49379// (vselect m, x, y)
49380// As a special case, try to fold:
49381// (or (and (m, (sub 0, x)), (pandn m, x)))
49382// into:
49383// (sub (xor X, M), M)
49384 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
49385 const X86Subtarget &Subtarget) {
49386 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
49387
49388 EVT VT = N->getValueType(0);
49389 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49390 (VT.is256BitVector() && Subtarget.hasInt256())))
49391 return SDValue();
49392
49393 SDValue X, Y, Mask;
49394 if (!matchLogicBlend(N, X, Y, Mask))
49395 return SDValue();
49396
49397 // Validate that X, Y, and Mask are bitcasts, and see through them.
49398 Mask = peekThroughBitcasts(Mask);
49399 X = peekThroughBitcasts(X);
49400 Y = peekThroughBitcasts(Y);
49401
49402 EVT MaskVT = Mask.getValueType();
49403 unsigned EltBits = MaskVT.getScalarSizeInBits();
49404
49405 // TODO: Attempt to handle floating point cases as well?
49406 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
49407 return SDValue();
49408
49409 SDLoc DL(N);
49410
49411 // Attempt to combine to conditional negate: (sub (xor X, M), M)
49412 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
49413 DAG, Subtarget))
49414 return Res;
49415
49416 // PBLENDVB is only available on SSE 4.1.
49417 if (!Subtarget.hasSSE41())
49418 return SDValue();
49419
49420 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
49421 if (Subtarget.hasVLX())
49422 return SDValue();
49423
49424 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
49425
49426 X = DAG.getBitcast(BlendVT, X);
49427 Y = DAG.getBitcast(BlendVT, Y);
49428 Mask = DAG.getBitcast(BlendVT, Mask);
49429 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
49430 return DAG.getBitcast(VT, Mask);
49431}
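
// Illustrative sketch, not part of X86ISelLowering.cpp: the conditional-negate
// special case above uses the identity (X ^ M) - M == (M ? -X : X) when M is
// all-ones or all-zeros per element:
static_assert(((7 ^ -1) - -1) == -7, "all-ones mask negates");
static_assert(((7 ^ 0) - 0) == 7, "zero mask is the identity");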
49432
49433// Helper function for combineOrCmpEqZeroToCtlzSrl
49434// Transforms:
49435// seteq(cmp x, 0)
49436// into:
49437// srl(ctlz x), log2(bitsize(x))
49438// Input pattern is checked by caller.
49439 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
49440 SDValue Cmp = Op.getOperand(1);
49441 EVT VT = Cmp.getOperand(0).getValueType();
49442 unsigned Log2b = Log2_32(VT.getSizeInBits());
49443 SDLoc dl(Op);
49444 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
49445 // The result of the shift is true or false, and on X86, the 32-bit
49446 // encoding of shr and lzcnt is more desirable.
49447 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
49448 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
49449 DAG.getConstant(Log2b, dl, MVT::i8));
49450 return Scc;
49451}
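
// Illustrative sketch, not part of X86ISelLowering.cpp: lzcnt of a 32-bit
// value is 32 exactly when the value is zero, so bit 5 of the count (the
// srl by log2(bitsize)) is the "x == 0" predicate. A recursive constexpr
// model of lzcnt (hypothetical helper, not the LZCNT intrinsic):
constexpr unsigned lzcnt32(unsigned X, unsigned N = 0) {
  return N == 32 ? 32 : ((X & (1u << (31 - N))) ? N : lzcnt32(X, N + 1));
}
static_assert((lzcnt32(0u) >> 5) == 1u, "x == 0 gives 1");
static_assert((lzcnt32(1u) >> 5) == 0u, "x != 0 gives 0");
static_assert((lzcnt32(0x80000000u) >> 5) == 0u, "x != 0 gives 0");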
49452
49453// Try to transform:
49454// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
49455// into:
49456 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
49457// Will also attempt to match more generic cases, eg:
49458// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
49459// Only applies if the target supports the FastLZCNT feature.
49460 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
49461 TargetLowering::DAGCombinerInfo &DCI,
49462 const X86Subtarget &Subtarget) {
49463 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
49464 return SDValue();
49465
49466 auto isORCandidate = [](SDValue N) {
49467 return (N->getOpcode() == ISD::OR && N->hasOneUse());
49468 };
49469
49470 // Check the zero extend is extending to 32-bit or more. The code generated by
49471 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
49472 // instructions to clear the upper bits.
49473 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
49474 !isORCandidate(N->getOperand(0)))
49475 return SDValue();
49476
49477 // Check the node matches: setcc(eq, cmp 0)
49478 auto isSetCCCandidate = [](SDValue N) {
49479 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
49480 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
49481 N->getOperand(1).getOpcode() == X86ISD::CMP &&
49482 isNullConstant(N->getOperand(1).getOperand(1)) &&
49483 N->getOperand(1).getValueType().bitsGE(MVT::i32);
49484 };
49485
49486 SDNode *OR = N->getOperand(0).getNode();
49487 SDValue LHS = OR->getOperand(0);
49488 SDValue RHS = OR->getOperand(1);
49489
49490 // Save nodes matching or(or, setcc(eq, cmp 0)).
49491 SmallVector<SDNode *, 2> ORNodes;
49492 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
49493 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
49494 ORNodes.push_back(OR);
49495 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
49496 LHS = OR->getOperand(0);
49497 RHS = OR->getOperand(1);
49498 }
49499
49500 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
49501 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
49502 !isORCandidate(SDValue(OR, 0)))
49503 return SDValue();
49504
49505 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
49506 // to
49507 // or(srl(ctlz),srl(ctlz)).
49508 // The dag combiner can then fold it into:
49509 // srl(or(ctlz, ctlz)).
49510 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
49511 SDValue Ret, NewRHS;
49512 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
49513 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
49514
49515 if (!Ret)
49516 return SDValue();
49517
49518 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
49519 while (!ORNodes.empty()) {
49520 OR = ORNodes.pop_back_val();
49521 LHS = OR->getOperand(0);
49522 RHS = OR->getOperand(1);
49523 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
49524 if (RHS->getOpcode() == ISD::OR)
49525 std::swap(LHS, RHS);
49526 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
49527 if (!NewRHS)
49528 return SDValue();
49529 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
49530 }
49531
49532 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
49533}
49534
49535static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
49536 SDValue And1_L, SDValue And1_R,
49537 const SDLoc &DL, SelectionDAG &DAG) {
49538 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
49539 return SDValue();
49540 SDValue NotOp = And0_L->getOperand(0);
49541 if (NotOp == And1_R)
49542 std::swap(And1_R, And1_L);
49543 if (NotOp != And1_L)
49544 return SDValue();
49545
49546 // (~(NotOp) & And0_R) | (NotOp & And1_R)
49547 // --> ((And0_R ^ And1_R) & NotOp) ^ And0_R
49548 EVT VT = And1_L->getValueType(0);
49549 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
49550 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
49551 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
49552 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
49553 return Xor1;
49554}
49555
49556/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
49557/// equivalent `((x ^ y) & m) ^ y` pattern.
49558/// This is typically a better representation for targets without a fused
49559/// "and-not" operation. This function is intended to be called from a
49560/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
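/// For example (a sketch of the underlying identity, not of the DAG nodes):
///   (m & x) | (~m & y) == ((x ^ y) & m) ^ y
/// since for set mask bits the result is (x ^ y) ^ y == x, and for clear mask
/// bits it is 0 ^ y == y.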
49561static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
49562 // Note that masked-merge variants using XOR or ADD expressions are
49563 // normalized to OR by InstCombine, so we only check for OR.
49564 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
49565 SDValue N0 = Node->getOperand(0);
49566 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
49567 return SDValue();
49568 SDValue N1 = Node->getOperand(1);
49569 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
49570 return SDValue();
49571
49572 SDLoc DL(Node);
49573 SDValue N00 = N0->getOperand(0);
49574 SDValue N01 = N0->getOperand(1);
49575 SDValue N10 = N1->getOperand(0);
49576 SDValue N11 = N1->getOperand(1);
49577 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
49578 return Result;
49579 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
49580 return Result;
49581 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
49582 return Result;
49583 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
49584 return Result;
49585 return SDValue();
49586}
49587
49588/// If this is an add or subtract where one operand is produced by a cmp+setcc,
49589/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49590/// with CMP+{ADC, SBB}.
49591/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
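/// For example, "x + (a < b)" with unsigned a and b is typically emitted as a
/// CMP (which sets CF to a < b) followed by "adc x, 0", and "x - (a < b)" as
/// "sbb x, 0"; this is only a sketch of the intent, the exact condition-code
/// handling follows below.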
49592static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
49593 SDValue X, SDValue Y,
49594 SelectionDAG &DAG,
49595 bool ZeroSecondOpOnly = false) {
49596 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
49597 return SDValue();
49598
49599 // Look through a one-use zext.
49600 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
49601 Y = Y.getOperand(0);
49602
49603 X86::CondCode CC = X86::COND_INVALID;
49604 SDValue EFLAGS;
49605 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
49606 CC = (X86::CondCode)Y.getConstantOperandVal(0);
49607 EFLAGS = Y.getOperand(1);
49608 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
49609 Y.hasOneUse()) {
49610 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
49611 }
49612
49613 if (!EFLAGS)
49614 return SDValue();
49615
49616 // If X is -1 or 0, then we have an opportunity to avoid constants required in
49617 // the general case below.
49618 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
49619 if (ConstantX && !ZeroSecondOpOnly) {
49620 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
49621 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
49622 // This is a complicated way to get -1 or 0 from the carry flag:
49623 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49624 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49625 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49626 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49627 EFLAGS);
49628 }
49629
49630 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
49631 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
49632 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
49633 EFLAGS.getValueType().isInteger() &&
49634 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49635 // Swap the operands of a SUB, and we have the same pattern as above.
49636 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
49637 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
49638 SDValue NewSub = DAG.getNode(
49639 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49640 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49641 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
49642 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49643 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49644 NewEFLAGS);
49645 }
49646 }
49647 }
49648
49649 if (CC == X86::COND_B) {
49650 // X + SETB Z --> adc X, 0
49651 // X - SETB Z --> sbb X, 0
49652 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49653 DAG.getVTList(VT, MVT::i32), X,
49654 DAG.getConstant(0, DL, VT), EFLAGS);
49655 }
49656
49657 if (ZeroSecondOpOnly)
49658 return SDValue();
49659
49660 if (CC == X86::COND_A) {
49661 // Try to convert COND_A into COND_B in an attempt to facilitate
49662 // materializing "setb reg".
49663 //
49664 // Do not flip "e > c", where "c" is a constant, because the CMP instruction
49665 // cannot take an immediate as its first operand.
49666 //
49667 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49668 EFLAGS.getValueType().isInteger() &&
49669 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49670 SDValue NewSub =
49671 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49672 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49673 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49674 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49675 DAG.getVTList(VT, MVT::i32), X,
49676 DAG.getConstant(0, DL, VT), NewEFLAGS);
49677 }
49678 }
49679
49680 if (CC == X86::COND_AE) {
49681 // X + SETAE --> sbb X, -1
49682 // X - SETAE --> adc X, -1
49683 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49684 DAG.getVTList(VT, MVT::i32), X,
49685 DAG.getConstant(-1, DL, VT), EFLAGS);
49686 }
49687
49688 if (CC == X86::COND_BE) {
49689 // X + SETBE --> sbb X, -1
49690 // X - SETBE --> adc X, -1
49691 // Try to convert COND_BE into COND_AE in an attempt to facilitate
49692 // materializing "setae reg".
49693 //
49694 // Do not flip "e <= c", where "c" is a constant, because the CMP instruction
49695 // cannot take an immediate as its first operand.
49696 //
49697 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49698 EFLAGS.getValueType().isInteger() &&
49699 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49700 SDValue NewSub =
49701 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49702 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49703 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49704 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49705 DAG.getVTList(VT, MVT::i32), X,
49706 DAG.getConstant(-1, DL, VT), NewEFLAGS);
49707 }
49708 }
49709
49710 if (CC != X86::COND_E && CC != X86::COND_NE)
49711 return SDValue();
49712
49713 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
49714 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
49715 !EFLAGS.getOperand(0).getValueType().isInteger())
49716 return SDValue();
49717
49718 SDValue Z = EFLAGS.getOperand(0);
49719 EVT ZVT = Z.getValueType();
49720
49721 // If X is -1 or 0, then we have an opportunity to avoid constants required in
49722 // the general case below.
49723 if (ConstantX) {
49724 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
49725 // fake operands:
49726 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
49727 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
49728 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
49729 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
49730 SDValue Zero = DAG.getConstant(0, DL, ZVT);
49731 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49732 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
49733 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49734 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49735 SDValue(Neg.getNode(), 1));
49736 }
49737
49738 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
49739 // with fake operands:
49740 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
49741 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
49742 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
49743 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
49744 SDValue One = DAG.getConstant(1, DL, ZVT);
49745 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49746 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49747 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49748 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49749 Cmp1.getValue(1));
49750 }
49751 }
49752
49753 // (cmp Z, 1) sets the carry flag if Z is 0.
49754 SDValue One = DAG.getConstant(1, DL, ZVT);
49755 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49756 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49757
49758 // Add the flags type for ADC/SBB nodes.
49759 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49760
49761 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
49762 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
49763 if (CC == X86::COND_NE)
49764 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
49765 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
49766
49767 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
49768 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
49769 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
49770 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
49771}
49772
49773/// If this is an add or subtract where one operand is produced by a cmp+setcc,
49774/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49775/// with CMP+{ADC, SBB}.
49777 bool IsSub = N->getOpcode() == ISD::SUB;
49778 SDValue X = N->getOperand(0);
49779 SDValue Y = N->getOperand(1);
49780 EVT VT = N->getValueType(0);
49781 SDLoc DL(N);
49782
49783 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
49784 return ADCOrSBB;
49785
49786 // Commute and try again (negate the result for subtracts).
49787 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
49788 if (IsSub)
49789 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
49790 return ADCOrSBB;
49791 }
49792
49793 return SDValue();
49794}
49795
49796static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
49797 SelectionDAG &DAG) {
49798 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
49799 "Unexpected opcode");
49800
49801 // Delegate to combineAddOrSubToADCOrSBB if we have:
49802 //
49803 // (xor/or (zero_extend (setcc)) imm)
49804 //
49805 // where imm is odd if and only if we have xor, in which case the XOR/OR are
49806 // equivalent to a SUB/ADD, respectively.
49807 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
49808 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
49809 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
49810 bool IsSub = N->getOpcode() == ISD::XOR;
49811 bool N1COdd = N1C->getZExtValue() & 1;
49812 if (IsSub ? N1COdd : !N1COdd) {
49813 SDLoc DL(N);
49814 EVT VT = N->getValueType(0);
49815 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
49816 return R;
49817 }
49818 }
49819 }
49820
49821 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
49822 if (N->getOpcode() == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
49823 N0.getOperand(0).getOpcode() == ISD::AND &&
49826 MVT VT = N->getSimpleValueType(0);
49827 APInt UndefElts;
49828 SmallVector<APInt> EltBits;
49830 VT.getScalarSizeInBits(), UndefElts,
49831 EltBits)) {
49832 bool IsPow2OrUndef = true;
49833 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
49834 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
49835
49836 if (IsPow2OrUndef)
49837 return DAG.getNode(X86ISD::PCMPEQ, SDLoc(N), VT, N0.getOperand(0),
49838 N0.getOperand(0).getOperand(1));
49839 }
49840 }
49841
49842 return SDValue();
49843}
49844
49845static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
49846 TargetLowering::DAGCombinerInfo &DCI,
49847 const X86Subtarget &Subtarget) {
49848 SDValue N0 = N->getOperand(0);
49849 SDValue N1 = N->getOperand(1);
49850 EVT VT = N->getValueType(0);
49851 SDLoc dl(N);
49852 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49853
49854 // If this is SSE1-only, convert to FOR to avoid scalarization.
49855 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49856 return DAG.getBitcast(MVT::v4i32,
49857 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
49858 DAG.getBitcast(MVT::v4f32, N0),
49859 DAG.getBitcast(MVT::v4f32, N1)));
49860 }
49861
49862 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
49863 // TODO: Support multiple SrcOps.
49864 if (VT == MVT::i1) {
49865 SmallVector<SDValue, 2> SrcOps;
49866 SmallVector<APInt, 2> SrcPartials;
49867 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
49868 SrcOps.size() == 1) {
49869 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49870 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49871 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49872 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49873 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49874 if (Mask) {
49875 assert(SrcPartials[0].getBitWidth() == NumElts &&
49876 "Unexpected partial reduction mask");
49877 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
49878 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49879 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49880 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
49881 }
49882 }
49883 }
49884
49885 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49886 return R;
49887
49888 if (SDValue R = combineBitOpWithShift(N, DAG))
49889 return R;
49890
49891 if (SDValue R = combineBitOpWithPACK(N, DAG))
49892 return R;
49893
49894 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49895 return FPLogic;
49896
49897 if (DCI.isBeforeLegalizeOps())
49898 return SDValue();
49899
49900 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49901 return R;
49902
49903 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
49904 return R;
49905
49906 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
49907 return R;
49908
49909 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
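  // The identity holds because SetCC is 0 or 1: if SetCC == 1 then
  // (0 - 1) | C == -1 == 0 * (C + 1) - 1, and if SetCC == 0 then
  // 0 | C == C == 1 * (C + 1) - 1.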
49910 if ((VT == MVT::i32 || VT == MVT::i64) &&
49911 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
49912 isNullConstant(N0.getOperand(0))) {
49913 SDValue Cond = N0.getOperand(1);
49914 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
49915 Cond = Cond.getOperand(0);
49916
49917 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
49918 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
49919 uint64_t Val = CN->getZExtValue();
49920 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
49921 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
49922 CCode = X86::GetOppositeBranchCondition(CCode);
49923 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
49924
49925 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
49926 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
49927 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
49928 return R;
49929 }
49930 }
49931 }
49932 }
49933
49934 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
49935 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
49936 // iff the upper elements of the non-shifted arg are zero.
49937 // KUNPCK requires 16+ bool vector elements.
49938 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
49939 unsigned NumElts = VT.getVectorNumElements();
49940 unsigned HalfElts = NumElts / 2;
49941 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
49942 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
49943 N1.getConstantOperandAPInt(1) == HalfElts &&
49944 DAG.MaskedVectorIsZero(N0, UpperElts)) {
49945 return DAG.getNode(
49946 ISD::CONCAT_VECTORS, dl, VT,
49947 extractSubVector(N0, 0, DAG, dl, HalfElts),
49948 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
49949 }
49950 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
49951 N0.getConstantOperandAPInt(1) == HalfElts &&
49952 DAG.MaskedVectorIsZero(N1, UpperElts)) {
49953 return DAG.getNode(
49954 ISD::CONCAT_VECTORS, dl, VT,
49955 extractSubVector(N1, 0, DAG, dl, HalfElts),
49956 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
49957 }
49958 }
49959
49960 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
49961 // Attempt to recursively combine an OR of shuffles.
49962 SDValue Op(N, 0);
49963 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49964 return Res;
49965
49966 // If either operand is a constant mask, then only the elements that aren't
49967 // allones are actually demanded by the other operand.
49968 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
49969 APInt UndefElts;
49970 SmallVector<APInt> EltBits;
49971 int NumElts = VT.getVectorNumElements();
49972 int EltSizeInBits = VT.getScalarSizeInBits();
49973 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
49974 return false;
49975
49976 APInt DemandedElts = APInt::getZero(NumElts);
49977 for (int I = 0; I != NumElts; ++I)
49978 if (!EltBits[I].isAllOnes())
49979 DemandedElts.setBit(I);
49980
49981 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
49982 };
49983 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
49984 if (N->getOpcode() != ISD::DELETED_NODE)
49985 DCI.AddToWorklist(N);
49986 return SDValue(N, 0);
49987 }
49988 }
49989
49990 // We should fold "masked merge" patterns when `andn` is not available.
49991 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
49992 if (SDValue R = foldMaskedMerge(N, DAG))
49993 return R;
49994
49995 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
49996 return R;
49997
49998 return SDValue();
49999}
50000
50001/// Try to turn tests against the signbit in the form of:
50002/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
50003/// into:
50004/// SETGT(X, -1)
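/// For example, for i32 this rewrites "((unsigned)x >> 31) ^ 1" (i.e. "is the
/// sign bit clear") into the equivalent signed comparison "x > -1"; a sketch
/// of the intent rather than of the exact nodes produced below.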
50005static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
50006 // This is only worth doing if the output type is i8 or i1.
50007 EVT ResultType = N->getValueType(0);
50008 if (ResultType != MVT::i8 && ResultType != MVT::i1)
50009 return SDValue();
50010
50011 SDValue N0 = N->getOperand(0);
50012 SDValue N1 = N->getOperand(1);
50013
50014 // We should be performing an xor against a truncated shift.
50015 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
50016 return SDValue();
50017
50018 // Make sure we are performing an xor against one.
50019 if (!isOneConstant(N1))
50020 return SDValue();
50021
50022 // SetCC on x86 zero extends so only act on this if it's a logical shift.
50023 SDValue Shift = N0.getOperand(0);
50024 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
50025 return SDValue();
50026
50027 // Make sure we are truncating from one of i16, i32 or i64.
50028 EVT ShiftTy = Shift.getValueType();
50029 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
50030 return SDValue();
50031
50032 // Make sure the shift amount extracts the sign bit.
50033 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
50034 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
50035 return SDValue();
50036
50037 // Create a greater-than comparison against -1.
50038 // N.B. Using SETGE against 0 works but we want a canonical-looking
50039 // comparison; using SETGT matches up with what TranslateX86CC does.
50040 SDLoc DL(N);
50041 SDValue ShiftOp = Shift.getOperand(0);
50042 EVT ShiftOpTy = ShiftOp.getValueType();
50043 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50044 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
50045 *DAG.getContext(), ResultType);
50046 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
50047 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
50048 if (SetCCResultType != ResultType)
50049 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
50050 return Cond;
50051}
50052
50053/// Turn vector tests of the signbit in the form of:
50054/// xor (sra X, elt_size(X)-1), -1
50055/// into:
50056/// pcmpgt X, -1
50057///
50058/// This should be called before type legalization because the pattern may not
50059/// persist after that.
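/// For example, for v4i32 the pattern "xor (sra X, 31), -1" produces a
/// per-lane "X >= 0" mask, which a single PCMPGTD against an all-ones vector
/// computes directly (an illustrative sketch; the checks below do the actual
/// matching).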
50060static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
50061 const X86Subtarget &Subtarget) {
50062 EVT VT = N->getValueType(0);
50063 if (!VT.isSimple())
50064 return SDValue();
50065
50066 switch (VT.getSimpleVT().SimpleTy) {
50067 // clang-format off
50068 default: return SDValue();
50069 case MVT::v16i8:
50070 case MVT::v8i16:
50071 case MVT::v4i32:
50072 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
50073 case MVT::v32i8:
50074 case MVT::v16i16:
50075 case MVT::v8i32:
50076 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
50077 // clang-format on
50078 }
50079
50080 // There must be a shift right algebraic before the xor, and the xor must be a
50081 // 'not' operation.
50082 SDValue Shift = N->getOperand(0);
50083 SDValue Ones = N->getOperand(1);
50084 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
50085 !ISD::isBuildVectorAllOnes(Ones.getNode()))
50086 return SDValue();
50087
50088 // The shift should be smearing the sign bit across each vector element.
50089 auto *ShiftAmt =
50090 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
50091 if (!ShiftAmt ||
50092 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
50093 return SDValue();
50094
50095 // Create a greater-than comparison against -1. We don't use the more obvious
50096 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
50097 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
50098}
50099
50100/// Detect patterns of truncation with unsigned saturation:
50101///
50102/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
50103/// Return the source value x to be truncated or SDValue() if the pattern was
50104/// not matched.
50105///
50106/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
50107/// where C1 >= 0 and C2 is unsigned max of destination type.
50108///
50109/// (truncate (smax (smin (x, C2), C1)) to dest_type)
50110/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
50111///
50112/// These two patterns are equivalent to:
50113/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
50114/// So return the smax(x, C1) value to be truncated or SDValue() if the
50115/// pattern was not matched.
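/// For example, when truncating i32 to i16 the first form looks like
///   trunc(umin(x, 65535))
/// and the second like
///   trunc(smin(smax(x, 0), 65535)),
/// i.e. the value is clamped into the unsigned i16 range before the truncate.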
50116static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50117 const SDLoc &DL) {
50118 EVT InVT = In.getValueType();
50119
50120 // Saturation with truncation. We truncate from InVT to VT.
50121 assert(VT.getScalarSizeInBits() < InVT.getScalarSizeInBits() &&
50122 "Unexpected types for truncate operation");
50123
50124 // Match min/max and return limit value as a parameter.
50125 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
50126 if (V.getOpcode() == Opcode &&
50127 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
50128 return V.getOperand(0);
50129 return SDValue();
50130 };
50131
50132 APInt C1, C2;
50133 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
50134 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
50135 // the element size of the destination type.
50136 if (C2.isMask(VT.getScalarSizeInBits()))
50137 return UMin;
50138
50139 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
50140 if (MatchMinMax(SMin, ISD::SMAX, C1))
50141 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
50142 return SMin;
50143
50144 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
50145 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
50146 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
50147 C2.uge(C1)) {
50148 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
50149 }
50150
50151 return SDValue();
50152}
50153
50154/// Detect patterns of truncation with signed saturation:
50155/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
50156/// signed_max_of_dest_type)) to dest_type)
50157/// or:
50158/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
50159/// signed_min_of_dest_type)) to dest_type).
50160/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
50161/// Return the source value to be truncated or SDValue() if the pattern was not
50162/// matched.
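/// For example, truncating i32 to i8 with signed saturation matches
///   trunc(smin(smax(x, -128), 127)),
/// and with MatchPackUS the unsigned-result form matches
///   trunc(smin(smax(x, 0), 255)).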
50163static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
50164 unsigned NumDstBits = VT.getScalarSizeInBits();
50165 unsigned NumSrcBits = In.getScalarValueSizeInBits();
50166 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
50167
50168 auto MatchMinMax = [](SDValue V, unsigned Opcode,
50169 const APInt &Limit) -> SDValue {
50170 APInt C;
50171 if (V.getOpcode() == Opcode &&
50172 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
50173 return V.getOperand(0);
50174 return SDValue();
50175 };
50176
50177 APInt SignedMax, SignedMin;
50178 if (MatchPackUS) {
50179 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
50180 SignedMin = APInt(NumSrcBits, 0);
50181 } else {
50182 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
50183 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
50184 }
50185
50186 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
50187 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
50188 return SMax;
50189
50190 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
50191 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
50192 return SMin;
50193
50194 return SDValue();
50195}
50196
50198 SelectionDAG &DAG,
50199 const X86Subtarget &Subtarget) {
50200 if (!Subtarget.hasSSE2() || !VT.isVector())
50201 return SDValue();
50202
50203 EVT SVT = VT.getVectorElementType();
50204 EVT InVT = In.getValueType();
50205 EVT InSVT = InVT.getVectorElementType();
50206
50207 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
50208 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
50209 // and concatenate at the same time. Then we can use a final vpmovuswb to
50210 // clip to 0-255.
50211 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
50212 InVT == MVT::v16i32 && VT == MVT::v16i8) {
50213 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
50214 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
50215 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
50216 DL, DAG, Subtarget);
50217 assert(Mid && "Failed to pack!");
50218 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
50219 }
50220 }
50221
50222 // vXi32 truncate instructions are available with AVX512F.
50223 // vXi16 truncate instructions are only available with AVX512BW.
50224 // For 256-bit or smaller vectors, we require VLX.
50225 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
50226 // If the result type is 256 bits or larger and we have disabled 512-bit
50227 // registers, we should go ahead and use the pack instructions if possible.
50228 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
50229 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
50230 (InVT.getSizeInBits() > 128) &&
50231 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
50232 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
50233
50234 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
50236 (SVT == MVT::i8 || SVT == MVT::i16) &&
50237 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
50238 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
50239 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
50240 if (SVT == MVT::i8 && InSVT == MVT::i32) {
50241 EVT MidVT = VT.changeVectorElementType(MVT::i16);
50242 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
50243 DAG, Subtarget);
50244 assert(Mid && "Failed to pack!");
50246 Subtarget);
50247 assert(V && "Failed to pack!");
50248 return V;
50249 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
50250 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
50251 Subtarget);
50252 }
50253 if (SDValue SSatVal = detectSSatPattern(In, VT))
50254 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
50255 Subtarget);
50256 }
50257
50258 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50259 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
50260 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
50261 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
50262 unsigned TruncOpc = 0;
50263 SDValue SatVal;
50264 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
50265 SatVal = SSatVal;
50266 TruncOpc = X86ISD::VTRUNCS;
50267 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
50268 SatVal = USatVal;
50269 TruncOpc = X86ISD::VTRUNCUS;
50270 }
50271 if (SatVal) {
50272 unsigned ResElts = VT.getVectorNumElements();
50273 // If the input type is less than 512 bits and we don't have VLX, we need
50274 // to widen to 512 bits.
50275 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
50276 unsigned NumConcats = 512 / InVT.getSizeInBits();
50277 ResElts *= NumConcats;
50278 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
50279 ConcatOps[0] = SatVal;
50280 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
50281 NumConcats * InVT.getVectorNumElements());
50282 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
50283 }
50284 // Widen the result if its narrower than 128 bits.
50285 if (ResElts * SVT.getSizeInBits() < 128)
50286 ResElts = 128 / SVT.getSizeInBits();
50287 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
50288 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
50289 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
50290 DAG.getIntPtrConstant(0, DL));
50291 }
50292 }
50293
50294 return SDValue();
50295}
50296
50297/// This function detects the AVG pattern between vectors of unsigned i8/i16,
50298/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
50299/// ISD::AVGCEILU (AVG) instruction.
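/// For example, with <16 x i8> inputs the per-lane computation
///   c = (uint8_t)(((unsigned)a + (unsigned)b + 1) >> 1)
/// corresponds to a single PAVGB; the zero-extension, the add of 1 and the
/// logical shift right are what the matching below looks for.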
50300static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50301 const X86Subtarget &Subtarget,
50302 const SDLoc &DL) {
50303 if (!VT.isVector())
50304 return SDValue();
50305 EVT InVT = In.getValueType();
50306 unsigned NumElems = VT.getVectorNumElements();
50307
50308 EVT ScalarVT = VT.getVectorElementType();
50309 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
50310 return SDValue();
50311
50312 // InScalarVT is the intermediate type in the AVG pattern and it should be
50313 // greater than the original input type (i8/i16).
50314 EVT InScalarVT = InVT.getVectorElementType();
50315 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
50316 return SDValue();
50317
50318 if (!Subtarget.hasSSE2())
50319 return SDValue();
50320
50321 // Detect the following pattern:
50322 //
50323 // %1 = zext <N x i8> %a to <N x i32>
50324 // %2 = zext <N x i8> %b to <N x i32>
50325 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
50326 // %4 = add nuw nsw <N x i32> %3, %2
50327 // %5 = lshr <N x i32> %4, <i32 1 x N>
50328 // %6 = trunc <N x i32> %5 to <N x i8>
50329 //
50330 // In AVX512, the last instruction can also be a trunc store.
50331 if (In.getOpcode() != ISD::SRL)
50332 return SDValue();
50333
50334 // A lambda checking the given SDValue is a constant vector and each element
50335 // is in the range [Min, Max].
50336 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
50337 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
50338 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
50339 });
50340 };
50341
50342 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
50343 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
50344 return MaxActiveBits <= ScalarVT.getSizeInBits();
50345 };
50346
50347 // Check if each element of the vector is right-shifted by one.
50348 SDValue LHS = In.getOperand(0);
50349 SDValue RHS = In.getOperand(1);
50350 if (!IsConstVectorInRange(RHS, 1, 1))
50351 return SDValue();
50352 if (LHS.getOpcode() != ISD::ADD)
50353 return SDValue();
50354
50355 // Detect a pattern of a + b + 1 where the order doesn't matter.
50356 SDValue Operands[3];
50357 Operands[0] = LHS.getOperand(0);
50358 Operands[1] = LHS.getOperand(1);
50359
50360 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
50361 ArrayRef<SDValue> Ops) {
50362 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
50363 };
50364
50365 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
50366 for (SDValue &Op : Ops)
50367 if (Op.getValueType() != VT)
50368 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
50369 // Pad to a power-of-2 vector, split+apply and extract the original vector.
50370 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
50371 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
50372 if (NumElemsPow2 != NumElems) {
50373 for (SDValue &Op : Ops) {
50374 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
50375 for (unsigned i = 0; i != NumElems; ++i) {
50376 SDValue Idx = DAG.getIntPtrConstant(i, DL);
50377 EltsOfOp[i] =
50378 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
50379 }
50380 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
50381 }
50382 }
50383 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
50384 if (NumElemsPow2 == NumElems)
50385 return Res;
50386 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
50387 DAG.getIntPtrConstant(0, DL));
50388 };
50389
50390 // Take care of the case when one of the operands is a constant vector whose
50391 // element is in the range [1, 256].
50392 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
50393 IsZExtLike(Operands[0])) {
50394 // The pattern is detected. Subtract one from the constant vector, then
50395 // demote it and emit the X86ISD::AVG instruction.
50396 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
50397 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
50398 return AVGSplitter({Operands[0], Operands[1]});
50399 }
50400
50401 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
50402 // Match the or case only if it's 'add-like', i.e. it can be replaced by an add.
50403 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
50404 if (ISD::ADD == V.getOpcode()) {
50405 Op0 = V.getOperand(0);
50406 Op1 = V.getOperand(1);
50407 return true;
50408 }
50409 if (ISD::ZERO_EXTEND != V.getOpcode())
50410 return false;
50411 V = V.getOperand(0);
50412 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
50413 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
50414 return false;
50415 Op0 = V.getOperand(0);
50416 Op1 = V.getOperand(1);
50417 return true;
50418 };
50419
50420 SDValue Op0, Op1;
50421 if (FindAddLike(Operands[0], Op0, Op1))
50422 std::swap(Operands[0], Operands[1]);
50423 else if (!FindAddLike(Operands[1], Op0, Op1))
50424 return SDValue();
50425 Operands[2] = Op0;
50426 Operands[1] = Op1;
50427
50428 // Now we have three operands of two additions. Check that one of them is a
50429 // constant vector with ones, and the other two can be promoted from i8/i16.
50430 for (SDValue &Op : Operands) {
50431 if (!IsConstVectorInRange(Op, 1, 1))
50432 continue;
50433 std::swap(Op, Operands[2]);
50434
50435 // Check if Operands[0] and Operands[1] are results of type promotion.
50436 for (int j = 0; j < 2; ++j)
50437 if (Operands[j].getValueType() != VT)
50438 if (!IsZExtLike(Operands[j]))
50439 return SDValue();
50440
50441 // The pattern is detected; emit the X86ISD::AVG instruction(s).
50442 return AVGSplitter({Operands[0], Operands[1]});
50443 }
50444
50445 return SDValue();
50446}
50447
50450 const X86Subtarget &Subtarget) {
50451 LoadSDNode *Ld = cast<LoadSDNode>(N);
50452 EVT RegVT = Ld->getValueType(0);
50453 EVT MemVT = Ld->getMemoryVT();
50454 SDLoc dl(Ld);
50455 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50456
50457 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
50458 // into two 16-byte operations. Also split non-temporal aligned loads on
50459 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
50461 unsigned Fast;
50462 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
50463 Ext == ISD::NON_EXTLOAD &&
50464 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
50465 Ld->getAlign() >= Align(16)) ||
50466 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
50467 *Ld->getMemOperand(), &Fast) &&
50468 !Fast))) {
50469 unsigned NumElems = RegVT.getVectorNumElements();
50470 if (NumElems < 2)
50471 return SDValue();
50472
50473 unsigned HalfOffset = 16;
50474 SDValue Ptr1 = Ld->getBasePtr();
50475 SDValue Ptr2 =
50476 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
50477 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
50478 NumElems / 2);
50479 SDValue Load1 =
50480 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
50481 Ld->getOriginalAlign(),
50482 Ld->getMemOperand()->getFlags());
50483 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
50484 Ld->getPointerInfo().getWithOffset(HalfOffset),
50485 Ld->getOriginalAlign(),
50486 Ld->getMemOperand()->getFlags());
50487 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
50488 Load1.getValue(1), Load2.getValue(1));
50489
50490 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
50491 return DCI.CombineTo(N, NewVec, TF, true);
50492 }
50493
50494 // Bool vector load - attempt to cast to an integer, as we have good
50495 // (vXiY *ext(vXi1 bitcast(iX))) handling.
50496 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
50497 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
50498 unsigned NumElts = RegVT.getVectorNumElements();
50499 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50500 if (TLI.isTypeLegal(IntVT)) {
50501 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
50502 Ld->getPointerInfo(),
50503 Ld->getOriginalAlign(),
50504 Ld->getMemOperand()->getFlags());
50505 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
50506 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
50507 }
50508 }
50509
50510 // If we also load/broadcast this to a wider type, then just extract the
50511 // lowest subvector.
50512 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
50513 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
50514 SDValue Ptr = Ld->getBasePtr();
50515 SDValue Chain = Ld->getChain();
50516 for (SDNode *User : Chain->uses()) {
50517 auto *UserLd = dyn_cast<MemSDNode>(User);
50518 if (User != N && UserLd &&
50519 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
50520 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
50522 UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&
50523 User->getValueSizeInBits(0).getFixedValue() >
50524 RegVT.getFixedSizeInBits()) {
50525 if (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
50526 UserLd->getBasePtr() == Ptr &&
50527 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits()) {
50528 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
50529 RegVT.getSizeInBits());
50530 Extract = DAG.getBitcast(RegVT, Extract);
50531 return DCI.CombineTo(N, Extract, SDValue(User, 1));
50532 }
50533 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
50534 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
50535 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
50536 if (Undefs[I])
50537 continue;
50538 if (UserUndefs[I] || Bits[I] != UserBits[I])
50539 return false;
50540 }
50541 return true;
50542 };
50543 // See if we are loading a constant that matches in the lower
50544 // bits of a longer constant (but from a different constant pool ptr).
50545 EVT UserVT = User->getValueType(0);
50546 SDValue UserPtr = UserLd->getBasePtr();
50548 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
50549 if (LdC && UserC && UserPtr != Ptr) {
50550 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
50551 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
50552 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
50553 APInt Undefs, UserUndefs;
50554 SmallVector<APInt> Bits, UserBits;
50555 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
50556 UserVT.getScalarSizeInBits());
50557 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
50558 Bits) &&
50560 UserUndefs, UserBits)) {
50561 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
50562 SDValue Extract = extractSubVector(
50563 SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits());
50564 Extract = DAG.getBitcast(RegVT, Extract);
50565 return DCI.CombineTo(N, Extract, SDValue(User, 1));
50566 }
50567 }
50568 }
50569 }
50570 }
50571 }
50572 }
50573
50574 // Cast ptr32 and ptr64 pointers to the default address space before a load.
50575 unsigned AddrSpace = Ld->getAddressSpace();
50576 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
50577 AddrSpace == X86AS::PTR32_UPTR) {
50578 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
50579 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
50580 SDValue Cast =
50581 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
50582 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
50583 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
50584 Ld->getMemOperand()->getFlags());
50585 }
50586 }
50587
50588 return SDValue();
50589}
50590
50591/// If V is a build vector of boolean constants and exactly one of those
50592/// constants is true, return the operand index of that true element.
50593/// Otherwise, return -1.
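/// For example, a v4i1 build_vector <0, 0, 1, 0> yields 2, while <0, 1, 1, 0>
/// or a vector with a non-constant element yields -1.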
50594static int getOneTrueElt(SDValue V) {
50595 // This needs to be a build vector of booleans.
50596 // TODO: Checking for the i1 type matches the IR definition for the mask,
50597 // but the mask check could be loosened to i8 or other types. That might
50598 // also require checking more than 'allOnesValue'; eg, the x86 HW
50599 // instructions only require that the MSB is set for each mask element.
50600 // The ISD::MSTORE comments/definition do not specify how the mask operand
50601 // is formatted.
50602 auto *BV = dyn_cast<BuildVectorSDNode>(V);
50603 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
50604 return -1;
50605
50606 int TrueIndex = -1;
50607 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
50608 for (unsigned i = 0; i < NumElts; ++i) {
50609 const SDValue &Op = BV->getOperand(i);
50610 if (Op.isUndef())
50611 continue;
50612 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
50613 if (!ConstNode)
50614 return -1;
50615 if (ConstNode->getAPIntValue().countr_one() >= 1) {
50616 // If we already found a one, this is too many.
50617 if (TrueIndex >= 0)
50618 return -1;
50619 TrueIndex = i;
50620 }
50621 }
50622 return TrueIndex;
50623}
50624
50625/// Given a masked memory load/store operation, return true if it has one mask
50626/// bit set. If it has one mask bit set, then also return the memory address of
50627/// the scalar element to load/store, the vector index to insert/extract that
50628/// scalar element, and the alignment for the scalar memory access.
50629static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
50630 SelectionDAG &DAG, SDValue &Addr,
50631 SDValue &Index, Align &Alignment,
50632 unsigned &Offset) {
50633 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
50634 if (TrueMaskElt < 0)
50635 return false;
50636
50637 // Get the address of the one scalar element that is specified by the mask
50638 // using the appropriate offset from the base pointer.
50639 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
50640 Offset = 0;
50641 Addr = MaskedOp->getBasePtr();
50642 if (TrueMaskElt != 0) {
50643 Offset = TrueMaskElt * EltVT.getStoreSize();
50645 SDLoc(MaskedOp));
50646 }
50647
50648 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
50649 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
50650 EltVT.getStoreSize());
50651 return true;
50652}
50653
50654/// If exactly one element of the mask is set for a non-extending masked load,
50655/// it is a scalar load and vector insert.
50656/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
50657/// mask have already been optimized in IR, so we don't bother with those here.
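/// For example, a masked load of <4 x i32> with mask <0, 0, 1, 0> becomes a
/// scalar i32 load from base+8 that is inserted into lane 2 of the
/// pass-through value; the address and index bookkeeping is done by
/// getParamsForOneTrueMaskedElt.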
50658static SDValue
50659reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
50660 TargetLowering::DAGCombinerInfo &DCI,
50661 const X86Subtarget &Subtarget) {
50662 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
50663 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
50664 // However, some target hooks may need to be added to know when the transform
50665 // is profitable. Endianness would also have to be considered.
50666
50667 SDValue Addr, VecIndex;
50668 Align Alignment;
50669 unsigned Offset;
50670 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
50671 return SDValue();
50672
50673 // Load the one scalar element that is specified by the mask using the
50674 // appropriate offset from the base pointer.
50675 SDLoc DL(ML);
50676 EVT VT = ML->getValueType(0);
50677 EVT EltVT = VT.getVectorElementType();
50678
50679 EVT CastVT = VT;
50680 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
50681 EltVT = MVT::f64;
50682 CastVT = VT.changeVectorElementType(EltVT);
50683 }
50684
50685 SDValue Load =
50686 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
50687 ML->getPointerInfo().getWithOffset(Offset),
50688 Alignment, ML->getMemOperand()->getFlags());
50689
50690 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
50691
50692 // Insert the loaded element into the appropriate place in the vector.
50693 SDValue Insert =
50694 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
50695 Insert = DAG.getBitcast(VT, Insert);
50696 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
50697}
50698
50699static SDValue
50700combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
50701 TargetLowering::DAGCombinerInfo &DCI) {
50702 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
50703 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
50704 return SDValue();
50705
50706 SDLoc DL(ML);
50707 EVT VT = ML->getValueType(0);
50708
50709 // If we are loading the first and last elements of a vector, it is safe and
50710 // always faster to load the whole vector. Replace the masked load with a
50711 // vector load and select.
50712 unsigned NumElts = VT.getVectorNumElements();
50713 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
50714 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
50715 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
50716 if (LoadFirstElt && LoadLastElt) {
50717 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
50718 ML->getMemOperand());
50719 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
50720 ML->getPassThru());
50721 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
50722 }
50723
50724 // Convert a masked load with a constant mask into a masked load and a select.
50725 // This allows the select operation to use a faster kind of select instruction
50726 // (for example, vblendvps -> vblendps).
50727
50728 // Don't try this if the pass-through operand is already undefined. That would
50729 // cause an infinite loop because that's what we're about to create.
50730 if (ML->getPassThru().isUndef())
50731 return SDValue();
50732
50733 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
50734 return SDValue();
50735
50736 // The new masked load has an undef pass-through operand. The select uses the
50737 // original pass-through operand.
50738 SDValue NewML = DAG.getMaskedLoad(
50739 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
50740 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
50741 ML->getAddressingMode(), ML->getExtensionType());
50742 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
50743 ML->getPassThru());
50744
50745 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
50746}
50747
50750 const X86Subtarget &Subtarget) {
50751 auto *Mld = cast<MaskedLoadSDNode>(N);
50752
50753 // TODO: Expanding load with constant mask may be optimized as well.
50754 if (Mld->isExpandingLoad())
50755 return SDValue();
50756
50757 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
50758 if (SDValue ScalarLoad =
50759 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
50760 return ScalarLoad;
50761
50762 // TODO: Do some AVX512 subsets benefit from this transform?
50763 if (!Subtarget.hasAVX512())
50764 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
50765 return Blend;
50766 }
50767
50768 // If the mask value has been legalized to a non-boolean vector, try to
50769 // simplify ops leading up to it. We only demand the MSB of each lane.
50770 SDValue Mask = Mld->getMask();
50771 if (Mask.getScalarValueSizeInBits() != 1) {
50772 EVT VT = Mld->getValueType(0);
50773 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50775 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
50776 if (N->getOpcode() != ISD::DELETED_NODE)
50777 DCI.AddToWorklist(N);
50778 return SDValue(N, 0);
50779 }
50780 if (SDValue NewMask =
50782 return DAG.getMaskedLoad(
50783 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
50784 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
50785 Mld->getAddressingMode(), Mld->getExtensionType());
50786 }
50787
50788 return SDValue();
50789}
50790
50791/// If exactly one element of the mask is set for a non-truncating masked store,
50792/// it is a vector extract and scalar store.
50793/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
50794/// mask have already been optimized in IR, so we don't bother with those here.
50795static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
50796 SelectionDAG &DAG,
50797 const X86Subtarget &Subtarget) {
50798 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
50799 // However, some target hooks may need to be added to know when the transform
50800 // is profitable. Endianness would also have to be considered.
50801
50802 SDValue Addr, VecIndex;
50803 Align Alignment;
50804 unsigned Offset;
50805 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
50806 return SDValue();
50807
50808 // Extract the one scalar element that is actually being stored.
50809 SDLoc DL(MS);
50810 SDValue Value = MS->getValue();
50811 EVT VT = Value.getValueType();
50812 EVT EltVT = VT.getVectorElementType();
50813 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
50814 EltVT = MVT::f64;
50815 EVT CastVT = VT.changeVectorElementType(EltVT);
50816 Value = DAG.getBitcast(CastVT, Value);
50817 }
50818 SDValue Extract =
50819 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
50820
50821 // Store that element at the appropriate offset from the base pointer.
50822 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
50824 Alignment, MS->getMemOperand()->getFlags());
50825}
50826
50829 const X86Subtarget &Subtarget) {
50830 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
50831 if (Mst->isCompressingStore())
50832 return SDValue();
50833
50834 EVT VT = Mst->getValue().getValueType();
50835 SDLoc dl(Mst);
50836 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50837
50838 if (Mst->isTruncatingStore())
50839 return SDValue();
50840
50841 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
50842 return ScalarStore;
50843
50844 // If the mask value has been legalized to a non-boolean vector, try to
50845 // simplify ops leading up to it. We only demand the MSB of each lane.
50846 SDValue Mask = Mst->getMask();
50847 if (Mask.getScalarValueSizeInBits() != 1) {
50849 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
50850 if (N->getOpcode() != ISD::DELETED_NODE)
50851 DCI.AddToWorklist(N);
50852 return SDValue(N, 0);
50853 }
50854 if (SDValue NewMask =
50856 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
50857 Mst->getBasePtr(), Mst->getOffset(), NewMask,
50858 Mst->getMemoryVT(), Mst->getMemOperand(),
50859 Mst->getAddressingMode());
50860 }
50861
50862 SDValue Value = Mst->getValue();
50863 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
50864 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
50865 Mst->getMemoryVT())) {
50866 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
50867 Mst->getBasePtr(), Mst->getOffset(), Mask,
50868 Mst->getMemoryVT(), Mst->getMemOperand(),
50869 Mst->getAddressingMode(), true);
50870 }
50871
50872 return SDValue();
50873}
50874
50877 const X86Subtarget &Subtarget) {
50878 StoreSDNode *St = cast<StoreSDNode>(N);
50879 EVT StVT = St->getMemoryVT();
50880 SDLoc dl(St);
50881 SDValue StoredVal = St->getValue();
50882 EVT VT = StoredVal.getValueType();
50883 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50884
50885 // Convert a store of vXi1 into a store of iX and a bitcast.
50886 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
50887 VT.getVectorElementType() == MVT::i1) {
50888
50890 StoredVal = DAG.getBitcast(NewVT, StoredVal);
50891
50892 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50893 St->getPointerInfo(), St->getOriginalAlign(),
50894 St->getMemOperand()->getFlags());
50895 }
50896
50897 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
50898 // This will avoid a copy to k-register.
50899 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
50900 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
50901 StoredVal.getOperand(0).getValueType() == MVT::i8) {
50902 SDValue Val = StoredVal.getOperand(0);
50903 // We must store zeros to the unused bits.
50904 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
50905 return DAG.getStore(St->getChain(), dl, Val,
50906 St->getBasePtr(), St->getPointerInfo(),
50907 St->getOriginalAlign(),
50908 St->getMemOperand()->getFlags());
50909 }
50910
50911 // Widen v2i1/v4i1 stores to v8i1.
50912 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
50913 Subtarget.hasAVX512()) {
50914 unsigned NumConcats = 8 / VT.getVectorNumElements();
50915 // We must store zeros to the unused bits.
50916 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
50917 Ops[0] = StoredVal;
50918 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
50919 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50920 St->getPointerInfo(), St->getOriginalAlign(),
50921 St->getMemOperand()->getFlags());
50922 }
50923
50924 // Turn vXi1 stores of constants into a scalar store.
50925 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
50926 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
50928 // If it's a v64i1 store without 64-bit support, we need two stores.
50929 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
50930 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
50931 StoredVal->ops().slice(0, 32));
50933 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
50934 StoredVal->ops().slice(32, 32));
50936
50937 SDValue Ptr0 = St->getBasePtr();
50938 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
50939
50940 SDValue Ch0 =
50941 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
50942 St->getOriginalAlign(),
50943 St->getMemOperand()->getFlags());
50944 SDValue Ch1 =
50945 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
50946                       St->getPointerInfo().getWithOffset(4),
50947                       St->getOriginalAlign(),
50948 St->getMemOperand()->getFlags());
50949 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
50950 }
50951
50952 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
50953 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50954 St->getPointerInfo(), St->getOriginalAlign(),
50955 St->getMemOperand()->getFlags());
50956 }
50957
50958 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
50959 // Sandy Bridge, perform two 16-byte stores.
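  // e.g. a v8f32 store would be split into two v4f32 stores of the low and
  // high halves, with the second store at a +16 byte offset.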
50960 unsigned Fast;
50961 if (VT.is256BitVector() && StVT == VT &&
50962 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
50963 *St->getMemOperand(), &Fast) &&
50964 !Fast) {
50965 unsigned NumElems = VT.getVectorNumElements();
50966 if (NumElems < 2)
50967 return SDValue();
50968
50969 return splitVectorStore(St, DAG);
50970 }
50971
50972 // Split under-aligned vector non-temporal stores.
50973 if (St->isNonTemporal() && StVT == VT &&
50974 St->getAlign().value() < VT.getStoreSize()) {
50975 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
50976 // vectors or the legalizer can scalarize it to use MOVNTI.
50977 if (VT.is256BitVector() || VT.is512BitVector()) {
50978 unsigned NumElems = VT.getVectorNumElements();
50979 if (NumElems < 2)
50980 return SDValue();
50981 return splitVectorStore(St, DAG);
50982 }
50983
50984 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
50985 // to use MOVNTI.
50986 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
50987 MVT NTVT = Subtarget.hasSSE4A()
50988 ? MVT::v2f64
50989 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
50990 return scalarizeVectorStore(St, NTVT, DAG);
50991 }
50992 }
50993
50994 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
50995  // supported, but AVX512F is, by extending to v16i32 and truncating.
50996 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
50997 St->getValue().getOpcode() == ISD::TRUNCATE &&
50998 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
50999 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
51000 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
51001 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
51002 St->getValue().getOperand(0));
51003 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
51004 MVT::v16i8, St->getMemOperand());
51005 }
51006
51007 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
51008 if (!St->isTruncatingStore() &&
51009 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
51010 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
51011 StoredVal.hasOneUse() &&
51012 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
51013 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
51014 return EmitTruncSStore(IsSigned, St->getChain(),
51015 dl, StoredVal.getOperand(0), St->getBasePtr(),
51016 VT, St->getMemOperand(), DAG);
51017 }
51018
51019  // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
51020 if (!St->isTruncatingStore()) {
51021 auto IsExtractedElement = [](SDValue V) {
51022 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
51023 V = V.getOperand(0);
51024 unsigned Opc = V.getOpcode();
51025 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
51026 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
51027 V.getOperand(0).hasOneUse())
51028 return V.getOperand(0);
51029 return SDValue();
51030 };
51031 if (SDValue Extract = IsExtractedElement(StoredVal)) {
51032 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
51033 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
51034 SDValue Src = Trunc.getOperand(0);
51035 MVT DstVT = Trunc.getSimpleValueType();
51036 MVT SrcVT = Src.getSimpleValueType();
51037 unsigned NumSrcElts = SrcVT.getVectorNumElements();
51038 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
51039 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
51040 if (NumTruncBits == VT.getSizeInBits() &&
51041 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
51042 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
51043 TruncVT, St->getMemOperand());
51044 }
51045 }
51046 }
51047 }
51048
51049 // Optimize trunc store (of multiple scalars) to shuffle and store.
51050 // First, pack all of the elements in one place. Next, store to memory
51051 // in fewer chunks.
51052 if (St->isTruncatingStore() && VT.isVector()) {
51053 // Check if we can detect an AVG pattern from the truncation. If yes,
51054 // replace the trunc store by a normal store with the result of X86ISD::AVG
51055 // instruction.
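    // e.g. a truncating store of (lshr (add (add (zext X), (zext Y)), 1), 1)
    // can instead store the rounded average of X and Y directly.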
51056 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
51057 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
51058 Subtarget, dl))
51059 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
51060 St->getPointerInfo(), St->getOriginalAlign(),
51061 St->getMemOperand()->getFlags());
51062
51063 if (TLI.isTruncStoreLegal(VT, StVT)) {
51064 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
51065 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
51066 dl, Val, St->getBasePtr(),
51067 St->getMemoryVT(), St->getMemOperand(), DAG);
51068 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
51069 DAG, dl))
51070 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
51071 dl, Val, St->getBasePtr(),
51072 St->getMemoryVT(), St->getMemOperand(), DAG);
51073 }
51074
51075 return SDValue();
51076 }
51077
51078 // Cast ptr32 and ptr64 pointers to the default address space before a store.
51079 unsigned AddrSpace = St->getAddressSpace();
51080 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51081 AddrSpace == X86AS::PTR32_UPTR) {
51082 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51083 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
51084 SDValue Cast =
51085 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
51086 return DAG.getTruncStore(
51087 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
51088 St->getOriginalAlign(), St->getMemOperand()->getFlags(),
51089 St->getAAInfo());
51090 }
51091 }
51092
51093 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
51094 // the FP state in cases where an emms may be missing.
51095 // A preferable solution to the general problem is to figure out the right
51096 // places to insert EMMS. This qualifies as a quick hack.
51097
51098 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
51099 if (VT.getSizeInBits() != 64)
51100 return SDValue();
51101
51102 const Function &F = DAG.getMachineFunction().getFunction();
51103 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
51104 bool F64IsLegal =
51105 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
51106
51107 if (!F64IsLegal || Subtarget.is64Bit())
51108 return SDValue();
51109
51110 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
51111 cast<LoadSDNode>(St->getValue())->isSimple() &&
51112 St->getChain().hasOneUse() && St->isSimple()) {
51113 auto *Ld = cast<LoadSDNode>(St->getValue());
51114
51115 if (!ISD::isNormalLoad(Ld))
51116 return SDValue();
51117
51118 // Avoid the transformation if there are multiple uses of the loaded value.
51119 if (!Ld->hasNUsesOfValue(1, 0))
51120 return SDValue();
51121
51122 SDLoc LdDL(Ld);
51123 SDLoc StDL(N);
51124 // Lower to a single movq load/store pair.
51125 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
51126 Ld->getBasePtr(), Ld->getMemOperand());
51127
51128    // Make sure the new load is placed in the same chain order.
51129 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
51130 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
51131 St->getMemOperand());
51132 }
51133
51134 // This is similar to the above case, but here we handle a scalar 64-bit
51135 // integer store that is extracted from a vector on a 32-bit target.
51136 // If we have SSE2, then we can treat it like a floating-point double
51137 // to get past legalization. The execution dependencies fixup pass will
51138 // choose the optimal machine instruction for the store if this really is
51139 // an integer or v2f32 rather than an f64.
51140 if (VT == MVT::i64 &&
51141      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
51142    SDValue OldExtract = St->getOperand(1);
51143 SDValue ExtOp0 = OldExtract.getOperand(0);
51144 unsigned VecSize = ExtOp0.getValueSizeInBits();
51145 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
51146 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
51147 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
51148 BitCast, OldExtract.getOperand(1));
51149 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
51150 St->getPointerInfo(), St->getOriginalAlign(),
51151 St->getMemOperand()->getFlags());
51152 }
51153
51154 return SDValue();
51155}
51156
51157static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
51158                                     TargetLowering::DAGCombinerInfo &DCI,
51159                                     const X86Subtarget &Subtarget) {
51160 auto *St = cast<MemIntrinsicSDNode>(N);
51161
51162 SDValue StoredVal = N->getOperand(1);
51163 MVT VT = StoredVal.getSimpleValueType();
51164 EVT MemVT = St->getMemoryVT();
51165
51166 // Figure out which elements we demand.
51167 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
51168 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
51169
51170 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51171 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
51172 if (N->getOpcode() != ISD::DELETED_NODE)
51173 DCI.AddToWorklist(N);
51174 return SDValue(N, 0);
51175 }
51176
51177 return SDValue();
51178}
51179
51180/// Return 'true' if this vector operation is "horizontal"
51181/// and return the operands for the horizontal operation in LHS and RHS. A
51182/// horizontal operation performs the binary operation on successive elements
51183/// of its first operand, then on successive elements of its second operand,
51184/// returning the resulting values in a vector. For example, if
51185/// A = < float a0, float a1, float a2, float a3 >
51186/// and
51187/// B = < float b0, float b1, float b2, float b3 >
51188/// then the result of doing a horizontal operation on A and B is
51189/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
51190/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
51191/// A horizontal-op B, for some already available A and B, and if so then LHS is
51192/// set to A, RHS to B, and the routine returns 'true'.
51193static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
51194 SelectionDAG &DAG, const X86Subtarget &Subtarget,
51195 bool IsCommutative,
51196 SmallVectorImpl<int> &PostShuffleMask) {
51197 // If either operand is undef, bail out. The binop should be simplified.
51198 if (LHS.isUndef() || RHS.isUndef())
51199 return false;
51200
51201 // Look for the following pattern:
51202 // A = < float a0, float a1, float a2, float a3 >
51203 // B = < float b0, float b1, float b2, float b3 >
51204 // and
51205 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
51206 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
51207 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
51208 // which is A horizontal-op B.
51209
51210 MVT VT = LHS.getSimpleValueType();
51211 assert((VT.is128BitVector() || VT.is256BitVector()) &&
51212 "Unsupported vector type for horizontal add/sub");
51213 unsigned NumElts = VT.getVectorNumElements();
51214
51215 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
51216 SmallVectorImpl<int> &ShuffleMask) {
51217 bool UseSubVector = false;
51218 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
51219 Op.getOperand(0).getValueType().is256BitVector() &&
51220 llvm::isNullConstant(Op.getOperand(1))) {
51221 Op = Op.getOperand(0);
51222 UseSubVector = true;
51223 }
51224    SDValue BC = peekThroughBitcasts(Op);
51225    SmallVector<int, 16> SrcMask, ScaledMask;
51226    SmallVector<SDValue, 2> SrcOps;
51227    if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
51228 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
51229 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
51230 })) {
51231 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
51232 if (!UseSubVector && SrcOps.size() <= 2 &&
51233 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
51234 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
51235 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
51236 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
51237 }
51238 if (UseSubVector && SrcOps.size() == 1 &&
51239 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
51240 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
51241 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
51242 ShuffleMask.assign(Mask.begin(), Mask.end());
51243 }
51244 }
51245 };
51246
51247 // View LHS in the form
51248 // LHS = VECTOR_SHUFFLE A, B, LMask
51249 // If LHS is not a shuffle, then pretend it is the identity shuffle:
51250 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
51251 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
51252 SDValue A, B;
51253  SmallVector<int, 16> LMask;
51254  GetShuffle(LHS, A, B, LMask);
51255
51256 // Likewise, view RHS in the form
51257 // RHS = VECTOR_SHUFFLE C, D, RMask
51258 SDValue C, D;
51259  SmallVector<int, 16> RMask;
51260  GetShuffle(RHS, C, D, RMask);
51261
51262 // At least one of the operands should be a vector shuffle.
51263 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
51264 if (NumShuffles == 0)
51265 return false;
51266
51267 if (LMask.empty()) {
51268 A = LHS;
51269 for (unsigned i = 0; i != NumElts; ++i)
51270 LMask.push_back(i);
51271 }
51272
51273 if (RMask.empty()) {
51274 C = RHS;
51275 for (unsigned i = 0; i != NumElts; ++i)
51276 RMask.push_back(i);
51277 }
51278
51279  // If we have a unary mask, ensure the other op is set to null.
51280 if (isUndefOrInRange(LMask, 0, NumElts))
51281 B = SDValue();
51282 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
51283 A = SDValue();
51284
51285 if (isUndefOrInRange(RMask, 0, NumElts))
51286 D = SDValue();
51287 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
51288 C = SDValue();
51289
51290 // If A and B occur in reverse order in RHS, then canonicalize by commuting
51291 // RHS operands and shuffle mask.
51292 if (A != C) {
51293 std::swap(C, D);
51294    ShuffleVectorSDNode::commuteMask(RMask);
51295  }
51296 // Check that the shuffles are both shuffling the same vectors.
51297 if (!(A == C && B == D))
51298 return false;
51299
51300 PostShuffleMask.clear();
51301 PostShuffleMask.append(NumElts, SM_SentinelUndef);
51302
51303 // LHS and RHS are now:
51304 // LHS = shuffle A, B, LMask
51305 // RHS = shuffle A, B, RMask
51306 // Check that the masks correspond to performing a horizontal operation.
51307 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
51308 // so we just repeat the inner loop if this is a 256-bit op.
51309 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
51310 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
51311 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
51312 assert((NumEltsPer128BitChunk % 2 == 0) &&
51313 "Vector type should have an even number of elements in each lane");
51314 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
51315 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
51316 // Ignore undefined components.
51317 int LIdx = LMask[i + j], RIdx = RMask[i + j];
51318 if (LIdx < 0 || RIdx < 0 ||
51319 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
51320 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
51321 continue;
51322
51323 // Check that successive odd/even elements are being operated on. If not,
51324 // this is not a horizontal operation.
51325 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
51326 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
51327 return false;
51328
51329 // Compute the post-shuffle mask index based on where the element
51330 // is stored in the HOP result, and where it needs to be moved to.
51331 int Base = LIdx & ~1u;
51332 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
51333 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
51334
51335 // The low half of the 128-bit result must choose from A.
51336 // The high half of the 128-bit result must choose from B,
51337 // unless B is undef. In that case, we are always choosing from A.
51338 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
51339 Index += NumEltsPer64BitChunk;
51340 PostShuffleMask[i + j] = Index;
51341 }
51342 }
51343
51344 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
51345 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
51346
51347 bool IsIdentityPostShuffle =
51348 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
51349 if (IsIdentityPostShuffle)
51350 PostShuffleMask.clear();
51351
51352 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
51353 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
51354 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
51355 return false;
51356
51357 // If the source nodes are already used in HorizOps then always accept this.
51358 // Shuffle folding should merge these back together.
51359 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
51360 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51361 });
51362 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
51363 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51364 });
51365 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
51366
51367 // Assume a SingleSource HOP if we only shuffle one input and don't need to
51368 // shuffle the result.
51369 if (!ForceHorizOp &&
51370 !shouldUseHorizontalOp(NewLHS == NewRHS &&
51371 (NumShuffles < 2 || !IsIdentityPostShuffle),
51372 DAG, Subtarget))
51373 return false;
51374
51375 LHS = DAG.getBitcast(VT, NewLHS);
51376 RHS = DAG.getBitcast(VT, NewRHS);
51377 return true;
51378}
51379
51380// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
51381static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
51382                                         const X86Subtarget &Subtarget) {
51383 EVT VT = N->getValueType(0);
51384 unsigned Opcode = N->getOpcode();
51385 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
51386 SmallVector<int, 8> PostShuffleMask;
51387
51388 switch (Opcode) {
51389 case ISD::FADD:
51390 case ISD::FSUB:
51391 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
51392 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
51393 SDValue LHS = N->getOperand(0);
51394 SDValue RHS = N->getOperand(1);
51395 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
51396 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51397 PostShuffleMask)) {
51398 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
51399 if (!PostShuffleMask.empty())
51400 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51401 DAG.getUNDEF(VT), PostShuffleMask);
51402 return HorizBinOp;
51403 }
51404 }
51405 break;
51406 case ISD::ADD:
51407 case ISD::SUB:
51408 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
51409 VT == MVT::v16i16 || VT == MVT::v8i32)) {
51410 SDValue LHS = N->getOperand(0);
51411 SDValue RHS = N->getOperand(1);
51412 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
51413 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51414 PostShuffleMask)) {
51415 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
51416 ArrayRef<SDValue> Ops) {
51417 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
51418 };
51419 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
51420 {LHS, RHS}, HOpBuilder);
51421 if (!PostShuffleMask.empty())
51422 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51423 DAG.getUNDEF(VT), PostShuffleMask);
51424 return HorizBinOp;
51425 }
51426 }
51427 break;
51428 }
51429
51430 return SDValue();
51431}
51432
51433// Try to combine the following nodes
51434// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
51435// <i32 -2147483648[float -0.000000e+00]> 0
51436// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
51437// <(load 4 from constant-pool)> t0, t29
51438// [t30: v16i32 = bitcast t27]
51439// t6: v16i32 = xor t7, t27[t30]
51440// t11: v16f32 = bitcast t6
51441// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
51442// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
51443// t22: v16f32 = bitcast t7
51444// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
51445// t24: v32f16 = bitcast t23
51446static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
51447                                  const X86Subtarget &Subtarget) {
51448 EVT VT = N->getValueType(0);
51449 SDValue LHS = N->getOperand(0);
51450 SDValue RHS = N->getOperand(1);
51451 int CombineOpcode =
51452 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
51453 auto combineConjugation = [&](SDValue &r) {
51454 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
51455 SDValue XOR = LHS.getOperand(0);
51456 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
51457 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
51458 if (XORRHS.isConstant()) {
51459 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
51460 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
51461 if ((XORRHS.getBitWidth() == 32 &&
51462 XORRHS.getConstant() == ConjugationInt32) ||
51463 (XORRHS.getBitWidth() == 64 &&
51464 XORRHS.getConstant() == ConjugationInt64)) {
51465 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
51466 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
51467 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
51468 r = DAG.getBitcast(VT, FCMulC);
51469 return true;
51470 }
51471 }
51472 }
51473 }
51474 return false;
51475 };
51476 SDValue Res;
51477 if (combineConjugation(Res))
51478 return Res;
51479 std::swap(LHS, RHS);
51480 if (combineConjugation(Res))
51481 return Res;
51482 return Res;
51483}
51484
51485// Try to combine the following nodes:
51486// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
51487static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
51488                                const X86Subtarget &Subtarget) {
51489 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
51490    return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
51491           Flags.hasAllowContract();
51492 };
51493
51494 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
51495 return DAG.getTarget().Options.NoSignedZerosFPMath ||
51496 Flags.hasNoSignedZeros();
51497 };
51498 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
51499 APInt AI = APInt(32, 0x80008000, true);
51500 KnownBits Bits = DAG.computeKnownBits(Op);
51501 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
51502 Bits.getConstant() == AI;
51503 };
51504
51505 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
51506 !AllowContract(N->getFlags()))
51507 return SDValue();
51508
51509 EVT VT = N->getValueType(0);
51510 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
51511 return SDValue();
51512
51513 SDValue LHS = N->getOperand(0);
51514 SDValue RHS = N->getOperand(1);
51515 bool IsConj;
51516 SDValue FAddOp1, MulOp0, MulOp1;
51517 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
51518 &IsVectorAllNegativeZero,
51519 &HasNoSignedZero](SDValue N) -> bool {
51520 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
51521 return false;
51522 SDValue Op0 = N.getOperand(0);
51523 unsigned Opcode = Op0.getOpcode();
51524 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
51525 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
51526 MulOp0 = Op0.getOperand(0);
51527 MulOp1 = Op0.getOperand(1);
51528 IsConj = Opcode == X86ISD::VFCMULC;
51529 return true;
51530 }
51531 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
51532          ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
51533            HasNoSignedZero(Op0->getFlags())) ||
51534 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
51535 MulOp0 = Op0.getOperand(0);
51536 MulOp1 = Op0.getOperand(1);
51537 IsConj = Opcode == X86ISD::VFCMADDC;
51538 return true;
51539 }
51540 }
51541 return false;
51542 };
51543
51544 if (GetCFmulFrom(LHS))
51545 FAddOp1 = RHS;
51546 else if (GetCFmulFrom(RHS))
51547 FAddOp1 = LHS;
51548 else
51549 return SDValue();
51550
51551 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
51552 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
51553 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
51554 // FIXME: How do we handle when fast math flags of FADD are different from
51555 // CFMUL's?
51556 SDValue CFmul =
51557 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
51558 return DAG.getBitcast(VT, CFmul);
51559}
51560
51561/// Do target-specific dag combines on floating-point adds/subs.
51562static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
51563                               const X86Subtarget &Subtarget) {
51564 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
51565 return HOp;
51566
51567 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
51568 return COp;
51569
51570 return SDValue();
51571}
51572
51573static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
51574                                   const X86Subtarget &Subtarget) {
51575 EVT VT = N->getValueType(0);
51576 SDValue Src = N->getOperand(0);
51577 EVT SrcVT = Src.getValueType();
51578 SDLoc DL(N);
51579
51580 if (!Subtarget.hasDQI() || !Subtarget.hasVLX() || VT != MVT::v2i64 ||
51581 SrcVT != MVT::v2f32)
51582 return SDValue();
51583
51584 return DAG.getNode(X86ISD::CVTP2SI, DL, VT,
51585 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
51586 DAG.getUNDEF(SrcVT)));
51587}
51588
51589/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
51590/// the codegen.
51591/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
51592/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
51593/// anything that is guaranteed to be transformed by DAGCombiner.
51594static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
51595                                          const X86Subtarget &Subtarget,
51596 const SDLoc &DL) {
51597 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
51598 SDValue Src = N->getOperand(0);
51599 unsigned SrcOpcode = Src.getOpcode();
51600 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51601
51602 EVT VT = N->getValueType(0);
51603 EVT SrcVT = Src.getValueType();
51604
51605 auto IsFreeTruncation = [VT](SDValue Op) {
51606 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
51607
51608 // See if this has been extended from a smaller/equal size to
51609 // the truncation size, allowing a truncation to combine with the extend.
51610 unsigned Opcode = Op.getOpcode();
51611 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
51612 Opcode == ISD::ZERO_EXTEND) &&
51613 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
51614 return true;
51615
51616 // See if this is a single use constant which can be constant folded.
51617    // NOTE: We don't peek through bitcasts here because there is currently
51618    // no support for constant folding truncate+bitcast+vector_of_constants. So
51619    // we'll just end up with a truncate on both operands which will
51620 // get turned back into (truncate (binop)) causing an infinite loop.
51621 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
51622 };
51623
51624 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
51625 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
51626 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
51627 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
51628 };
51629
51630 // Don't combine if the operation has other uses.
51631 if (!Src.hasOneUse())
51632 return SDValue();
51633
51634 // Only support vector truncation for now.
51635 // TODO: i64 scalar math would benefit as well.
51636 if (!VT.isVector())
51637 return SDValue();
51638
51639  // In most cases it's only worth pre-truncating if we're only facing the cost
51640 // of one truncation.
51641 // i.e. if one of the inputs will constant fold or the input is repeated.
51642 switch (SrcOpcode) {
51643 case ISD::MUL:
51644    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
51645 // better to truncate if we have the chance.
51646 if (SrcVT.getScalarType() == MVT::i64 &&
51647 TLI.isOperationLegal(SrcOpcode, VT) &&
51648 !TLI.isOperationLegal(SrcOpcode, SrcVT))
51649 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
51650 [[fallthrough]];
51651 case ISD::AND:
51652 case ISD::XOR:
51653 case ISD::OR:
51654 case ISD::ADD:
51655 case ISD::SUB: {
51656 SDValue Op0 = Src.getOperand(0);
51657 SDValue Op1 = Src.getOperand(1);
51658 if (TLI.isOperationLegal(SrcOpcode, VT) &&
51659 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
51660 return TruncateArithmetic(Op0, Op1);
51661 break;
51662 }
51663 }
51664
51665 return SDValue();
51666}
51667
51668// Try to form a MULHU or MULHS node by looking for
51669// (trunc (srl (mul ext, ext), 16))
51670// TODO: This is X86 specific because we want to be able to handle wide types
51671// before type legalization. But we can only do it if the vector will be
51672// legalized via widening/splitting. Type legalization can't handle promotion
51673// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
51674// combiner.
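// e.g. (v8i16 trunc (v8i32 srl (v8i32 mul (zext A), (zext B)), 16)) can be
// turned into (v8i16 mulhu A, B) when A and B are known to fit in 16 bits.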
51675static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
51676 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
51677 // First instruction should be a right shift of a multiply.
51678 if (Src.getOpcode() != ISD::SRL ||
51679 Src.getOperand(0).getOpcode() != ISD::MUL)
51680 return SDValue();
51681
51682 if (!Subtarget.hasSSE2())
51683 return SDValue();
51684
51685 // Only handle vXi16 types that are at least 128-bits unless they will be
51686 // widened.
51687 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
51688 return SDValue();
51689
51690 // Input type should be at least vXi32.
51691 EVT InVT = Src.getValueType();
51692 if (InVT.getVectorElementType().getSizeInBits() < 32)
51693 return SDValue();
51694
51695 // Need a shift by 16.
51696 APInt ShiftAmt;
51697 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
51698 ShiftAmt != 16)
51699 return SDValue();
51700
51701 SDValue LHS = Src.getOperand(0).getOperand(0);
51702 SDValue RHS = Src.getOperand(0).getOperand(1);
51703
51704 // Count leading sign/zero bits on both inputs - if there are enough then
51705 // truncation back to vXi16 will be cheap - either as a pack/shuffle
51706 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
51707 // truncations may actually be free by peeking through to the ext source.
51708 auto IsSext = [&DAG](SDValue V) {
51709 return DAG.ComputeMaxSignificantBits(V) <= 16;
51710 };
51711 auto IsZext = [&DAG](SDValue V) {
51712 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
51713 };
51714
51715 bool IsSigned = IsSext(LHS) && IsSext(RHS);
51716 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
51717 if (!IsSigned && !IsUnsigned)
51718 return SDValue();
51719
51720 // Check if both inputs are extensions, which will be removed by truncation.
51721 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
51722 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
51723 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
51724 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
51725 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
51726 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
51727
51728 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
51729 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
51730 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
51731 // will have to split anyway.
51732 unsigned InSizeInBits = InVT.getSizeInBits();
51733 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
51734 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
51735 (InSizeInBits % 16) == 0) {
51736 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51737 InVT.getSizeInBits() / 16);
51738 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
51739 DAG.getBitcast(BCVT, RHS));
51740 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
51741 }
51742
51743 // Truncate back to source type.
51744 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
51745 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
51746
51747 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
51748 return DAG.getNode(Opc, DL, VT, LHS, RHS);
51749}
51750
51751// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
51752// from one vector with signed bytes from another vector, adds together
51753// adjacent pairs of 16-bit products, and saturates the result before
51754// truncating to 16-bits.
51755//
51756// Which looks something like this:
51757// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
51758// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
51759static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
51760                               const X86Subtarget &Subtarget,
51761 const SDLoc &DL) {
51762 if (!VT.isVector() || !Subtarget.hasSSSE3())
51763 return SDValue();
51764
51765 unsigned NumElems = VT.getVectorNumElements();
51766 EVT ScalarVT = VT.getVectorElementType();
51767 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
51768 return SDValue();
51769
51770 SDValue SSatVal = detectSSatPattern(In, VT);
51771 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
51772 return SDValue();
51773
51774 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
51775 // of multiplies from even/odd elements.
51776 SDValue N0 = SSatVal.getOperand(0);
51777 SDValue N1 = SSatVal.getOperand(1);
51778
51779 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
51780 return SDValue();
51781
51782 SDValue N00 = N0.getOperand(0);
51783 SDValue N01 = N0.getOperand(1);
51784 SDValue N10 = N1.getOperand(0);
51785 SDValue N11 = N1.getOperand(1);
51786
51787 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
51788 // Canonicalize zero_extend to LHS.
51789 if (N01.getOpcode() == ISD::ZERO_EXTEND)
51790 std::swap(N00, N01);
51791 if (N11.getOpcode() == ISD::ZERO_EXTEND)
51792 std::swap(N10, N11);
51793
51794 // Ensure we have a zero_extend and a sign_extend.
51795 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
51796 N01.getOpcode() != ISD::SIGN_EXTEND ||
51797 N10.getOpcode() != ISD::ZERO_EXTEND ||
51798 N11.getOpcode() != ISD::SIGN_EXTEND)
51799 return SDValue();
51800
51801 // Peek through the extends.
51802 N00 = N00.getOperand(0);
51803 N01 = N01.getOperand(0);
51804 N10 = N10.getOperand(0);
51805 N11 = N11.getOperand(0);
51806
51807 // Ensure the extend is from vXi8.
51808 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
51809 N01.getValueType().getVectorElementType() != MVT::i8 ||
51810 N10.getValueType().getVectorElementType() != MVT::i8 ||
51811 N11.getValueType().getVectorElementType() != MVT::i8)
51812 return SDValue();
51813
51814 // All inputs should be build_vectors.
51815 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
51816 N01.getOpcode() != ISD::BUILD_VECTOR ||
51817 N10.getOpcode() != ISD::BUILD_VECTOR ||
51818      N11.getOpcode() != ISD::BUILD_VECTOR)
51819    return SDValue();
51820
51821 // N00/N10 are zero extended. N01/N11 are sign extended.
51822
51823 // For each element, we need to ensure we have an odd element from one vector
51824 // multiplied by the odd element of another vector and the even element from
51825 // one of the same vectors being multiplied by the even element from the
51826 // other vector. So we need to make sure for each element i, this operator
51827 // is being performed:
51828 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
51829 SDValue ZExtIn, SExtIn;
51830 for (unsigned i = 0; i != NumElems; ++i) {
51831 SDValue N00Elt = N00.getOperand(i);
51832 SDValue N01Elt = N01.getOperand(i);
51833 SDValue N10Elt = N10.getOperand(i);
51834 SDValue N11Elt = N11.getOperand(i);
51835 // TODO: Be more tolerant to undefs.
51836 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51837 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51838 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51839        N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
51840      return SDValue();
51841 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
51842 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
51843 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
51844 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
51845 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
51846 return SDValue();
51847 unsigned IdxN00 = ConstN00Elt->getZExtValue();
51848 unsigned IdxN01 = ConstN01Elt->getZExtValue();
51849 unsigned IdxN10 = ConstN10Elt->getZExtValue();
51850 unsigned IdxN11 = ConstN11Elt->getZExtValue();
51851 // Add is commutative so indices can be reordered.
51852 if (IdxN00 > IdxN10) {
51853 std::swap(IdxN00, IdxN10);
51854 std::swap(IdxN01, IdxN11);
51855 }
51856    // N0 indices must be the even elements. N1 indices must be the next odd elements.
51857 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
51858 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
51859 return SDValue();
51860 SDValue N00In = N00Elt.getOperand(0);
51861 SDValue N01In = N01Elt.getOperand(0);
51862 SDValue N10In = N10Elt.getOperand(0);
51863 SDValue N11In = N11Elt.getOperand(0);
51864 // First time we find an input capture it.
51865 if (!ZExtIn) {
51866 ZExtIn = N00In;
51867 SExtIn = N01In;
51868 }
51869 if (ZExtIn != N00In || SExtIn != N01In ||
51870 ZExtIn != N10In || SExtIn != N11In)
51871 return SDValue();
51872 }
51873
51874 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
51875 EVT ExtVT = Ext.getValueType();
51876 if (ExtVT.getVectorNumElements() != NumElems * 2) {
51877 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
51878 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
51879 DAG.getIntPtrConstant(0, DL));
51880 }
51881 };
51882 ExtractVec(ZExtIn);
51883 ExtractVec(SExtIn);
51884
51885 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
51886 ArrayRef<SDValue> Ops) {
51887 // Shrink by adding truncate nodes and let DAGCombine fold with the
51888 // sources.
51889 EVT InVT = Ops[0].getValueType();
51890 assert(InVT.getScalarType() == MVT::i8 &&
51891 "Unexpected scalar element type");
51892 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
51893 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51894 InVT.getVectorNumElements() / 2);
51895 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
51896 };
51897 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
51898 PMADDBuilder);
51899}
51900
51901static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
51902                               const X86Subtarget &Subtarget) {
51903 EVT VT = N->getValueType(0);
51904 SDValue Src = N->getOperand(0);
51905 SDLoc DL(N);
51906
51907 // Attempt to pre-truncate inputs to arithmetic ops instead.
51908 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
51909 return V;
51910
51911 // Try to detect AVG pattern first.
51912 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
51913 return Avg;
51914
51915  // Try to detect PMADDUBSW.
51916 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
51917 return PMAdd;
51918
51919 // Try to combine truncation with signed/unsigned saturation.
51920 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
51921 return Val;
51922
51923 // Try to combine PMULHUW/PMULHW for vXi16.
51924 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
51925 return V;
51926
51927 // The bitcast source is a direct mmx result.
51928  // Detect bitcasts from x86mmx to i32.
51929 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
51930 SDValue BCSrc = Src.getOperand(0);
51931 if (BCSrc.getValueType() == MVT::x86mmx)
51932 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
51933 }
51934
51935 // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
51936 if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
51937 Src.hasOneUse())
51938 return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
51939
51940 return SDValue();
51941}
51942
51943static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
51944                             TargetLowering::DAGCombinerInfo &DCI) {
51945  EVT VT = N->getValueType(0);
51946 SDValue In = N->getOperand(0);
51947 SDLoc DL(N);
51948
51949 if (SDValue SSatVal = detectSSatPattern(In, VT))
51950 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
51951 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
51952 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
51953
51954 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51955 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
51956 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
51957 return SDValue(N, 0);
51958
51959 return SDValue();
51960}
51961
51962/// Returns the negated value if the node \p N flips sign of FP value.
51963///
51964/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
51965/// or FSUB(0, x)
51966/// AVX512F does not have FXOR, so FNEG is lowered as
51967/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
51968/// In this case we go through all bitcasts.
51969/// This also recognizes splat of a negated value and returns the splat of that
51970/// value.
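/// e.g. both (fsub -0.0, X) and (bitcast (xor (bitcast X), SignMask)) return X.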
51971static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
51972 if (N->getOpcode() == ISD::FNEG)
51973 return N->getOperand(0);
51974
51975 // Don't recurse exponentially.
51976  if (Depth > SelectionDAG::MaxRecursionDepth)
51977    return SDValue();
51978
51979 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
51980
51981  SDValue Op = peekThroughBitcasts(SDValue(N, 0));
51982  EVT VT = Op->getValueType(0);
51983
51984 // Make sure the element size doesn't change.
51985 if (VT.getScalarSizeInBits() != ScalarSize)
51986 return SDValue();
51987
51988 unsigned Opc = Op.getOpcode();
51989 switch (Opc) {
51990 case ISD::VECTOR_SHUFFLE: {
51991 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
51992 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
51993 if (!Op.getOperand(1).isUndef())
51994 return SDValue();
51995 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
51996 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
51997 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
51998 cast<ShuffleVectorSDNode>(Op)->getMask());
51999 break;
52000 }
52001  case ISD::INSERT_VECTOR_ELT: {
52002    // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
52003 // -V, INDEX).
52004 SDValue InsVector = Op.getOperand(0);
52005 SDValue InsVal = Op.getOperand(1);
52006 if (!InsVector.isUndef())
52007 return SDValue();
52008 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
52009 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
52010 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
52011 NegInsVal, Op.getOperand(2));
52012 break;
52013 }
52014 case ISD::FSUB:
52015 case ISD::XOR:
52016 case X86ISD::FXOR: {
52017 SDValue Op1 = Op.getOperand(1);
52018 SDValue Op0 = Op.getOperand(0);
52019
52020 // For XOR and FXOR, we want to check if constant
52021 // bits of Op1 are sign bit masks. For FSUB, we
52022 // have to check if constant bits of Op0 are sign
52023 // bit masks and hence we swap the operands.
52024 if (Opc == ISD::FSUB)
52025 std::swap(Op0, Op1);
52026
52027 APInt UndefElts;
52028 SmallVector<APInt, 16> EltBits;
52029 // Extract constant bits and see if they are all
52030 // sign bit masks. Ignore the undef elements.
52031 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
52032 /* AllowWholeUndefs */ true,
52033 /* AllowPartialUndefs */ false)) {
52034 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
52035 if (!UndefElts[I] && !EltBits[I].isSignMask())
52036 return SDValue();
52037
52038 // Only allow bitcast from correctly-sized constant.
52039 Op0 = peekThroughBitcasts(Op0);
52040 if (Op0.getScalarValueSizeInBits() == ScalarSize)
52041 return Op0;
52042 }
52043 break;
52044 } // case
52045 } // switch
52046
52047 return SDValue();
52048}
52049
52050static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
52051 bool NegRes) {
52052 if (NegMul) {
52053 switch (Opcode) {
52054 // clang-format off
52055 default: llvm_unreachable("Unexpected opcode");
52056 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
52057 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
52058 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
52059 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
52060 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
52061 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
52062 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
52063 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
52064 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
52065 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
52066 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
52067 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
52068 // clang-format on
52069 }
52070 }
52071
52072 if (NegAcc) {
52073 switch (Opcode) {
52074 // clang-format off
52075 default: llvm_unreachable("Unexpected opcode");
52076 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
52077 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
52078 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
52079 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
52080 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
52081 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
52082 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
52083 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
52084 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
52085 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
52086 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
52087 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
52088 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
52089 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
52090 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
52091 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
52092 // clang-format on
52093 }
52094 }
52095
52096 if (NegRes) {
52097 switch (Opcode) {
52098    // For accuracy reasons, we never combine fneg and fma under strict FP.
52099 // clang-format off
52100 default: llvm_unreachable("Unexpected opcode");
52101 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
52102 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
52103 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
52104 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
52105 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
52106 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
52107 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
52108 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
52109 // clang-format on
52110 }
52111 }
52112
52113 return Opcode;
52114}
52115
52116/// Do target-specific dag combines on floating point negations.
52117static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
52118                           TargetLowering::DAGCombinerInfo &DCI,
52119                           const X86Subtarget &Subtarget) {
52120 EVT OrigVT = N->getValueType(0);
52121 SDValue Arg = isFNEG(DAG, N);
52122 if (!Arg)
52123 return SDValue();
52124
52125 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52126 EVT VT = Arg.getValueType();
52127 EVT SVT = VT.getScalarType();
52128 SDLoc DL(N);
52129
52130 // Let legalize expand this if it isn't a legal type yet.
52131 if (!TLI.isTypeLegal(VT))
52132 return SDValue();
52133
52134 // If we're negating a FMUL node on a target with FMA, then we can avoid the
52135 // use of a constant by performing (-0 - A*B) instead.
52136 // FIXME: Check rounding control flags as well once it becomes available.
52137 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
52138 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
52139 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
52140 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
52141 Arg.getOperand(1), Zero);
52142 return DAG.getBitcast(OrigVT, NewNode);
52143 }
52144
52145 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
52146 bool LegalOperations = !DCI.isBeforeLegalizeOps();
52147 if (SDValue NegArg =
52148 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
52149 return DAG.getBitcast(OrigVT, NegArg);
52150
52151 return SDValue();
52152}
52153
52154SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
52155                                                bool LegalOperations,
52156                                                bool ForCodeSize,
52157                                                NegatibleCost &Cost,
52158                                                unsigned Depth) const {
52159 // fneg patterns are removable even if they have multiple uses.
52160 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
52161    Cost = NegatibleCost::Cheaper;
52162    return DAG.getBitcast(Op.getValueType(), Arg);
52163 }
52164
52165 EVT VT = Op.getValueType();
52166 EVT SVT = VT.getScalarType();
52167 unsigned Opc = Op.getOpcode();
52168 SDNodeFlags Flags = Op.getNode()->getFlags();
52169 switch (Opc) {
52170 case ISD::FMA:
52171 case X86ISD::FMSUB:
52172 case X86ISD::FNMADD:
52173 case X86ISD::FNMSUB:
52174 case X86ISD::FMADD_RND:
52175 case X86ISD::FMSUB_RND:
52176 case X86ISD::FNMADD_RND:
52177 case X86ISD::FNMSUB_RND: {
52178 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
52179 !(SVT == MVT::f32 || SVT == MVT::f64) ||
52180        !isOperationLegal(ISD::FMA, VT))
52181      break;
52182
52183 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
52184 // if it may have signed zeros.
52185 if (!Flags.hasNoSignedZeros())
52186 break;
52187
52188 // This is always negatible for free but we might be able to remove some
52189 // extra operand negations as well.
52190    SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
52191    for (int i = 0; i != 3; ++i)
52192 NewOps[i] = getCheaperNegatedExpression(
52193 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
52194
52195 bool NegA = !!NewOps[0];
52196 bool NegB = !!NewOps[1];
52197 bool NegC = !!NewOps[2];
52198 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
52199
52200 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
52201                                  : NegatibleCost::Neutral;
52202
52203 // Fill in the non-negated ops with the original values.
52204 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
52205 if (!NewOps[i])
52206 NewOps[i] = Op.getOperand(i);
52207 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
52208 }
52209 case X86ISD::FRCP:
52210 if (SDValue NegOp0 =
52211 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
52212 ForCodeSize, Cost, Depth + 1))
52213 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
52214 break;
52215 }
52216
52217 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
52218 ForCodeSize, Cost, Depth);
52219}
52220
52221static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
52222                                 const X86Subtarget &Subtarget) {
52223 MVT VT = N->getSimpleValueType(0);
52224 // If we have integer vector types available, use the integer opcodes.
52225 if (!VT.isVector() || !Subtarget.hasSSE2())
52226 return SDValue();
52227
52228 SDLoc dl(N);
52229
52230 unsigned IntBits = VT.getScalarSizeInBits();
52231 MVT IntSVT = MVT::getIntegerVT(IntBits);
52232 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
52233
52234 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
52235 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
52236 unsigned IntOpcode;
52237 switch (N->getOpcode()) {
52238 // clang-format off
52239 default: llvm_unreachable("Unexpected FP logic op");
52240 case X86ISD::FOR: IntOpcode = ISD::OR; break;
52241 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
52242 case X86ISD::FAND: IntOpcode = ISD::AND; break;
52243 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
52244 // clang-format on
52245 }
52246 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
52247 return DAG.getBitcast(VT, IntOp);
52248}
52249
52250
52251/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
52252static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
52253  if (N->getOpcode() != ISD::XOR)
52254 return SDValue();
52255
52256 SDValue LHS = N->getOperand(0);
52257 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
52258 return SDValue();
52259
52260  X86::CondCode NewCC = X86::GetOppositeBranchCondition(
52261      X86::CondCode(LHS->getConstantOperandVal(0)));
52262 SDLoc DL(N);
52263 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
52264}
52265
52266static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
52267                                 const X86Subtarget &Subtarget) {
52268 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
52269         "Invalid opcode for combining with CTLZ");
52270 if (Subtarget.hasFastLZCNT())
52271 return SDValue();
52272
52273 EVT VT = N->getValueType(0);
52274 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
52275 (VT != MVT::i64 || !Subtarget.is64Bit()))
52276 return SDValue();
52277
52278 SDValue N0 = N->getOperand(0);
52279 SDValue N1 = N->getOperand(1);
52280
52281 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
52282      N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
52283    return SDValue();
52284
52285 SDValue OpCTLZ;
52286 SDValue OpSizeTM1;
52287
52288 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
52289 OpCTLZ = N1;
52290 OpSizeTM1 = N0;
52291 } else if (N->getOpcode() == ISD::SUB) {
52292 return SDValue();
52293 } else {
52294 OpCTLZ = N0;
52295 OpSizeTM1 = N1;
52296 }
52297
52298 if (!OpCTLZ.hasOneUse())
52299 return SDValue();
52300 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
52301 if (!C)
52302 return SDValue();
52303
52304 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
52305 return SDValue();
52306 SDLoc DL(N);
52307 EVT OpVT = VT;
52308 SDValue Op = OpCTLZ.getOperand(0);
52309 if (VT == MVT::i8) {
52310 // Zero extend to i32 since there is not an i8 bsr.
52311 OpVT = MVT::i32;
52312 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
52313 }
52314
52315 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
52316 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
52317 if (VT == MVT::i8)
52318 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
52319
52320 return Op;
52321}
52322
52323static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
52324                          TargetLowering::DAGCombinerInfo &DCI,
52325                          const X86Subtarget &Subtarget) {
52326 SDValue N0 = N->getOperand(0);
52327 SDValue N1 = N->getOperand(1);
52328 EVT VT = N->getValueType(0);
52329
52330 // If this is SSE1 only convert to FXOR to avoid scalarization.
52331 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52332 return DAG.getBitcast(MVT::v4i32,
52333 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
52334 DAG.getBitcast(MVT::v4f32, N0),
52335 DAG.getBitcast(MVT::v4f32, N1)));
52336 }
52337
52338 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
52339 return Cmp;
52340
52341 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
52342 return R;
52343
52344 if (SDValue R = combineBitOpWithShift(N, DAG))
52345 return R;
52346
52347 if (SDValue R = combineBitOpWithPACK(N, DAG))
52348 return R;
52349
52350 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
52351 return FPLogic;
52352
52353 if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
52354 return R;
52355
52356 if (DCI.isBeforeLegalizeOps())
52357 return SDValue();
52358
52359 if (SDValue SetCC = foldXor1SetCC(N, DAG))
52360 return SetCC;
52361
52362 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
52363 return R;
52364
52365 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
52366 return RV;
52367
52368 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
52369 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52370 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
52371 N0.getOperand(0).getValueType().isVector() &&
52372 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
52373 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
52374 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
52375 N0.getOperand(0).getValueType()));
52376 }
52377
52378 // Handle AVX512 mask widening.
52379 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
52380 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
52381 VT.getVectorElementType() == MVT::i1 &&
52382      N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
52383      TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
52384    return DAG.getNode(
52385        ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
52386 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
52387 N0.getOperand(2));
52388 }
52389
52390 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
52391 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
52392 // TODO: Under what circumstances could this be performed in DAGCombine?
52393 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
52394 N0.getOperand(0).getOpcode() == N->getOpcode()) {
52395 SDValue TruncExtSrc = N0.getOperand(0);
52396 auto *N1C = dyn_cast<ConstantSDNode>(N1);
52397 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
52398 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
52399 SDLoc DL(N);
52400 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
52401 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
52402 return DAG.getNode(ISD::XOR, DL, VT, LHS,
52403 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
52404 }
52405 }
52406
52407 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52408 return R;
52409
52410 return combineFneg(N, DAG, DCI, Subtarget);
52411}
52412
52413static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
52414                                 TargetLowering::DAGCombinerInfo &DCI,
52415                                 const X86Subtarget &Subtarget) {
52416 SDValue N0 = N->getOperand(0);
52417 EVT VT = N->getValueType(0);
52418
52419 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
52420 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
52421 SDValue Src = N0.getOperand(0);
52422 EVT SrcVT = Src.getValueType();
52423 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
52424 (DCI.isBeforeLegalize() ||
52425 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
52426 Subtarget.hasSSSE3()) {
52427 unsigned NumElts = SrcVT.getVectorNumElements();
52428 SmallVector<int, 32> ReverseMask(NumElts);
52429 for (unsigned I = 0; I != NumElts; ++I)
52430 ReverseMask[I] = (NumElts - 1) - I;
52431 SDValue Rev =
52432 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
52433 return DAG.getBitcast(VT, Rev);
52434 }
52435 }
52436
52437 return SDValue();
52438}
52439
52442 const X86Subtarget &Subtarget) {
52443 EVT VT = N->getValueType(0);
52444 unsigned NumBits = VT.getSizeInBits();
52445
52446 // TODO - Constant Folding.
52447
52448 // Simplify the inputs.
52449 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52450 APInt DemandedMask(APInt::getAllOnes(NumBits));
52451 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52452 return SDValue(N, 0);
52453
52454 return SDValue();
52455}
52456
52457 static bool isNullFPScalarOrVectorConst(SDValue V) {
52458 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
52459}
52460
52461/// If a value is a scalar FP zero or a vector FP zero (potentially including
52462/// undefined elements), return a zero constant that may be used to fold away
52463/// that value. In the case of a vector, the returned constant will not contain
52464/// undefined elements even if the input parameter does. This makes it suitable
52465/// to be used as a replacement operand with operations (eg, bitwise-and) where
52466/// an undef should not propagate.
52467 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
52468 const X86Subtarget &Subtarget) {
52469 if (!isNullFPScalarOrVectorConst(V))
52470 return SDValue();
52471
52472 if (V.getValueType().isVector())
52473 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
52474
52475 return V;
52476}
52477
52478 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
52479 const X86Subtarget &Subtarget) {
52480 SDValue N0 = N->getOperand(0);
52481 SDValue N1 = N->getOperand(1);
52482 EVT VT = N->getValueType(0);
52483 SDLoc DL(N);
52484
52485 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
52486 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
52487 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
52488 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
52489 return SDValue();
52490
52491 auto isAllOnesConstantFP = [](SDValue V) {
52492 if (V.getSimpleValueType().isVector())
52493 return ISD::isBuildVectorAllOnes(V.getNode());
52494 auto *C = dyn_cast<ConstantFPSDNode>(V);
52495 return C && C->getConstantFPValue()->isAllOnesValue();
52496 };
52497
52498 // fand (fxor X, -1), Y --> fandn X, Y
52499 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
52500 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
52501
52502 // fand X, (fxor Y, -1) --> fandn Y, X
52503 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
52504 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
52505
52506 return SDValue();
52507}
52508
52509/// Do target-specific dag combines on X86ISD::FAND nodes.
52510 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
52511 const X86Subtarget &Subtarget) {
52512 // FAND(0.0, x) -> 0.0
52513 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
52514 return V;
52515
52516 // FAND(x, 0.0) -> 0.0
52517 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
52518 return V;
52519
52520 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
52521 return V;
52522
52523 return lowerX86FPLogicOp(N, DAG, Subtarget);
52524}
52525
52526/// Do target-specific dag combines on X86ISD::FANDN nodes.
52527 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
52528 const X86Subtarget &Subtarget) {
52529 // FANDN(0.0, x) -> x
52530 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
52531 return N->getOperand(1);
52532
52533 // FANDN(x, 0.0) -> 0.0
52534 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
52535 return V;
52536
52537 return lowerX86FPLogicOp(N, DAG, Subtarget);
52538}
52539
52540/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
52541 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
52542 TargetLowering::DAGCombinerInfo &DCI,
52543 const X86Subtarget &Subtarget) {
52544 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
52545
52546 // F[X]OR(0.0, x) -> x
52547 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
52548 return N->getOperand(1);
52549
52550 // F[X]OR(x, 0.0) -> x
52551 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
52552 return N->getOperand(0);
52553
52554 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
52555 return NewVal;
52556
52557 return lowerX86FPLogicOp(N, DAG, Subtarget);
52558}
52559
52560/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
52561 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
52562 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
52563
52564 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
52565 if (!DAG.getTarget().Options.NoNaNsFPMath ||
52566 !DAG.getTarget().Options.NoSignedZerosFPMath)
52567 return SDValue();
52568
52569 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
52570 // into FMINC and FMAXC, which are Commutative operations.
52571 unsigned NewOp = 0;
52572 switch (N->getOpcode()) {
52573 default: llvm_unreachable("unknown opcode");
52574 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
52575 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
52576 }
52577
52578 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
52579 N->getOperand(0), N->getOperand(1));
52580}
52581
52582 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
52583 const X86Subtarget &Subtarget) {
52584 EVT VT = N->getValueType(0);
52585 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
52586 return SDValue();
52587
52588 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52589
52590 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
52591 (Subtarget.hasSSE2() && VT == MVT::f64) ||
52592 (Subtarget.hasFP16() && VT == MVT::f16) ||
52593 (VT.isVector() && TLI.isTypeLegal(VT))))
52594 return SDValue();
52595
52596 SDValue Op0 = N->getOperand(0);
52597 SDValue Op1 = N->getOperand(1);
52598 SDLoc DL(N);
52599 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
52600
52601 // If we don't have to respect NaN inputs, this is a direct translation to x86
52602 // min/max instructions.
52603 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
52604 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
52605
52606 // If one of the operands is known non-NaN use the native min/max instructions
52607 // with the non-NaN input as second operand.
52608 if (DAG.isKnownNeverNaN(Op1))
52609 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
52610 if (DAG.isKnownNeverNaN(Op0))
52611 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
52612
52613 // If we have to respect NaN inputs, this takes at least 3 instructions.
52614 // Favor a library call when operating on a scalar and minimizing code size.
52615 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
52616 return SDValue();
52617
52618 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
52619 VT);
52620
52621 // There are 4 possibilities involving NaN inputs, and these are the required
52622 // outputs:
52623 // Op1
52624 // Num NaN
52625 // ----------------
52626 // Num | Max | Op0 |
52627 // Op0 ----------------
52628 // NaN | Op1 | NaN |
52629 // ----------------
52630 //
52631 // The SSE FP max/min instructions were not designed for this case, but rather
52632 // to implement:
52633 // Min = Op1 < Op0 ? Op1 : Op0
52634 // Max = Op1 > Op0 ? Op1 : Op0
52635 //
52636 // So they always return Op0 if either input is a NaN. However, we can still
52637 // use those instructions for fmaxnum by selecting away a NaN input.
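// Illustrative example: fmaxnum(NaN, 3.0) must return 3.0. FMAX(3.0, NaN)
// passes Op0 (the NaN) through, but setuo(NaN, NaN) is true, so the select
// built below picks Op1 (3.0), which is the required result.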
52638
52639 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
52640 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
52641 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
52642
52643 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
52644 // are NaN, the NaN value of Op1 is the result.
52645 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
52646}
52647
52648 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
52649 TargetLowering::DAGCombinerInfo &DCI) {
52650 EVT VT = N->getValueType(0);
52651 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52652
52653 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
52654 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
52655 return SDValue(N, 0);
52656
52657 // Convert a full vector load into vzload when not all bits are needed.
52658 SDValue In = N->getOperand(0);
52659 MVT InVT = In.getSimpleValueType();
52660 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
52661 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
52662 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
52663 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
52664 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
52665 MVT MemVT = MVT::getIntegerVT(NumBits);
52666 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
52667 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
52668 SDLoc dl(N);
52669 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
52670 DAG.getBitcast(InVT, VZLoad));
52671 DCI.CombineTo(N, Convert);
52672 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52673 DCI.recursivelyDeleteUnusedNodes(LN);
52674 return SDValue(N, 0);
52675 }
52676 }
52677
52678 return SDValue();
52679}
52680
52681 static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
52682 TargetLowering::DAGCombinerInfo &DCI) {
52683 bool IsStrict = N->isTargetStrictFPOpcode();
52684 EVT VT = N->getValueType(0);
52685
52686 // Convert a full vector load into vzload when not all bits are needed.
52687 SDValue In = N->getOperand(IsStrict ? 1 : 0);
52688 MVT InVT = In.getSimpleValueType();
52689 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
52690 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
52691 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
52692 LoadSDNode *LN = cast<LoadSDNode>(In);
52693 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
52694 MVT MemVT = MVT::getFloatingPointVT(NumBits);
52695 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
52696 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
52697 SDLoc dl(N);
52698 if (IsStrict) {
52699 SDValue Convert =
52700 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
52701 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
52702 DCI.CombineTo(N, Convert, Convert.getValue(1));
52703 } else {
52704 SDValue Convert =
52705 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
52706 DCI.CombineTo(N, Convert);
52707 }
52708 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52709 DCI.recursivelyDeleteUnusedNodes(LN);
52710 return SDValue(N, 0);
52711 }
52712 }
52713
52714 return SDValue();
52715}
52716
52717/// Do target-specific dag combines on X86ISD::ANDNP nodes.
52718 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
52719 TargetLowering::DAGCombinerInfo &DCI,
52720 const X86Subtarget &Subtarget) {
52721 SDValue N0 = N->getOperand(0);
52722 SDValue N1 = N->getOperand(1);
52723 MVT VT = N->getSimpleValueType(0);
52724 int NumElts = VT.getVectorNumElements();
52725 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52726 SDLoc DL(N);
52727
52728 // ANDNP(undef, x) -> 0
52729 // ANDNP(x, undef) -> 0
52730 if (N0.isUndef() || N1.isUndef())
52731 return DAG.getConstant(0, DL, VT);
52732
52733 // ANDNP(0, x) -> x
52734 if (ISD::isBuildVectorAllZeros(N0.getNode()))
52735 return N1;
52736
52737 // ANDNP(x, 0) -> 0
52738 if (ISD::isBuildVectorAllZeros(N1.getNode()))
52739 return DAG.getConstant(0, DL, VT);
52740
52741 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
52742 if (ISD::isBuildVectorAllOnes(N1.getNode()))
52743 return DAG.getNOT(DL, N0, VT);
52744
52745 // Turn ANDNP back to AND if input is inverted.
52746 if (SDValue Not = IsNOT(N0, DAG))
52747 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
52748
52749 // Fold for better commutativity:
52750 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
52751 if (N1->hasOneUse())
52752 if (SDValue Not = IsNOT(N1, DAG))
52753 return DAG.getNOT(
52754 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
52755
52756 // Constant Folding
52757 APInt Undefs0, Undefs1;
52758 SmallVector<APInt> EltBits0, EltBits1;
52759 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
52760 /*AllowWholeUndefs*/ true,
52761 /*AllowPartialUndefs*/ true)) {
52762 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
52763 /*AllowWholeUndefs*/ true,
52764 /*AllowPartialUndefs*/ true)) {
52765 SmallVector<APInt> ResultBits;
52766 for (int I = 0; I != NumElts; ++I)
52767 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
52768 return getConstVector(ResultBits, VT, DAG, DL);
52769 }
52770
52771 // Constant fold NOT(N0) to allow us to use AND.
52772 // Ensure this is only performed if we can confirm that the bitcasted source
52773 // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
52774 if (N0->hasOneUse()) {
52775 SDValue BC0 = peekThroughOneUseBitcasts(N0);
52776 if (BC0.getOpcode() != ISD::BITCAST) {
52777 for (APInt &Elt : EltBits0)
52778 Elt = ~Elt;
52779 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
52780 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
52781 }
52782 }
52783 }
52784
52785 // Attempt to recursively combine a bitmask ANDNP with shuffles.
52786 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52787 SDValue Op(N, 0);
52788 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52789 return Res;
52790
52791 // If either operand is a constant mask, then only the elements that aren't
52792 // zero are actually demanded by the other operand.
52793 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
52794 APInt UndefElts;
52795 SmallVector<APInt> EltBits;
52796 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
52797 APInt DemandedElts = APInt::getAllOnes(NumElts);
52798 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
52799 EltBits)) {
52800 DemandedBits.clearAllBits();
52801 DemandedElts.clearAllBits();
52802 for (int I = 0; I != NumElts; ++I) {
52803 if (UndefElts[I]) {
52804 // We can't assume an undef src element gives an undef dst - the
52805 // other src might be zero.
52806 DemandedBits.setAllBits();
52807 DemandedElts.setBit(I);
52808 } else if ((Invert && !EltBits[I].isAllOnes()) ||
52809 (!Invert && !EltBits[I].isZero())) {
52810 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
52811 DemandedElts.setBit(I);
52812 }
52813 }
52814 }
52815 return std::make_pair(DemandedBits, DemandedElts);
52816 };
52817 APInt Bits0, Elts0;
52818 APInt Bits1, Elts1;
52819 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
52820 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
52821
52822 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52823 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
52824 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
52825 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
52826 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52827 if (N->getOpcode() != ISD::DELETED_NODE)
52828 DCI.AddToWorklist(N);
52829 return SDValue(N, 0);
52830 }
52831 }
52832
52833 return SDValue();
52834}
52835
52836 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
52837 TargetLowering::DAGCombinerInfo &DCI) {
52838 SDValue N1 = N->getOperand(1);
52839
52840 // BT ignores high bits in the bit index operand.
52841 unsigned BitWidth = N1.getValueSizeInBits();
52842 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
52843 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
52844 if (N->getOpcode() != ISD::DELETED_NODE)
52845 DCI.AddToWorklist(N);
52846 return SDValue(N, 0);
52847 }
52848
52849 return SDValue();
52850}
52851
52852 static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
52853 TargetLowering::DAGCombinerInfo &DCI) {
52854 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
52855 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
52856
52857 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
52858 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52859 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
52860 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
52861 if (N->getOpcode() != ISD::DELETED_NODE)
52862 DCI.AddToWorklist(N);
52863 return SDValue(N, 0);
52864 }
52865
52866 // Convert a full vector load into vzload when not all bits are needed.
52867 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
52868 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
52869 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
52870 SDLoc dl(N);
52871 if (IsStrict) {
52872 SDValue Convert = DAG.getNode(
52873 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
52874 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
52875 DCI.CombineTo(N, Convert, Convert.getValue(1));
52876 } else {
52877 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
52878 DAG.getBitcast(MVT::v8i16, VZLoad));
52879 DCI.CombineTo(N, Convert);
52880 }
52881
52882 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52883 DCI.recursivelyDeleteUnusedNodes(LN);
52884 return SDValue(N, 0);
52885 }
52886 }
52887 }
52888
52889 return SDValue();
52890}
52891
52892// Try to combine sext_in_reg of a cmov of constants by extending the constants.
52893 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
52894 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
52895
52896 EVT DstVT = N->getValueType(0);
52897
52898 SDValue N0 = N->getOperand(0);
52899 SDValue N1 = N->getOperand(1);
52900 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
52901
52902 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
52903 return SDValue();
52904
52905 // Look through single use any_extends / truncs.
52906 SDValue IntermediateBitwidthOp;
52907 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
52908 N0.hasOneUse()) {
52909 IntermediateBitwidthOp = N0;
52910 N0 = N0.getOperand(0);
52911 }
52912
52913 // See if we have a single use cmov.
52914 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
52915 return SDValue();
52916
52917 SDValue CMovOp0 = N0.getOperand(0);
52918 SDValue CMovOp1 = N0.getOperand(1);
52919
52920 // Make sure both operands are constants.
52921 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
52922 !isa<ConstantSDNode>(CMovOp1.getNode()))
52923 return SDValue();
52924
52925 SDLoc DL(N);
52926
52927 // If we looked through an any_extend/trunc above, add one to the constants.
52928 if (IntermediateBitwidthOp) {
52929 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
52930 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
52931 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
52932 }
52933
52934 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
52935 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
52936
52937 EVT CMovVT = DstVT;
52938 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
52939 if (DstVT == MVT::i16) {
52940 CMovVT = MVT::i32;
52941 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
52942 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
52943 }
52944
52945 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
52946 N0.getOperand(2), N0.getOperand(3));
52947
52948 if (CMovVT != DstVT)
52949 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
52950
52951 return CMov;
52952}
52953
52954 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
52955 const X86Subtarget &Subtarget) {
52956 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
52957
52958 if (SDValue V = combineSextInRegCmov(N, DAG))
52959 return V;
52960
52961 EVT VT = N->getValueType(0);
52962 SDValue N0 = N->getOperand(0);
52963 SDValue N1 = N->getOperand(1);
52964 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
52965 SDLoc dl(N);
52966
52967 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
52968 // SSE and AVX2 since there is no sign-extended shift right operation
52969 // on a vector with 64-bit elements.
52970 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
52971 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
52972 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
52973 N0.getOpcode() == ISD::SIGN_EXTEND)) {
52974 SDValue N00 = N0.getOperand(0);
52975
52976 // EXTLOAD has a better solution on AVX2,
52977 // it may be replaced with X86ISD::VSEXT node.
52978 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
52979 if (!ISD::isNormalLoad(N00.getNode()))
52980 return SDValue();
52981
52982 // Attempt to promote any comparison mask ops before the SIGN_EXTEND_INREG
52983 // gets in the way.
52984 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
52985 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
52986
52987 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
52988 SDValue Tmp =
52989 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
52990 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
52991 }
52992 }
52993 return SDValue();
52994}
52995
52996/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
52997/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
52998/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
52999/// opportunities to combine math ops, use an LEA, or use a complex addressing
53000/// mode. This can eliminate extend, add, and shift instructions.
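// Illustrative example: (i64 (sext (add nsw (i32 x), 5))) becomes
// (add nsw (i64 (sext x)), 5); a later shl/add user can then fold the
// extended constant into an LEA displacement.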
53001 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
53002 const X86Subtarget &Subtarget) {
53003 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
53004 Ext->getOpcode() != ISD::ZERO_EXTEND)
53005 return SDValue();
53006
53007 // TODO: This should be valid for other integer types.
53008 EVT VT = Ext->getValueType(0);
53009 if (VT != MVT::i64)
53010 return SDValue();
53011
53012 SDValue Add = Ext->getOperand(0);
53013 if (Add.getOpcode() != ISD::ADD)
53014 return SDValue();
53015
53016 SDValue AddOp0 = Add.getOperand(0);
53017 SDValue AddOp1 = Add.getOperand(1);
53018 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
53019 bool NSW = Add->getFlags().hasNoSignedWrap();
53020 bool NUW = Add->getFlags().hasNoUnsignedWrap();
53021 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
53022 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
53023
53024 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
53025 // into the 'zext'
53026 if ((Sext && !NSW) || (!Sext && !NUW))
53027 return SDValue();
53028
53029 // Having a constant operand to the 'add' ensures that we are not increasing
53030 // the instruction count because the constant is extended for free below.
53031 // A constant operand can also become the displacement field of an LEA.
53032 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
53033 if (!AddOp1C)
53034 return SDValue();
53035
53036 // Don't make the 'add' bigger if there's no hope of combining it with some
53037 // other 'add' or 'shl' instruction.
53038 // TODO: It may be profitable to generate simpler LEA instructions in place
53039 // of single 'add' instructions, but the cost model for selecting an LEA
53040 // currently has a high threshold.
53041 bool HasLEAPotential = false;
53042 for (auto *User : Ext->uses()) {
53043 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
53044 HasLEAPotential = true;
53045 break;
53046 }
53047 }
53048 if (!HasLEAPotential)
53049 return SDValue();
53050
53051 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
53052 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
53053 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
53054 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
53055
53056 // The wider add is guaranteed to not wrap because both operands are
53057 // sign-extended.
53058 SDNodeFlags Flags;
53059 Flags.setNoSignedWrap(NSW);
53060 Flags.setNoUnsignedWrap(NUW);
53061 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
53062}
53063
53064// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
53065// operands and the result of CMOV is not used anywhere else - promote CMOV
53066// itself instead of promoting its result. This could be beneficial, because:
53067// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
53068// (or more) pseudo-CMOVs only when they go one-after-another and
53069// getting rid of result extension code after CMOV will help that.
53070// 2) Promotion of constant CMOV arguments is free, hence the
53071// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
53072// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
53073// promotion is also good in terms of code-size.
53074// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
53075// promotion).
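// Illustrative example: (i32 (zext (i16 (cmov 7, 12, cond)))) becomes
// (i32 (cmov 7, 12, cond)); the constants are promoted for free and the
// extend disappears.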
53076 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
53077 SDValue CMovN = Extend->getOperand(0);
53078 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
53079 return SDValue();
53080
53081 EVT TargetVT = Extend->getValueType(0);
53082 unsigned ExtendOpcode = Extend->getOpcode();
53083 SDLoc DL(Extend);
53084
53085 EVT VT = CMovN.getValueType();
53086 SDValue CMovOp0 = CMovN.getOperand(0);
53087 SDValue CMovOp1 = CMovN.getOperand(1);
53088
53089 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
53090 !isa<ConstantSDNode>(CMovOp1.getNode()))
53091 return SDValue();
53092
53093 // Only extend to i32 or i64.
53094 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
53095 return SDValue();
53096
53097 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
53098 // are free.
53099 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
53100 return SDValue();
53101
53102 // If this is a zero extend to i64, we should only extend to i32 and use a free
53103 // zero extend to finish.
53104 EVT ExtendVT = TargetVT;
53105 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
53106 ExtendVT = MVT::i32;
53107
53108 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
53109 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
53110
53111 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
53112 CMovN.getOperand(2), CMovN.getOperand(3));
53113
53114 // Finish extending if needed.
53115 if (ExtendVT != TargetVT)
53116 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
53117
53118 return Res;
53119}
53120
53121// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
53122// result type.
53123 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
53124 const X86Subtarget &Subtarget) {
53125 SDValue N0 = N->getOperand(0);
53126 EVT VT = N->getValueType(0);
53127 SDLoc dl(N);
53128
53129 // Only do this combine with AVX512 for vector extends.
53130 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
53131 return SDValue();
53132
53133 // Only combine legal element types.
53134 EVT SVT = VT.getVectorElementType();
53135 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
53136 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
53137 return SDValue();
53138
53139 // We don't have a CMPP instruction for vxf16.
53140 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
53141 return SDValue();
53142 // We can only do this if the vector size is 256 bits or less.
53143 unsigned Size = VT.getSizeInBits();
53144 if (Size > 256 && Subtarget.useAVX512Regs())
53145 return SDValue();
53146
53147 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
53148 // those are the only integer compares we have.
53149 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
53150 if (ISD::isUnsignedIntSetCC(CC))
53151 return SDValue();
53152
53153 // Only do this combine if the extension will be fully consumed by the setcc.
53154 EVT N00VT = N0.getOperand(0).getValueType();
53155 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
53156 if (Size != MatchingVecType.getSizeInBits())
53157 return SDValue();
53158
53159 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
53160
53161 if (N->getOpcode() == ISD::ZERO_EXTEND)
53162 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
53163
53164 return Res;
53165}
53166
53167 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
53168 TargetLowering::DAGCombinerInfo &DCI,
53169 const X86Subtarget &Subtarget) {
53170 SDValue N0 = N->getOperand(0);
53171 EVT VT = N->getValueType(0);
53172 SDLoc DL(N);
53173
53174 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53175 if (!DCI.isBeforeLegalizeOps() &&
53176 N0.getOpcode() == X86ISD::SETCC_CARRY) {
53177 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
53178 N0->getOperand(1));
53179 bool ReplaceOtherUses = !N0.hasOneUse();
53180 DCI.CombineTo(N, Setcc);
53181 // Replace other uses with a truncate of the widened setcc_carry.
53182 if (ReplaceOtherUses) {
53183 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
53184 N0.getValueType(), Setcc);
53185 DCI.CombineTo(N0.getNode(), Trunc);
53186 }
53187
53188 return SDValue(N, 0);
53189 }
53190
53191 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
53192 return NewCMov;
53193
53194 if (!DCI.isBeforeLegalizeOps())
53195 return SDValue();
53196
53197 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
53198 return V;
53199
53200 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
53201 DAG, DCI, Subtarget))
53202 return V;
53203
53204 if (VT.isVector()) {
53205 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
53206 return R;
53207
53209 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
53210 }
53211
53212 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
53213 return NewAdd;
53214
53215 return SDValue();
53216}
53217
53218// Inverting a constant vector is profitable if it can be eliminated and the
53219// inverted vector is already present in DAG. Otherwise, it will be loaded
53220// anyway.
53221//
53222// We determine which of the values can be completely eliminated and invert it.
53223// If both are eliminable, select a vector with the first negative element.
53226 "ConstantFP build vector expected");
53227 // Check if we can eliminate V. We assume if a value is only used in FMAs, we
53228 // can eliminate it, since this function is invoked for each FMA with this
53229 // vector.
53230 auto IsNotFMA = [](SDNode *Use) {
53231 return Use->getOpcode() != ISD::FMA && Use->getOpcode() != ISD::STRICT_FMA;
53232 };
53233 if (llvm::any_of(V->uses(), IsNotFMA))
53234 return SDValue();
53235
53236 SmallVector<SDValue, 8> Ops;
53237 EVT VT = V.getValueType();
53238 EVT EltVT = VT.getVectorElementType();
53239 for (const SDValue &Op : V->op_values()) {
53240 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
53241 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
53242 } else {
53243 assert(Op.isUndef());
53244 Ops.push_back(DAG.getUNDEF(EltVT));
53245 }
53246 }
53247
53248 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
53249 if (!NV)
53250 return SDValue();
53251
53252 // If an inverted version cannot be eliminated, choose it instead of the
53253 // original version.
53254 if (llvm::any_of(NV->uses(), IsNotFMA))
53255 return SDValue(NV, 0);
53256
53257 // If the inverted version also can be eliminated, we have to consistently
53258 // prefer one of the values. We prefer a constant with a negative value on
53259 // the first place.
53260 // N.B. We need to skip undefs that may precede a value.
53261 for (const SDValue &Op : V->op_values()) {
53262 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
53263 if (Cst->isNegative())
53264 return SDValue();
53265 break;
53266 }
53267 }
53268 return SDValue(NV, 0);
53269}
53270
53271 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
53272 TargetLowering::DAGCombinerInfo &DCI,
53273 const X86Subtarget &Subtarget) {
53274 SDLoc dl(N);
53275 EVT VT = N->getValueType(0);
53276 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
53277
53278 // Let legalize expand this if it isn't a legal type yet.
53279 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53280 if (!TLI.isTypeLegal(VT))
53281 return SDValue();
53282
53283 SDValue A = N->getOperand(IsStrict ? 1 : 0);
53284 SDValue B = N->getOperand(IsStrict ? 2 : 1);
53285 SDValue C = N->getOperand(IsStrict ? 3 : 2);
53286
53287 // If the operation allows fast-math and the target does not support FMA,
53288 // split this into mul+add to avoid libcall(s).
53289 SDNodeFlags Flags = N->getFlags();
53290 if (!IsStrict && Flags.hasAllowReassociation() &&
53291 TLI.isOperationExpand(ISD::FMA, VT)) {
53292 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
53293 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
53294 }
53295
53296 EVT ScalarVT = VT.getScalarType();
53297 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
53298 !Subtarget.hasAnyFMA()) &&
53299 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
53300 return SDValue();
53301
53302 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
53303 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53304 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53305 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
53306 CodeSize)) {
53307 V = NegV;
53308 return true;
53309 }
53310 // Look through extract_vector_elts. If it comes from an FNEG, create a
53311 // new extract from the FNEG input.
53312 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
53313 isNullConstant(V.getOperand(1))) {
53314 SDValue Vec = V.getOperand(0);
53315 if (SDValue NegV = TLI.getCheaperNegatedExpression(
53316 Vec, DAG, LegalOperations, CodeSize)) {
53317 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
53318 NegV, V.getOperand(1));
53319 return true;
53320 }
53321 }
53322 // Lookup if there is an inverted version of constant vector V in DAG.
53323 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
53324 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
53325 V = NegV;
53326 return true;
53327 }
53328 }
53329 return false;
53330 };
53331
53332 // Do not convert the passthru input of scalar intrinsics.
53333 // FIXME: We could allow negations of the lower element only.
53334 bool NegA = invertIfNegative(A);
53335 bool NegB = invertIfNegative(B);
53336 bool NegC = invertIfNegative(C);
53337
53338 if (!NegA && !NegB && !NegC)
53339 return SDValue();
53340
53341 unsigned NewOpcode =
53342 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
53343
53344 // Propagate fast-math-flags to new FMA node.
53345 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
53346 if (IsStrict) {
53347 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
53348 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
53349 {N->getOperand(0), A, B, C});
53350 } else {
53351 if (N->getNumOperands() == 4)
53352 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
53353 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
53354 }
53355}
53356
53357// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
53358// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
53359 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
53360 TargetLowering::DAGCombinerInfo &DCI) {
53361 SDLoc dl(N);
53362 EVT VT = N->getValueType(0);
53363 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53364 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53365 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53366
53367 SDValue N2 = N->getOperand(2);
53368
53369 SDValue NegN2 =
53370 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
53371 if (!NegN2)
53372 return SDValue();
53373 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
53374
53375 if (N->getNumOperands() == 4)
53376 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53377 NegN2, N->getOperand(3));
53378 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53379 NegN2);
53380}
53381
53382 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
53383 TargetLowering::DAGCombinerInfo &DCI,
53384 const X86Subtarget &Subtarget) {
53385 SDLoc dl(N);
53386 SDValue N0 = N->getOperand(0);
53387 EVT VT = N->getValueType(0);
53388
53389 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53390 // FIXME: Is this needed? We don't seem to have any tests for it.
53391 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
53392 N0.getOpcode() == X86ISD::SETCC_CARRY) {
53393 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
53394 N0->getOperand(1));
53395 bool ReplaceOtherUses = !N0.hasOneUse();
53396 DCI.CombineTo(N, Setcc);
53397 // Replace other uses with a truncate of the widened setcc_carry.
53398 if (ReplaceOtherUses) {
53399 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
53400 N0.getValueType(), Setcc);
53401 DCI.CombineTo(N0.getNode(), Trunc);
53402 }
53403
53404 return SDValue(N, 0);
53405 }
53406
53407 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
53408 return NewCMov;
53409
53410 if (DCI.isBeforeLegalizeOps())
53411 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
53412 return V;
53413
53414 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
53415 DAG, DCI, Subtarget))
53416 return V;
53417
53418 if (VT.isVector())
53419 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
53420 return R;
53421
53422 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
53423 return NewAdd;
53424
53425 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
53426 return R;
53427
53428 // TODO: Combine with any target/faux shuffle.
53429 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
53431 SDValue N00 = N0.getOperand(0);
53432 SDValue N01 = N0.getOperand(1);
53433 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
53434 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
53435 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
53436 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
53437 return concatSubVectors(N00, N01, DAG, dl);
53438 }
53439 }
53440
53441 return SDValue();
53442}
53443
53444/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
53445/// pre-promote its result type since vXi1 vectors don't get promoted
53446/// during type legalization.
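/// Illustrative example: with AVX512F but no BWI, a (v16i1 (setcc v16i8 a, b))
/// is emitted as (v16i1 (trunc (v16i8 (setcc a, b)))) so the compare keeps the
/// byte element type through type legalization.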
53447 static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
53448 SDValue RHS, ISD::CondCode CC,
53449 const SDLoc &DL, SelectionDAG &DAG,
53450 const X86Subtarget &Subtarget) {
53451 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
53452 VT.getVectorElementType() == MVT::i1 &&
53453 (OpVT.getVectorElementType() == MVT::i8 ||
53454 OpVT.getVectorElementType() == MVT::i16)) {
53455 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
53456 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
53457 }
53458 return SDValue();
53459}
53460
53461 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
53462 TargetLowering::DAGCombinerInfo &DCI,
53463 const X86Subtarget &Subtarget) {
53464 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
53465 const SDValue LHS = N->getOperand(0);
53466 const SDValue RHS = N->getOperand(1);
53467 EVT VT = N->getValueType(0);
53468 EVT OpVT = LHS.getValueType();
53469 SDLoc DL(N);
53470
53471 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
53472 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
53473 Subtarget))
53474 return V;
53475
53476 if (VT == MVT::i1) {
53477 X86::CondCode X86CC;
53478 if (SDValue V =
53479 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
53480 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
53481 }
53482
53483 if (OpVT.isScalarInteger()) {
53484 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
53485 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
53486 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
53487 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
53488 if (N0.getOperand(0) == N1)
53489 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
53490 N0.getOperand(1));
53491 if (N0.getOperand(1) == N1)
53492 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
53493 N0.getOperand(0));
53494 }
53495 return SDValue();
53496 };
53497 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
53498 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53499 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
53500 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53501
53502 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
53503 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
53504 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
53505 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
53506 if (N0.getOperand(0) == N1)
53507 return DAG.getNode(ISD::AND, DL, OpVT, N1,
53508 DAG.getNOT(DL, N0.getOperand(1), OpVT));
53509 if (N0.getOperand(1) == N1)
53510 return DAG.getNode(ISD::AND, DL, OpVT, N1,
53511 DAG.getNOT(DL, N0.getOperand(0), OpVT));
53512 }
53513 return SDValue();
53514 };
53515 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
53516 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53517 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
53518 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53519
53520 // cmpeq(trunc(x),C) --> cmpeq(x,C)
53521 // cmpne(trunc(x),C) --> cmpne(x,C)
53522 // iff x upper bits are zero.
53523 if (LHS.getOpcode() == ISD::TRUNCATE &&
53524 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
53525 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
53526 EVT SrcVT = LHS.getOperand(0).getValueType();
53527 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
53528 OpVT.getScalarSizeInBits());
53529 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53530 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
53531 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
53532 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
53533 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
53534 }
53535
53536 // With C as a power of 2 and C != 0 and C != INT_MIN:
53537 // icmp eq Abs(X) C ->
53538 // (icmp eq A, C) | (icmp eq A, -C)
53539 // icmp ne Abs(X) C ->
53540 // (icmp ne A, C) & (icmp ne A, -C)
53541 // Both of these patterns can be better optimized in
53542 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
53543 // integers which is checked above.
53544 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
53545 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
53546 const APInt &CInt = C->getAPIntValue();
53547 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
53548 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
53549 SDValue BaseOp = LHS.getOperand(0);
53550 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
53551 SDValue SETCC1 = DAG.getSetCC(
53552 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
53553 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
53554 SETCC0, SETCC1);
53555 }
53556 }
53557 }
53558 }
53559 }
53560
53561 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
53562 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
53563 // Using temporaries to avoid messing up operand ordering for later
53564 // transformations if this doesn't work.
53565 SDValue Op0 = LHS;
53566 SDValue Op1 = RHS;
53567 ISD::CondCode TmpCC = CC;
53568 // Put build_vector on the right.
53569 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
53570 std::swap(Op0, Op1);
53571 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
53572 }
53573
53574 bool IsSEXT0 =
53575 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
53576 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
53577 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
53578
53579 if (IsSEXT0 && IsVZero1) {
53580 assert(VT == Op0.getOperand(0).getValueType() &&
53581 "Unexpected operand type");
53582 if (TmpCC == ISD::SETGT)
53583 return DAG.getConstant(0, DL, VT);
53584 if (TmpCC == ISD::SETLE)
53585 return DAG.getConstant(1, DL, VT);
53586 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
53587 return DAG.getNOT(DL, Op0.getOperand(0), VT);
53588
53589 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
53590 "Unexpected condition code!");
53591 return Op0.getOperand(0);
53592 }
53593 }
53594
53595 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets there
53596 // are only signed comparisons (`PCMPGT`), and on AVX512 it is often better to
53597 // use `PCMPGT` if the result is meant to stay in a vector (and if it is going to
53598 // a mask, there are signed AVX512 comparisons).
53599 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
53600 bool CanMakeSigned = false;
53601 if (ISD::isUnsignedIntSetCC(CC)) {
53602 KnownBits CmpKnown =
53603 DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
53604 // If we know LHS/RHS share the same sign bit at each element we can
53605 // make this signed.
53606 // NOTE: `computeKnownBits` on a vector type aggregates common bits
53607 // across all lanes. So a pattern where the sign varies from lane to
53608 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
53609 // missed. We could get around this by demanding each lane
53610 // independently, but this isn't the most important optimization and
53611 // that may eat into compile time.
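// Illustrative example: if both operands are zero extended from a narrower
// type, their sign bits are known zero in every lane, so an unsigned compare
// such as setult is equivalent to the signed setlt.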
53612 CanMakeSigned =
53613 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
53614 }
53615 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
53616 SDValue LHSOut = LHS;
53617 SDValue RHSOut = RHS;
53618 ISD::CondCode NewCC = CC;
53619 switch (CC) {
53620 case ISD::SETGE:
53621 case ISD::SETUGE:
53622 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
53623 /*NSW*/ true))
53624 LHSOut = NewLHS;
53625 else if (SDValue NewRHS = incDecVectorConstant(
53626 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
53627 RHSOut = NewRHS;
53628 else
53629 break;
53630
53631 [[fallthrough]];
53632 case ISD::SETUGT:
53633 NewCC = ISD::SETGT;
53634 break;
53635
53636 case ISD::SETLE:
53637 case ISD::SETULE:
53638 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
53639 /*NSW*/ true))
53640 LHSOut = NewLHS;
53641 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
53642 /*NSW*/ true))
53643 RHSOut = NewRHS;
53644 else
53645 break;
53646
53647 [[fallthrough]];
53648 case ISD::SETULT:
53649 // Will be swapped to SETGT in LowerVSETCC*.
53650 NewCC = ISD::SETLT;
53651 break;
53652 default:
53653 break;
53654 }
53655 if (NewCC != CC) {
53656 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
53657 NewCC, DL, DAG, Subtarget))
53658 return R;
53659 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
53660 }
53661 }
53662 }
53663
53664 if (SDValue R =
53665 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
53666 return R;
53667
53668 // In the middle end transforms:
53669 // `(or (icmp eq X, C), (icmp eq X, C+1))`
53670 // -> `(icmp ult (add x, -C), 2)`
53671 // Likewise inverted cases with `ugt`.
53672 //
53673 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
53674 // in worse codegen. So, undo the middle-end transform and go back to `(or
53675 // (icmp eq), (icmp eq))` form.
53676 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
53677 // the xmm approach.
53678 //
53679 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
53680 // ne))` as it doesn't end up being a net instruction-count win.
53681 // TODO: We might want to do this for avx512 as well if we `sext` the result.
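// Illustrative example: (setult (add x, -5), 2), the middle-end form of
// x == 5 || x == 6, is rewritten back to (or (seteq x, 5), (seteq x, 6)),
// which maps directly onto PCMPEQ.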
53682 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
53683 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
53684 !Subtarget.hasAVX512() &&
53685 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
53686 Subtarget.hasAVX2()) &&
53687 LHS.hasOneUse()) {
53688
53689 APInt CmpC;
53690 SDValue AddC = LHS.getOperand(1);
53691 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
53693 // See which form we have depending on the constant/condition.
53694 SDValue C0 = SDValue();
53695 SDValue C1 = SDValue();
53696
53697 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
53698 // we will end up generating an additional constant. Keeping it in the
53699 // current form has a slight latency cost, but it is probably worth saving a
53700 // constant.
53703 // Pass
53704 }
53705 // Normal Cases
53706 else if ((CC == ISD::SETULT && CmpC == 2) ||
53707 (CC == ISD::SETULE && CmpC == 1)) {
53708 // These will constant fold.
53709 C0 = DAG.getNegative(AddC, DL, OpVT);
53710 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
53711 DAG.getAllOnesConstant(DL, OpVT));
53712 }
53713 // Inverted Cases
53714 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
53715 (CC == ISD::SETUGE && (-CmpC) == 2)) {
53716 // These will constant fold.
53717 C0 = DAG.getNOT(DL, AddC, OpVT);
53718 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
53719 DAG.getAllOnesConstant(DL, OpVT));
53720 }
53721 if (C0 && C1) {
53722 SDValue NewLHS =
53723 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
53724 SDValue NewRHS =
53725 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
53726 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
53727 }
53728 }
53729 }
53730
53731 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
53732 // to avoid scalarization via legalization because v4i32 is not a legal type.
53733 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
53734 LHS.getValueType() == MVT::v4f32)
53735 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
53736
53737 // X pred 0.0 --> X pred -X
53738 // If the negation of X already exists, use it in the comparison. This removes
53739 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
53740 // instructions in patterns with a 'select' node.
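// Illustrative example: (setolt x, 0.0) becomes (setolt x, (fneg x)) when an
// fneg of x already exists in the DAG; the comparison result is unchanged and
// no 0.0 constant needs to be materialized.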
53742 SDVTList FNegVT = DAG.getVTList(OpVT);
53743 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
53744 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
53745 }
53746
53747 return SDValue();
53748}
53749
53750 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
53751 TargetLowering::DAGCombinerInfo &DCI,
53752 const X86Subtarget &Subtarget) {
53753 SDValue Src = N->getOperand(0);
53754 MVT SrcVT = Src.getSimpleValueType();
53755 MVT VT = N->getSimpleValueType(0);
53756 unsigned NumBits = VT.getScalarSizeInBits();
53757 unsigned NumElts = SrcVT.getVectorNumElements();
53758 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
53759 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
53760
53761 // Perform constant folding.
53762 APInt UndefElts;
53763 SmallVector<APInt, 32> EltBits;
53764 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
53765 /*AllowWholeUndefs*/ true,
53766 /*AllowPartialUndefs*/ true)) {
53767 APInt Imm(32, 0);
53768 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
53769 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
53770 Imm.setBit(Idx);
53771
53772 return DAG.getConstant(Imm, SDLoc(N), VT);
53773 }
53774
53775 // Look through int->fp bitcasts that don't change the element width.
53776 unsigned EltWidth = SrcVT.getScalarSizeInBits();
53777 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
53778 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
53779 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
53780
53781 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
53782 // with scalar comparisons.
53783 if (SDValue NotSrc = IsNOT(Src, DAG)) {
53784 SDLoc DL(N);
53785 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
53786 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
53787 return DAG.getNode(ISD::XOR, DL, VT,
53788 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
53789 DAG.getConstant(NotMask, DL, VT));
53790 }
53791
53792 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
53793 // results with scalar comparisons.
53794 if (Src.getOpcode() == X86ISD::PCMPGT &&
53795 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
53796 SDLoc DL(N);
53797 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
53798 return DAG.getNode(ISD::XOR, DL, VT,
53799 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
53800 DAG.getConstant(NotMask, DL, VT));
53801 }
53802
53803 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
53804 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
53805 // iff pow2splat(c1).
53806 // Use KnownBits to determine if only a single bit is non-zero
53807 // in each element (pow2 or zero), and shift that bit to the msb.
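// Illustrative example for v16i8 with c1 = splat(0x10): only bit 4 can be
// non-zero in and(x,c1), so shifting left by 3 moves it to the sign bit and
// movmsk(icmp_eq(and(x,c1),c1)) becomes movmsk(shl(x,3)).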
53808 if (Src.getOpcode() == X86ISD::PCMPEQ) {
53809 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
53810 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
53811 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
53812 if (KnownLHS.countMaxPopulation() == 1 &&
53813 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
53814 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
53815 SDLoc DL(N);
53816 MVT ShiftVT = SrcVT;
53817 SDValue ShiftLHS = Src.getOperand(0);
53818 SDValue ShiftRHS = Src.getOperand(1);
53819 if (ShiftVT.getScalarType() == MVT::i8) {
53820 // vXi8 shifts - we only care about the signbit so can use PSLLW.
53821 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
53822 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
53823 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
53824 }
53825 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
53826 ShiftLHS, ShiftAmt, DAG);
53827 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
53828 ShiftRHS, ShiftAmt, DAG);
53829 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
53830 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
53831 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
53832 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
53833 }
53834 }
53835
53836 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
53837 if (N->isOnlyUserOf(Src.getNode())) {
53838 SDValue SrcBC = peekThroughBitcasts(Src);
53839 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
53840 APInt UndefElts;
53841 SmallVector<APInt, 32> EltBits;
53842 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
53843 UndefElts, EltBits)) {
53844 APInt Mask = APInt::getZero(NumBits);
53845 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
53846 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
53847 Mask.setBit(Idx);
53848 }
53849 SDLoc DL(N);
53850 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
53851 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
53852 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
53853 DAG.getConstant(Mask, DL, VT));
53854 }
53855 }
53856 }
53857
53858 // Simplify the inputs.
53859 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53860 APInt DemandedMask(APInt::getAllOnes(NumBits));
53861 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53862 return SDValue(N, 0);
53863
53864 return SDValue();
53865}
53866
53867 static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
53868 TargetLowering::DAGCombinerInfo &DCI,
53869 const X86Subtarget &Subtarget) {
53870 MVT VT = N->getSimpleValueType(0);
53871 unsigned NumBits = VT.getScalarSizeInBits();
53872
53873 // Simplify the inputs.
53874 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53875 APInt DemandedMask(APInt::getAllOnes(NumBits));
53876 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53877 return SDValue(N, 0);
53878
53879 return SDValue();
53880}
53881
53882 static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
53883 TargetLowering::DAGCombinerInfo &DCI) {
53884 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
53885 SDValue Mask = MemOp->getMask();
53886
53887 // With vector masks we only demand the upper bit of the mask.
53888 if (Mask.getScalarValueSizeInBits() != 1) {
53889 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53890 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
53891 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
53892 if (N->getOpcode() != ISD::DELETED_NODE)
53893 DCI.AddToWorklist(N);
53894 return SDValue(N, 0);
53895 }
53896 }
53897
53898 return SDValue();
53899}
53900
53901 static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
53902 SDValue Index, SDValue Base, SDValue Scale,
53903 SelectionDAG &DAG) {
53904 SDLoc DL(GorS);
53905
53906 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
53907 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
53908 Gather->getMask(), Base, Index, Scale } ;
53909 return DAG.getMaskedGather(Gather->getVTList(),
53910 Gather->getMemoryVT(), DL, Ops,
53911 Gather->getMemOperand(),
53912 Gather->getIndexType(),
53913 Gather->getExtensionType());
53914 }
53915 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
53916 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
53917 Scatter->getMask(), Base, Index, Scale };
53918 return DAG.getMaskedScatter(Scatter->getVTList(),
53919 Scatter->getMemoryVT(), DL,
53920 Ops, Scatter->getMemOperand(),
53921 Scatter->getIndexType(),
53922 Scatter->isTruncatingStore());
53923}
53924
53925 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
53926 TargetLowering::DAGCombinerInfo &DCI) {
53927 SDLoc DL(N);
53928 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
53929 SDValue Index = GorS->getIndex();
53930 SDValue Base = GorS->getBasePtr();
53931 SDValue Scale = GorS->getScale();
53932 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53933
53934 if (DCI.isBeforeLegalize()) {
53935 unsigned IndexWidth = Index.getScalarValueSizeInBits();
53936
53937 // Shrink constant indices if they are larger than 32-bits.
53938 // Only do this before legalize types since v2i64 could become v2i32.
53939 // FIXME: We could check that the type is legal if we're after legalize
53940 // types, but then we would need to construct test cases where that happens.
53941 // FIXME: We could support more than just constant vectors, but we need to
53942 // be careful with costing. A truncate that can be optimized out would be fine.
53943 // Otherwise we might only want to create a truncate if it avoids a split.
53944 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
53945 if (BV->isConstant() && IndexWidth > 32 &&
53946 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
53947 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
53948 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
53949 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53950 }
53951 }
53952
53953 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
53954 // there are sufficient sign bits. Only do this before legalize types to
53955 // avoid creating illegal types in truncate.
53956 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
53957 Index.getOpcode() == ISD::ZERO_EXTEND) &&
53958 IndexWidth > 32 &&
53959 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
53960 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
53961 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
53962 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
53963 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53964 }
53965 }
53966
53967 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53968 // Try to move splat constant adders from the index operand to the base
53969 // pointer operand. Taking care to multiply by the scale. We can only do
53970 // this when index element type is the same as the pointer type.
53971 // Otherwise we need to be sure the math doesn't wrap before the scale.
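// For example (illustrative values): gather(Base, add(Index, splat(4)),
// Scale=8) can become gather(Base + 32, Index, Scale=8), since each lane
// addresses Base + (Index[i] + 4) * 8 = (Base + 32) + Index[i] * 8.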
53972 if (Index.getOpcode() == ISD::ADD &&
53973 Index.getValueType().getVectorElementType() == PtrVT &&
53974 isa<ConstantSDNode>(Scale)) {
53975 uint64_t ScaleAmt = Scale->getAsZExtVal();
53976 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
53977 BitVector UndefElts;
53978 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
53979 // FIXME: Allow non-constant?
53980 if (UndefElts.none()) {
53981 // Apply the scale.
53982 APInt Adder = C->getAPIntValue() * ScaleAmt;
53983 // Add it to the existing base.
53984 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
53985 DAG.getConstant(Adder, DL, PtrVT));
53986 Index = Index.getOperand(0);
53987 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53988 }
53989 }
53990
53991 // It's also possible base is just a constant. In that case, just
53992 // replace it with 0 and move the displacement into the index.
53993 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
53994 isOneConstant(Scale)) {
53995 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
53996 // Combine the constant build_vector and the constant base.
53997 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
53998 Index.getOperand(1), Splat);
53999 // Add to the LHS of the original Index add.
54000 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54001 Index.getOperand(0), Splat);
54002 Base = DAG.getConstant(0, DL, Base.getValueType());
54003 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54004 }
54005 }
54006 }
54007
54008 if (DCI.isBeforeLegalizeOps()) {
54009 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54010
54011 // Make sure the index is either i32 or i64
54012 if (IndexWidth != 32 && IndexWidth != 64) {
54013 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
54014 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
54015 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
54016 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54017 }
54018 }
54019
54020 // With vector masks we only demand the upper bit of the mask.
54021 SDValue Mask = GorS->getMask();
54022 if (Mask.getScalarValueSizeInBits() != 1) {
54023 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54024 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54025 if (N->getOpcode() != ISD::DELETED_NODE)
54026 DCI.AddToWorklist(N);
54027 return SDValue(N, 0);
54028 }
54029 }
54030
54031 return SDValue();
54032}
54033
54034// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
54035static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
54036 const X86Subtarget &Subtarget) {
54037 SDLoc DL(N);
54038 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
54039 SDValue EFLAGS = N->getOperand(1);
54040
54041 // Try to simplify the EFLAGS and condition code operands.
54042 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
54043 return getSETCC(CC, Flags, DL, DAG);
54044
54045 return SDValue();
54046}
54047
54048/// Optimize branch condition evaluation.
54049static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
54050 const X86Subtarget &Subtarget) {
54051 SDLoc DL(N);
54052 SDValue EFLAGS = N->getOperand(3);
54053 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
54054
54055 // Try to simplify the EFLAGS and condition code operands.
54056 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
54057 // RAUW them under us.
54058 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
54059 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
54060 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
54061 N->getOperand(1), Cond, Flags);
54062 }
54063
54064 return SDValue();
54065}
54066
54067// TODO: Could we move this to DAGCombine?
54068static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
54069 SelectionDAG &DAG) {
54070 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
54071 // to optimize away operation when it's from a constant.
54072 //
54073 // The general transformation is:
54074 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
54075 // AND(VECTOR_CMP(x,y), constant2)
54076 // constant2 = UNARYOP(constant)
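// Illustrative instance with sint_to_fp as the unary op:
//   sint_to_fp(and(vector_cmp(x,y), <1,1,1,1>))
//     --> bitcast(and(vector_cmp(x,y), bitcast(<1.0f,1.0f,1.0f,1.0f>)))
// Each cmp lane is 0 or all-ones, so the AND yields 0 or the constant, and
// converting 0 gives 0.0f, whose bit pattern is also all zeros.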
54077
54078 // Early exit if this isn't a vector operation, the operand of the
54079 // unary operation isn't a bitwise AND, or if the sizes of the operations
54080 // aren't the same.
54081 EVT VT = N->getValueType(0);
54082 bool IsStrict = N->isStrictFPOpcode();
54083 unsigned NumEltBits = VT.getScalarSizeInBits();
54084 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54085 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
54086 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
54087 VT.getSizeInBits() != Op0.getValueSizeInBits())
54088 return SDValue();
54089
54090 // Now check that the other operand of the AND is a constant. We could
54091 // make the transformation for non-constant splats as well, but it's unclear
54092 // that would be a benefit as it would not eliminate any operations, just
54093 // perform one more step in scalar code before moving to the vector unit.
54094 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
54095 // Bail out if the vector isn't a constant.
54096 if (!BV->isConstant())
54097 return SDValue();
54098
54099 // Everything checks out. Build up the new and improved node.
54100 SDLoc DL(N);
54101 EVT IntVT = BV->getValueType(0);
54102 // Create a new constant of the appropriate type for the transformed
54103 // DAG.
54104 SDValue SourceConst;
54105 if (IsStrict)
54106 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
54107 {N->getOperand(0), SDValue(BV, 0)});
54108 else
54109 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
54110 // The AND node needs bitcasts to/from an integer vector type around it.
54111 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
54112 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
54113 MaskConst);
54114 SDValue Res = DAG.getBitcast(VT, NewAnd);
54115 if (IsStrict)
54116 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
54117 return Res;
54118 }
54119
54120 return SDValue();
54121}
54122
54123/// If we are converting a value to floating-point, try to replace scalar
54124/// truncate of an extracted vector element with a bitcast. This tries to keep
54125/// the sequence on XMM registers rather than moving between vector and GPRs.
54126static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
54127 // TODO: This is currently only used by combineSIntToFP, but it is generalized
54128 // to allow being called by any similar cast opcode.
54129 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
54130 SDValue Trunc = N->getOperand(0);
54131 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
54132 return SDValue();
54133
54134 SDValue ExtElt = Trunc.getOperand(0);
54135 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54136 !isNullConstant(ExtElt.getOperand(1)))
54137 return SDValue();
54138
54139 EVT TruncVT = Trunc.getValueType();
54140 EVT SrcVT = ExtElt.getValueType();
54141 unsigned DestWidth = TruncVT.getSizeInBits();
54142 unsigned SrcWidth = SrcVT.getSizeInBits();
54143 if (SrcWidth % DestWidth != 0)
54144 return SDValue();
54145
54146 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
54147 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
54148 unsigned VecWidth = SrcVecVT.getSizeInBits();
54149 unsigned NumElts = VecWidth / DestWidth;
54150 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
54151 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
54152 SDLoc DL(N);
54153 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
54154 BitcastVec, ExtElt.getOperand(1));
54155 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
54156}
54157
54158static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
54159 const X86Subtarget &Subtarget) {
54160 bool IsStrict = N->isStrictFPOpcode();
54161 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54162 EVT VT = N->getValueType(0);
54163 EVT InVT = Op0.getValueType();
54164
54165 // Using i16 as an intermediate type is a bad idea, unless we have HW support
54166 // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
54167 // if hasFP16 support:
54168 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
54169 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
54170 // else
54171 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
54172 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
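// e.g. without FP16 support, (uint_to_fp (v8i8 X)) becomes
// (sint_to_fp (zero_extend X to v8i32)); the zero-extended value is always
// non-negative, so the signed conversion gives the same result.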
54173 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
54174 unsigned ScalarSize = InVT.getScalarSizeInBits();
54175 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
54176 ScalarSize >= 64)
54177 return SDValue();
54178 SDLoc dl(N);
54179 EVT DstVT =
54180 EVT::getVectorVT(*DAG.getContext(),
54181 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
54182 : ScalarSize < 32 ? MVT::i32
54183 : MVT::i64,
54184 InVT.getVectorNumElements());
54185 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
54186 if (IsStrict)
54187 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54188 {N->getOperand(0), P});
54189 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54190 }
54191
54192 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
54193 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
54194 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
54195 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
54196 VT.getScalarType() != MVT::f16) {
54197 SDLoc dl(N);
54198 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
54199 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
54200
54201 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
54202 if (IsStrict)
54203 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54204 {N->getOperand(0), P});
54205 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54206 }
54207
54208 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
54209 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
54210 // the optimization here.
54211 SDNodeFlags Flags = N->getFlags();
54212 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
54213 if (IsStrict)
54214 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
54215 {N->getOperand(0), Op0});
54216 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
54217 }
54218
54219 return SDValue();
54220}
54221
54222static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
54223 TargetLowering::DAGCombinerInfo &DCI,
54224 const X86Subtarget &Subtarget) {
54225 // First try to optimize away the conversion entirely when it's
54226 // conditionally from a constant. Vectors only.
54227 bool IsStrict = N->isStrictFPOpcode();
54228 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
54229 return Res;
54230
54231 // Now move on to more general possibilities.
54232 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54233 EVT VT = N->getValueType(0);
54234 EVT InVT = Op0.getValueType();
54235
54236 // Using i16 as an intermediate type is a bad idea, unless we have HW support
54237 // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
54238 // if hasFP16 support:
54239 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
54240 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
54241 // else
54242 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
54243 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
54244 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
54245 unsigned ScalarSize = InVT.getScalarSizeInBits();
54246 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
54247 ScalarSize >= 64)
54248 return SDValue();
54249 SDLoc dl(N);
54250 EVT DstVT =
54251 EVT::getVectorVT(*DAG.getContext(),
54252 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
54253 : ScalarSize < 32 ? MVT::i32
54254 : MVT::i64,
54255 InVT.getVectorNumElements());
54256 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
54257 if (IsStrict)
54258 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54259 {N->getOperand(0), P});
54260 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54261 }
54262
54263 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
54264 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
54265 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
54266 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
54267 VT.getScalarType() != MVT::f16) {
54268 SDLoc dl(N);
54269 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
54270 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
54271 if (IsStrict)
54272 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54273 {N->getOperand(0), P});
54274 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54275 }
54276
54277 // Without AVX512DQ we only support i64 to float scalar conversion. For both
54278 // vectors and scalars, see if we know that the upper bits are all the sign
54279 // bit, in which case we can truncate the input to i32 and convert from that.
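// e.g. (sint_to_fp (v2i64 X)) where every element of X is known to be
// sign-extended from 32 bits can instead convert (trunc X to v2i32),
// avoiding the AVX512DQ-only 64-bit integer conversions.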
54280 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
54281 unsigned BitWidth = InVT.getScalarSizeInBits();
54282 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
54283 if (NumSignBits >= (BitWidth - 31)) {
54284 EVT TruncVT = MVT::i32;
54285 if (InVT.isVector())
54286 TruncVT = InVT.changeVectorElementType(TruncVT);
54287 SDLoc dl(N);
54288 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
54289 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
54290 if (IsStrict)
54291 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54292 {N->getOperand(0), Trunc});
54293 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
54294 }
54295 // If we're after legalize and the type is v2i32 we need to shuffle and
54296 // use CVTSI2P.
54297 assert(InVT == MVT::v2i64 && "Unexpected VT!");
54298 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
54299 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
54300 { 0, 2, -1, -1 });
54301 if (IsStrict)
54302 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
54303 {N->getOperand(0), Shuf});
54304 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
54305 }
54306 }
54307
54308 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
54309 // a 32-bit target where SSE doesn't support i64->FP operations.
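// e.g. on a 32-bit target, (sint_to_fp (i64 (load p))) can be emitted as a
// single x87 FILD of the 64-bit memory operand.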
54310 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
54311 Op0.getOpcode() == ISD::LOAD) {
54312 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
54313
54314 // This transformation is not supported if the result type is f16 or f128.
54315 if (VT == MVT::f16 || VT == MVT::f128)
54316 return SDValue();
54317
54318 // If we have AVX512DQ we can use packed conversion instructions unless
54319 // the VT is f80.
54320 if (Subtarget.hasDQI() && VT != MVT::f80)
54321 return SDValue();
54322
54323 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
54324 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
54325 std::pair<SDValue, SDValue> Tmp =
54326 Subtarget.getTargetLowering()->BuildFILD(
54327 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
54328 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
54329 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
54330 return Tmp.first;
54331 }
54332 }
54333
54334 if (IsStrict)
54335 return SDValue();
54336
54337 if (SDValue V = combineToFPTruncExtElt(N, DAG))
54338 return V;
54339
54340 return SDValue();
54341}
54342
54343static bool needCarryOrOverflowFlag(SDValue Flags) {
54344 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54345
54346 for (const SDNode *User : Flags->uses()) {
54347 X86::CondCode CC;
54348 switch (User->getOpcode()) {
54349 default:
54350 // Be conservative.
54351 return true;
54352 case X86ISD::SETCC:
54353 case X86ISD::SETCC_CARRY:
54354 CC = (X86::CondCode)User->getConstantOperandVal(0);
54355 break;
54356 case X86ISD::BRCOND:
54357 case X86ISD::CMOV:
54358 CC = (X86::CondCode)User->getConstantOperandVal(2);
54359 break;
54360 }
54361
54362 switch (CC) {
54363 // clang-format off
54364 default: break;
54365 case X86::COND_A: case X86::COND_AE:
54366 case X86::COND_B: case X86::COND_BE:
54367 case X86::COND_O: case X86::COND_NO:
54368 case X86::COND_G: case X86::COND_GE:
54369 case X86::COND_L: case X86::COND_LE:
54370 return true;
54371 // clang-format on
54372 }
54373 }
54374
54375 return false;
54376}
54377
54378static bool onlyZeroFlagUsed(SDValue Flags) {
54379 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54380
54381 for (const SDNode *User : Flags->uses()) {
54382 unsigned CCOpNo;
54383 switch (User->getOpcode()) {
54384 default:
54385 // Be conservative.
54386 return false;
54387 case X86ISD::SETCC:
54388 case X86ISD::SETCC_CARRY:
54389 CCOpNo = 0;
54390 break;
54391 case X86ISD::BRCOND:
54392 case X86ISD::CMOV:
54393 CCOpNo = 2;
54394 break;
54395 }
54396
54397 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
54398 if (CC != X86::COND_E && CC != X86::COND_NE)
54399 return false;
54400 }
54401
54402 return true;
54403}
54404
54405static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
54406 const X86Subtarget &Subtarget) {
54407 // Only handle test patterns.
54408 if (!isNullConstant(N->getOperand(1)))
54409 return SDValue();
54410
54411 // If we have a CMP of a truncated binop, see if we can make a smaller binop
54412 // and use its flags directly.
54413 // TODO: Maybe we should try promoting compares that only use the zero flag
54414 // first if we can prove the upper bits with computeKnownBits?
54415 SDLoc dl(N);
54416 SDValue Op = N->getOperand(0);
54417 EVT VT = Op.getValueType();
54418 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54419
54420 // If we have a constant logical shift that's only used in a comparison
54421 // against zero turn it into an equivalent AND. This allows turning it into
54422 // a TEST instruction later.
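// e.g. for i32: (cmp (srl X, 8), 0) only asks whether bits 8..31 of X are
// all zero, which is the same as (test X, 0xFFFFFF00).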
54423 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
54424 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
54425 onlyZeroFlagUsed(SDValue(N, 0))) {
54426 unsigned BitWidth = VT.getSizeInBits();
54427 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
54428 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
54429 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
54430 APInt Mask = Op.getOpcode() == ISD::SRL
54431 ? APInt::getHighBitsSet(BitWidth, MaskBits)
54432 : APInt::getLowBitsSet(BitWidth, MaskBits);
54433 if (Mask.isSignedIntN(32)) {
54434 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
54435 DAG.getConstant(Mask, dl, VT));
54436 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54437 DAG.getConstant(0, dl, VT));
54438 }
54439 }
54440 }
54441
54442 // If we're extracting from a avx512 bool vector and comparing against zero,
54443 // then try to just bitcast the vector to an integer to use TEST/BT directly.
54444 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
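// e.g. testing element 3 of a v16i1 mask becomes a compare of
// (and (bitcast v16i1 to i16), 8) against zero.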
54445 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
54446 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
54447 SDValue Src = Op.getOperand(0);
54448 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
54449 isNullConstant(Src.getOperand(1)) &&
54450 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
54451 SDValue BoolVec = Src.getOperand(0);
54452 unsigned ShAmt = 0;
54453 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
54454 ShAmt = BoolVec.getConstantOperandVal(1);
54455 BoolVec = BoolVec.getOperand(0);
54456 }
54457 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
54458 EVT VecVT = BoolVec.getValueType();
54459 unsigned BitWidth = VecVT.getVectorNumElements();
54460 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
54461 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
54462 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
54463 Op = DAG.getBitcast(BCVT, BoolVec);
54464 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
54465 DAG.getConstant(Mask, dl, BCVT));
54466 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54467 DAG.getConstant(0, dl, BCVT));
54468 }
54469 }
54470 }
54471
54472 // Peek through any zero-extend if we're only testing for a zero result.
54473 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
54474 SDValue Src = Op.getOperand(0);
54475 EVT SrcVT = Src.getValueType();
54476 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
54477 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
54478 DAG.getConstant(0, dl, SrcVT));
54479 }
54480
54481 // Look for a truncate.
54482 if (Op.getOpcode() != ISD::TRUNCATE)
54483 return SDValue();
54484
54485 SDValue Trunc = Op;
54486 Op = Op.getOperand(0);
54487
54488 // See if we can compare with zero against the truncation source,
54489 // which should help using the Z flag from many ops. Only do this for
54490 // i32 truncated op to prevent partial-reg compares of promoted ops.
54491 EVT OpVT = Op.getValueType();
54492 APInt UpperBits =
54493 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
54494 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
54495 onlyZeroFlagUsed(SDValue(N, 0))) {
54496 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54497 DAG.getConstant(0, dl, OpVT));
54498 }
54499
54500 // After this the truncate and arithmetic op must have a single use.
54501 if (!Trunc.hasOneUse() || !Op.hasOneUse())
54502 return SDValue();
54503
54504 unsigned NewOpc;
54505 switch (Op.getOpcode()) {
54506 default: return SDValue();
54507 case ISD::AND:
54508 // Skip and with constant. We have special handling for and with immediate
54509 // during isel to generate test instructions.
54510 if (isa<ConstantSDNode>(Op.getOperand(1)))
54511 return SDValue();
54512 NewOpc = X86ISD::AND;
54513 break;
54514 case ISD::OR: NewOpc = X86ISD::OR; break;
54515 case ISD::XOR: NewOpc = X86ISD::XOR; break;
54516 case ISD::ADD:
54517 // If the carry or overflow flag is used, we can't truncate.
54518 if (needCarryOrOverflowFlag(SDValue(N, 0)))
54519 return SDValue();
54520 NewOpc = X86ISD::ADD;
54521 break;
54522 case ISD::SUB:
54523 // If the carry or overflow flag is used, we can't truncate.
54524 if (needCarryOrOverflowFlag(SDValue(N, 0)))
54525 return SDValue();
54526 NewOpc = X86ISD::SUB;
54527 break;
54528 }
54529
54530 // We found an op we can narrow. Truncate its inputs.
54531 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
54532 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
54533
54534 // Use a X86 specific opcode to avoid DAG combine messing with it.
54535 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54536 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
54537
54538 // For AND, keep a CMP so that we can match the test pattern.
54539 if (NewOpc == X86ISD::AND)
54540 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54541 DAG.getConstant(0, dl, VT));
54542
54543 // Return the flags.
54544 return Op.getValue(1);
54545}
54546
54547static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
54548 TargetLowering::DAGCombinerInfo &DCI) {
54549 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
54550 "Expected X86ISD::ADD or X86ISD::SUB");
54551
54552 SDLoc DL(N);
54553 SDValue LHS = N->getOperand(0);
54554 SDValue RHS = N->getOperand(1);
54555 MVT VT = LHS.getSimpleValueType();
54556 bool IsSub = X86ISD::SUB == N->getOpcode();
54557 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
54558
54559 // If we don't use the flag result, simplify back to a generic ADD/SUB.
54560 if (!N->hasAnyUseOfValue(1)) {
54561 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
54562 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
54563 }
54564
54565 // Fold any similar generic ADD/SUB opcodes to reuse this node.
54566 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
54567 SDValue Ops[] = {N0, N1};
54568 SDVTList VTs = DAG.getVTList(N->getValueType(0));
54569 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
54570 SDValue Op(N, 0);
54571 if (Negate)
54572 Op = DAG.getNegative(Op, DL, VT);
54573 DCI.CombineTo(GenericAddSub, Op);
54574 }
54575 };
54576 MatchGeneric(LHS, RHS, false);
54577 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
54578
54579 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
54580 // EFLAGS result doesn't change.
54581 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
54582 /*ZeroSecondOpOnly*/ true);
54583}
54584
54585static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
54586 SDValue LHS = N->getOperand(0);
54587 SDValue RHS = N->getOperand(1);
54588 SDValue BorrowIn = N->getOperand(2);
54589
54590 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
54591 MVT VT = N->getSimpleValueType(0);
54592 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54593 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
54594 }
54595
54596 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
54597 // iff the flag result is dead.
54598 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
54599 !N->hasAnyUseOfValue(1))
54600 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
54601 LHS.getOperand(1), BorrowIn);
54602
54603 return SDValue();
54604}
54605
54606// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
54607static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
54608 TargetLowering::DAGCombinerInfo &DCI) {
54609 SDValue LHS = N->getOperand(0);
54610 SDValue RHS = N->getOperand(1);
54611 SDValue CarryIn = N->getOperand(2);
54612 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
54613 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
54614
54615 // Canonicalize constant to RHS.
54616 if (LHSC && !RHSC)
54617 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
54618 CarryIn);
54619
54620 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
54621 // the result is either zero or one (depending on the input carry bit).
54622 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
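// i.e. 0 + 0 + carry is just the carry bit, so the ADC result reduces to
// materializing CF as (and (setcc_carry COND_B, CarryIn), 1).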
54623 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
54624 // We don't have a good way to replace an EFLAGS use, so only do this when
54625 // dead right now.
54626 SDValue(N, 1).use_empty()) {
54627 SDLoc DL(N);
54628 EVT VT = N->getValueType(0);
54629 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
54630 SDValue Res1 = DAG.getNode(
54631 ISD::AND, DL, VT,
54632 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
54633 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
54634 DAG.getConstant(1, DL, VT));
54635 return DCI.CombineTo(N, Res1, CarryOut);
54636 }
54637
54638 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
54639 // iff the flag result is dead.
54640 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
54641 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
54642 SDLoc DL(N);
54643 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
54644 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
54645 DAG.getConstant(0, DL, LHS.getValueType()),
54646 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
54647 }
54648
54649 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
54650 MVT VT = N->getSimpleValueType(0);
54651 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54652 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
54653 }
54654
54655 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
54656 // iff the flag result is dead.
54657 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
54658 !N->hasAnyUseOfValue(1))
54659 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
54660 LHS.getOperand(1), CarryIn);
54661
54662 return SDValue();
54663}
54664
54665static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
54666 const SDLoc &DL, EVT VT,
54667 const X86Subtarget &Subtarget) {
54668 // Example of pattern we try to detect:
54669 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
54670 //(add (build_vector (extract_elt t, 0),
54671 // (extract_elt t, 2),
54672 // (extract_elt t, 4),
54673 // (extract_elt t, 6)),
54674 // (build_vector (extract_elt t, 1),
54675 // (extract_elt t, 3),
54676 // (extract_elt t, 5),
54677 // (extract_elt t, 7)))
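// i.e. each lane of the result is x0[2i]*x1[2i] + x0[2i+1]*x1[2i+1], which
// is exactly what a single PMADDWD of x0 and x1 computes.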
54678
54679 if (!Subtarget.hasSSE2())
54680 return SDValue();
54681
54682 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
54683 Op1.getOpcode() != ISD::BUILD_VECTOR)
54684 return SDValue();
54685
54686 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
54687 VT.getVectorNumElements() < 4 ||
54688 !isPowerOf2_32(VT.getVectorNumElements()))
54689 return SDValue();
54690
54691 // Check if one of Op0,Op1 is of the form:
54692 // (build_vector (extract_elt Mul, 0),
54693 // (extract_elt Mul, 2),
54694 // (extract_elt Mul, 4),
54695 // ...
54696 // the other is of the form:
54697 // (build_vector (extract_elt Mul, 1),
54698 // (extract_elt Mul, 3),
54699 // (extract_elt Mul, 5),
54700 // ...
54701 // and identify Mul.
54702 SDValue Mul;
54703 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
54704 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
54705 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
54706 // TODO: Be more tolerant to undefs.
54707 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54708 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54709 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54710 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54711 return SDValue();
54712 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
54713 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
54714 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
54715 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
54716 if (!Const0L || !Const1L || !Const0H || !Const1H)
54717 return SDValue();
54718 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
54719 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
54720 // Commutativity of mul allows factors of a product to reorder.
54721 if (Idx0L > Idx1L)
54722 std::swap(Idx0L, Idx1L);
54723 if (Idx0H > Idx1H)
54724 std::swap(Idx0H, Idx1H);
54725 // Commutativity of add allows pairs of factors to reorder.
54726 if (Idx0L > Idx0H) {
54727 std::swap(Idx0L, Idx0H);
54728 std::swap(Idx1L, Idx1H);
54729 }
54730 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
54731 Idx1H != 2 * i + 3)
54732 return SDValue();
54733 if (!Mul) {
54734 // First time an extract_elt's source vector is visited. Must be a MUL
54735 // with 2X the number of vector elements of the BUILD_VECTOR.
54736 // Both extracts must be from same MUL.
54737 Mul = Op0L->getOperand(0);
54738 if (Mul->getOpcode() != ISD::MUL ||
54739 Mul.getValueType().getVectorNumElements() != 2 * e)
54740 return SDValue();
54741 }
54742 // Check that the extract is from the same MUL previously seen.
54743 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
54744 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
54745 return SDValue();
54746 }
54747
54748 // Check if the Mul source can be safely shrunk.
54749 ShrinkMode Mode;
54750 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
54751 Mode == ShrinkMode::MULU16)
54752 return SDValue();
54753
54754 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54755 VT.getVectorNumElements() * 2);
54756 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
54757 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
54758
54759 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54760 ArrayRef<SDValue> Ops) {
54761 EVT InVT = Ops[0].getValueType();
54762 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54763 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
54764 InVT.getVectorNumElements() / 2);
54765 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
54766 };
54767 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
54768}
54769
54770// Attempt to turn this pattern into PMADDWD.
54771// (add (mul (sext (build_vector)), (sext (build_vector))),
54772// (mul (sext (build_vector)), (sext (build_vector)))
54773static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
54774 const SDLoc &DL, EVT VT,
54775 const X86Subtarget &Subtarget) {
54776 if (!Subtarget.hasSSE2())
54777 return SDValue();
54778
54779 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54780 return SDValue();
54781
54782 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
54783 VT.getVectorNumElements() < 4 ||
54784 !isPowerOf2_32(VT.getVectorNumElements()))
54785 return SDValue();
54786
54787 SDValue N00 = N0.getOperand(0);
54788 SDValue N01 = N0.getOperand(1);
54789 SDValue N10 = N1.getOperand(0);
54790 SDValue N11 = N1.getOperand(1);
54791
54792 // All inputs need to be sign extends.
54793 // TODO: Support ZERO_EXTEND from known positive?
54794 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
54795 N01.getOpcode() != ISD::SIGN_EXTEND ||
54796 N10.getOpcode() != ISD::SIGN_EXTEND ||
54797 N11.getOpcode() != ISD::SIGN_EXTEND)
54798 return SDValue();
54799
54800 // Peek through the extends.
54801 N00 = N00.getOperand(0);
54802 N01 = N01.getOperand(0);
54803 N10 = N10.getOperand(0);
54804 N11 = N11.getOperand(0);
54805
54806 // Must be extending from vXi16.
54807 EVT InVT = N00.getValueType();
54808 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
54809 N10.getValueType() != InVT || N11.getValueType() != InVT)
54810 return SDValue();
54811
54812 // All inputs should be build_vectors.
54813 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54814 N01.getOpcode() != ISD::BUILD_VECTOR ||
54815 N10.getOpcode() != ISD::BUILD_VECTOR ||
54816 N11.getOpcode() != ISD::BUILD_VECTOR)
54817 return SDValue();
54818
54819 // For each element, we need to ensure we have an odd element from one vector
54820 // multiplied by the odd element of another vector and the even element from
54821 // one of the same vectors being multiplied by the even element from the
54822 // other vector. So we need to make sure for each element i, this operator
54823 // is being performed:
54824 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54825 SDValue In0, In1;
54826 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
54827 SDValue N00Elt = N00.getOperand(i);
54828 SDValue N01Elt = N01.getOperand(i);
54829 SDValue N10Elt = N10.getOperand(i);
54830 SDValue N11Elt = N11.getOperand(i);
54831 // TODO: Be more tolerant to undefs.
54832 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54833 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54834 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54835 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54836 return SDValue();
54837 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54838 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54839 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54840 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54841 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54842 return SDValue();
54843 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54844 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54845 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54846 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54847 // Add is commutative so indices can be reordered.
54848 if (IdxN00 > IdxN10) {
54849 std::swap(IdxN00, IdxN10);
54850 std::swap(IdxN01, IdxN11);
54851 }
54852 // N0 indices must be the even element. N1 indices must be the next odd element.
54853 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54854 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54855 return SDValue();
54856 SDValue N00In = N00Elt.getOperand(0);
54857 SDValue N01In = N01Elt.getOperand(0);
54858 SDValue N10In = N10Elt.getOperand(0);
54859 SDValue N11In = N11Elt.getOperand(0);
54860
54861 // First time we find an input capture it.
54862 if (!In0) {
54863 In0 = N00In;
54864 In1 = N01In;
54865
54866 // The input vectors must be at least as wide as the output.
54867 // If they are larger than the output, we extract subvector below.
54868 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
54869 In1.getValueSizeInBits() < VT.getSizeInBits())
54870 return SDValue();
54871 }
54872 // Mul is commutative so the input vectors can be in any order.
54873 // Canonicalize to make the compares easier.
54874 if (In0 != N00In)
54875 std::swap(N00In, N01In);
54876 if (In0 != N10In)
54877 std::swap(N10In, N11In);
54878 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
54879 return SDValue();
54880 }
54881
54882 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54883 ArrayRef<SDValue> Ops) {
54884 EVT OpVT = Ops[0].getValueType();
54885 assert(OpVT.getScalarType() == MVT::i16 &&
54886 "Unexpected scalar element type");
54887 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
54888 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
54889 OpVT.getVectorNumElements() / 2);
54890 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
54891 };
54892
54893 // If the output is narrower than an input, extract the low part of the input
54894 // vector.
54895 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54896 VT.getVectorNumElements() * 2);
54897 if (OutVT16.bitsLT(In0.getValueType())) {
54898 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
54899 DAG.getIntPtrConstant(0, DL));
54900 }
54901 if (OutVT16.bitsLT(In1.getValueType())) {
54902 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
54903 DAG.getIntPtrConstant(0, DL));
54904 }
54905 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
54906 PMADDBuilder);
54907}
54908
54909// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
54910// If upper element in each pair of both VPMADDWD are zero then we can merge
54911// the operand elements and use the implicit add of VPMADDWD.
54912// TODO: Add support for VPMADDUBSW (which isn't commutable).
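// Rationale: with the odd i16 of every pair known zero, each VPMADDWD lane
// is a single product, so interleaving the operands of the two nodes lets
// one VPMADDWD compute both products and the outer add at once.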
54913static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
54914 const SDLoc &DL, EVT VT) {
54915 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
54916 return SDValue();
54917
54918 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
54919 if (VT.getSizeInBits() > 128)
54920 return SDValue();
54921
54922 unsigned NumElts = VT.getVectorNumElements();
54923 MVT OpVT = N0.getOperand(0).getSimpleValueType();
54924 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
54925 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
54926
54927 bool Op0HiZero =
54928 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
54929 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
54930 bool Op1HiZero =
54931 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
54932 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
54933
54934 // TODO: Check for zero lower elements once we have actual codegen that
54935 // creates them.
54936 if (!Op0HiZero || !Op1HiZero)
54937 return SDValue();
54938
54939 // Create a shuffle mask packing the lower elements from each VPMADDWD.
54940 SmallVector<int> Mask;
54941 for (int i = 0; i != (int)NumElts; ++i) {
54942 Mask.push_back(2 * i);
54943 Mask.push_back(2 * (i + NumElts));
54944 }
54945
54946 SDValue LHS =
54947 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
54948 SDValue RHS =
54949 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
54950 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
54951}
54952
54953/// CMOV of constants requires materializing constant operands in registers.
54954/// Try to fold those constants into an 'add' instruction to reduce instruction
54955 /// count. We do this with CMOV rather than the generic 'select' because there are
54956/// earlier folds that may be used to turn select-of-constants into logic hacks.
54957static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
54958 const X86Subtarget &Subtarget) {
54959 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
54960 // better because we eliminate 1-2 instructions. This transform is still
54961 // an improvement without zero operands because we trade 2 move constants and
54962 // 1 add for 2 adds (LEA) as long as the constants can be represented as
54963 // immediate asm operands (fit in 32-bits).
54964 auto isSuitableCmov = [](SDValue V) {
54965 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
54966 return false;
54967 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
54968 !isa<ConstantSDNode>(V.getOperand(1)))
54969 return false;
54970 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
54971 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
54972 V.getConstantOperandAPInt(1).isSignedIntN(32));
54973 };
54974
54975 // Match an appropriate CMOV as the first operand of the add.
54976 SDValue Cmov = N->getOperand(0);
54977 SDValue OtherOp = N->getOperand(1);
54978 if (!isSuitableCmov(Cmov))
54979 std::swap(Cmov, OtherOp);
54980 if (!isSuitableCmov(Cmov))
54981 return SDValue();
54982
54983 // Don't remove a load folding opportunity for the add. That would neutralize
54984 // any improvements from removing constant materializations.
54985 if (X86::mayFoldLoad(OtherOp, Subtarget))
54986 return SDValue();
54987
54988 EVT VT = N->getValueType(0);
54989 SDLoc DL(N);
54990 SDValue FalseOp = Cmov.getOperand(0);
54991 SDValue TrueOp = Cmov.getOperand(1);
54992
54993 // We will push the add through the select, but we can potentially do better
54994 // if we know there is another add in the sequence and this is pointer math.
54995 // In that case, we can absorb an add into the trailing memory op and avoid
54996 // a 3-operand LEA which is likely slower than a 2-operand LEA.
54997 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
54998 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
54999 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
55000 all_of(N->uses(), [&](SDNode *Use) {
55001 auto *MemNode = dyn_cast<MemSDNode>(Use);
55002 return MemNode && MemNode->getBasePtr().getNode() == N;
55003 })) {
55004 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
55005 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
55006 // it is possible that choosing op1 might be better.
55007 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
55008 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
55009 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
55010 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
55011 Cmov.getOperand(2), Cmov.getOperand(3));
55012 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
55013 }
55014
55015 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
55016 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
55017 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
55018 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
55019 Cmov.getOperand(3));
55020}
55021
55022static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
55023 TargetLowering::DAGCombinerInfo &DCI,
55024 const X86Subtarget &Subtarget) {
55025 EVT VT = N->getValueType(0);
55026 SDValue Op0 = N->getOperand(0);
55027 SDValue Op1 = N->getOperand(1);
55028 SDLoc DL(N);
55029
55030 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
55031 return Select;
55032
55033 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
55034 return MAdd;
55035 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
55036 return MAdd;
55037 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
55038 return MAdd;
55039
55040 // Try to synthesize horizontal adds from adds of shuffles.
55041 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55042 return V;
55043
55044 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
55045 // iff X and Y won't overflow.
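// Rationale: psadbw(V, 0) horizontally sums the unsigned bytes of V per
// 64-bit lane, so the two sums can be merged into one psadbw over X+Y
// provided no byte addition wraps.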
55046 if (Op0.getOpcode() == X86ISD::PSADBW && Op1.getOpcode() == X86ISD::PSADBW &&
55047 ISD::isBuildVectorAllZeros(Op0.getOperand(1).getNode()) &&
55048 ISD::isBuildVectorAllZeros(Op1.getOperand(1).getNode())) {
55049 if (DAG.willNotOverflowAdd(false, Op0.getOperand(0), Op1.getOperand(0))) {
55050 MVT OpVT = Op0.getOperand(1).getSimpleValueType();
55051 SDValue Sum =
55052 DAG.getNode(ISD::ADD, DL, OpVT, Op0.getOperand(0), Op1.getOperand(0));
55053 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
55054 getZeroVector(OpVT, Subtarget, DAG, DL));
55055 }
55056 }
55057
55058 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
55059 // (sub Y, (sext (vXi1 X))).
55060 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
55061 // generic DAG combine without a legal type check, but adding this there
55062 // caused regressions.
55063 if (VT.isVector()) {
55064 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55065 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
55066 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55067 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
55068 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
55069 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
55070 }
55071
55072 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
55073 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55074 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
55075 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
55076 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
55077 }
55078 }
55079
55080 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
55081 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
55082 X86::isZeroNode(Op0.getOperand(1))) {
55083 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
55084 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
55085 Op0.getOperand(0), Op0.getOperand(2));
55086 }
55087
55088 return combineAddOrSubToADCOrSBB(N, DAG);
55089}
55090
55091// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
55092// condition comes from the subtract node that produced -X. This matches the
55093// cmov expansion for absolute value. By swapping the operands we convert abs
55094// to nabs.
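// i.e. Y - |X| is rewritten as Y + (-|X|): swapping the CMOV arms makes it
// select the negated value ("nabs"), which turns the outer SUB into an ADD.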
55095static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
55096 SDValue N0 = N->getOperand(0);
55097 SDValue N1 = N->getOperand(1);
55098
55099 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
55100 return SDValue();
55101
55102 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
55103 if (CC != X86::COND_S && CC != X86::COND_NS)
55104 return SDValue();
55105
55106 // Condition should come from a negate operation.
55107 SDValue Cond = N1.getOperand(3);
55108 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
55109 return SDValue();
55110 assert(Cond.getResNo() == 1 && "Unexpected result number");
55111
55112 // Get the X and -X from the negate.
55113 SDValue NegX = Cond.getValue(0);
55114 SDValue X = Cond.getOperand(1);
55115
55116 SDValue FalseOp = N1.getOperand(0);
55117 SDValue TrueOp = N1.getOperand(1);
55118
55119 // Cmov operands should be X and NegX. Order doesn't matter.
55120 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
55121 return SDValue();
55122
55123 // Build a new CMOV with the operands swapped.
55124 SDLoc DL(N);
55125 MVT VT = N->getSimpleValueType(0);
55126 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
55127 N1.getOperand(2), Cond);
55128 // Convert sub to add.
55129 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
55130}
55131
55132static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
55133 SDValue Op0 = N->getOperand(0);
55134 SDValue Op1 = N->getOperand(1);
55135
55136 // (sub C (zero_extend (setcc)))
55137 // =>
55138 // (add (zero_extend (setcc inverted) C-1)) if C is a nonzero immediate
55139 // Don't disturb (sub 0 setcc), which is easily done with neg.
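// Rationale: C - (cond ? 1 : 0) == (!cond ? 1 : 0) + (C - 1), so inverting
// the setcc folds the subtract into an add of an immediate.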
55140 EVT VT = N->getValueType(0);
55141 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
55142 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
55143 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
55144 Op1.getOperand(0).hasOneUse()) {
55145 SDValue SetCC = Op1.getOperand(0);
55146 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
55147 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
55148 APInt NewImm = Op0C->getAPIntValue() - 1;
55149 SDLoc DL(Op1);
55150 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
55151 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
55152 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
55153 DAG.getConstant(NewImm, DL, VT));
55154 }
55155
55156 return SDValue();
55157}
55158
55159static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
55160 TargetLowering::DAGCombinerInfo &DCI,
55161 const X86Subtarget &Subtarget) {
55162 SDValue Op0 = N->getOperand(0);
55163 SDValue Op1 = N->getOperand(1);
55164
55165 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
55166 auto IsNonOpaqueConstant = [&](SDValue Op) {
55167 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
55168 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
55169 return !Cst->isOpaque();
55170 return true;
55171 }
55172 return false;
55173 };
55174
55175 // X86 can't encode an immediate LHS of a sub. See if we can push the
55176 // negation into a preceding instruction. If the RHS of the sub is a XOR with
55177 // one use and a constant, invert the immediate, saving one register.
55178 // However, ignore cases where C1 is 0, as those will become a NEG.
55179 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
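// Derivation (two's complement): C1 - (X ^ C2) = C1 + ~(X ^ C2) + 1
//                                              = (X ^ ~C2) + (C1 + 1).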
55180 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
55181 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
55182 Op1->hasOneUse()) {
55183 SDLoc DL(N);
55184 EVT VT = Op0.getValueType();
55185 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
55186 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
55187 SDValue NewAdd =
55188 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
55189 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
55190 }
55191
55192 if (SDValue V = combineSubABS(N, DAG))
55193 return V;
55194
55195 // Try to synthesize horizontal subs from subs of shuffles.
55196 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55197 return V;
55198
55199 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
55200 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
55201 X86::isZeroNode(Op1.getOperand(1))) {
55202 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55203 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
55204 Op1.getOperand(0), Op1.getOperand(2));
55205 }
55206
55207 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
55208 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
55209 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
55210 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
55211 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55212 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
55213 Op1.getOperand(1), Op1.getOperand(2));
55214 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
55215 Op1.getOperand(0));
55216 }
55217
55218 if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
55219 return V;
55220
55221 if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
55222 return V;
55223
55224 return combineSubSetcc(N, DAG);
55225}
55226
55227static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
55228 const X86Subtarget &Subtarget) {
55229 MVT VT = N->getSimpleValueType(0);
55230 SDLoc DL(N);
55231
55232 if (N->getOperand(0) == N->getOperand(1)) {
55233 if (N->getOpcode() == X86ISD::PCMPEQ)
55234 return DAG.getConstant(-1, DL, VT);
55235 if (N->getOpcode() == X86ISD::PCMPGT)
55236 return DAG.getConstant(0, DL, VT);
55237 }
55238
55239 return SDValue();
55240}
55241
55242/// Helper that combines an array of subvector ops as if they were the operands
55243/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
55244/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
55245static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
55246 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
55247 TargetLowering::DAGCombinerInfo &DCI,
55248 const X86Subtarget &Subtarget) {
55249 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
55250 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55251
55252 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
55253 return DAG.getUNDEF(VT);
55254
55255 if (llvm::all_of(Ops, [](SDValue Op) {
55256 return ISD::isBuildVectorAllZeros(Op.getNode());
55257 }))
55258 return getZeroVector(VT, Subtarget, DAG, DL);
55259
55260 SDValue Op0 = Ops[0];
55261 bool IsSplat = llvm::all_equal(Ops);
55262 unsigned NumOps = Ops.size();
55263 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55264 LLVMContext &Ctx = *DAG.getContext();
55265
55266 // Repeated subvectors.
55267 if (IsSplat &&
55268 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
55269 // If this broadcast is inserted into both halves, use a larger broadcast.
55270 if (Op0.getOpcode() == X86ISD::VBROADCAST)
55271 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
55272
55273 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
55274 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
55275 (Subtarget.hasAVX2() ||
55276 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
55277 VT.getScalarType(), Subtarget)))
55278 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
55279 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
55280 Op0.getOperand(0),
55281 DAG.getIntPtrConstant(0, DL)));
55282
55283 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
55284 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
55285 (Subtarget.hasAVX2() ||
55286 (EltSizeInBits >= 32 &&
55287 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
55288 Op0.getOperand(0).getValueType() == VT.getScalarType())
55289 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
55290
55291 // concat_vectors(extract_subvector(broadcast(x)),
55292 // extract_subvector(broadcast(x))) -> broadcast(x)
55293 // concat_vectors(extract_subvector(subv_broadcast(x)),
55294 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
55295 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55296 Op0.getOperand(0).getValueType() == VT) {
55297 SDValue SrcVec = Op0.getOperand(0);
55298 if (SrcVec.getOpcode() == X86ISD::VBROADCAST ||
55299 SrcVec.getOpcode() == X86ISD::VBROADCAST_LOAD)
55300 return Op0.getOperand(0);
55301 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
55302 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
55303 return Op0.getOperand(0);
55304 }
55305
55306 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
55307 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
55308 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
55309 return DAG.getNode(Op0.getOpcode(), DL, VT,
55310 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
55311 Op0.getOperand(0), Op0.getOperand(0)),
55312 Op0.getOperand(1));
55313 }
55314
55315 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
55316 // Only concat of subvector high halves which vperm2x128 is best at.
55317 // TODO: This should go in combineX86ShufflesRecursively eventually.
55318 if (VT.is256BitVector() && NumOps == 2) {
55319 SDValue Src0 = peekThroughBitcasts(Ops[0]);
55320 SDValue Src1 = peekThroughBitcasts(Ops[1]);
55321 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55322 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
55323 EVT SrcVT0 = Src0.getOperand(0).getValueType();
55324 EVT SrcVT1 = Src1.getOperand(0).getValueType();
55325 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
55326 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
55327 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
55328 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
55329 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
55330 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
55331 DAG.getBitcast(VT, Src0.getOperand(0)),
55332 DAG.getBitcast(VT, Src1.getOperand(0)),
55333 DAG.getTargetConstant(0x31, DL, MVT::i8));
55334 }
55335 }
55336 }
55337
55338 // Repeated opcode.
55339 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
55340 // but it currently struggles with different vector widths.
55341 if (llvm::all_of(Ops, [Op0](SDValue Op) {
55342 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
55343 })) {
55344 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
55345 SmallVector<SDValue> Subs;
55346 for (SDValue SubOp : SubOps)
55347 Subs.push_back(SubOp.getOperand(I));
55348 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
55349 };
55350 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
55351 bool AllConstants = true;
55352 bool AllSubVectors = true;
55353 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
55354 SDValue Sub = SubOps[I].getOperand(Op);
55355 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
55356 SDValue BC = peekThroughBitcasts(Sub);
55357 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
55358 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
55359 AllSubVectors &= Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55360 Sub.getOperand(0).getValueType() == VT &&
55361 Sub.getConstantOperandAPInt(1) == (I * NumSubElts);
55362 }
55363 return AllConstants || AllSubVectors;
55364 };
55365
55366 switch (Op0.getOpcode()) {
55367 case X86ISD::VBROADCAST: {
55368 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
55369 return Op.getOperand(0).getValueType().is128BitVector();
55370 })) {
55371 if (VT == MVT::v4f64 || VT == MVT::v4i64)
55372 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
55373 ConcatSubOperand(VT, Ops, 0),
55374 ConcatSubOperand(VT, Ops, 0));
55375 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
55376 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
55377 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
55378 : X86ISD::PSHUFD,
55379 DL, VT, ConcatSubOperand(VT, Ops, 0),
55380 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
55381 }
55382 break;
55383 }
55384 case X86ISD::MOVDDUP:
55385 case X86ISD::MOVSHDUP:
55386 case X86ISD::MOVSLDUP: {
55387 if (!IsSplat)
55388 return DAG.getNode(Op0.getOpcode(), DL, VT,
55389 ConcatSubOperand(VT, Ops, 0));
55390 break;
55391 }
55392 case X86ISD::SHUFP: {
55393 // Add SHUFPD support if/when necessary.
55394 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
55395 llvm::all_of(Ops, [Op0](SDValue Op) {
55396 return Op.getOperand(2) == Op0.getOperand(2);
55397 })) {
55398 return DAG.getNode(Op0.getOpcode(), DL, VT,
55399 ConcatSubOperand(VT, Ops, 0),
55400 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55401 }
55402 break;
55403 }
55404 case X86ISD::UNPCKH:
55405 case X86ISD::UNPCKL: {
55406 // Don't concatenate build_vector patterns.
55407 if (!IsSplat && EltSizeInBits >= 32 &&
55408 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55409 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55410 none_of(Ops, [](SDValue Op) {
55411 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
55412 ISD::BUILD_VECTOR ||
55413 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
55414 ISD::BUILD_VECTOR;
55415 })) {
55416 return DAG.getNode(Op0.getOpcode(), DL, VT,
55417 ConcatSubOperand(VT, Ops, 0),
55418 ConcatSubOperand(VT, Ops, 1));
55419 }
55420 break;
55421 }
55422 case X86ISD::PSHUFHW:
55423 case X86ISD::PSHUFLW:
55424 case X86ISD::PSHUFD:
55425 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
55426 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
55427 return DAG.getNode(Op0.getOpcode(), DL, VT,
55428 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55429 }
55430 [[fallthrough]];
55431 case X86ISD::VPERMILPI:
55432 if (!IsSplat && EltSizeInBits == 32 &&
55433 (VT.is256BitVector() ||
55434 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55435 all_of(Ops, [&Op0](SDValue Op) {
55436 return Op0.getOperand(1) == Op.getOperand(1);
55437 })) {
55438 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
55439 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
55440 Res =
55441 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
55442 return DAG.getBitcast(VT, Res);
55443 }
55444 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
55445 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
55446 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
55447 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
55448 return DAG.getNode(Op0.getOpcode(), DL, VT,
55449 ConcatSubOperand(VT, Ops, 0),
55450 DAG.getTargetConstant(Idx, DL, MVT::i8));
55451 }
55452 break;
55453 case X86ISD::PSHUFB:
55454 case X86ISD::PSADBW:
55455 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55456 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
55457 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
55458 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
55459 NumOps * SrcVT.getVectorNumElements());
55460 return DAG.getNode(Op0.getOpcode(), DL, VT,
55461 ConcatSubOperand(SrcVT, Ops, 0),
55462 ConcatSubOperand(SrcVT, Ops, 1));
55463 }
55464 break;
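// Concatenate VPERMV nodes (512-bit AVX512 only) by concatenating the source
// vectors and building a single index mask that offsets the second half's
// indices past the first source.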
55465 case X86ISD::VPERMV:
55466 if (!IsSplat && NumOps == 2 &&
55467 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
55468 MVT OpVT = Op0.getSimpleValueType();
55469 int NumSrcElts = OpVT.getVectorNumElements();
55470 SmallVector<int, 64> ConcatMask;
55471 for (unsigned i = 0; i != NumOps; ++i) {
55472 SmallVector<int, 64> SubMask;
55473 SmallVector<SDValue, 2> SubOps;
55474 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
55475 break;
55476 for (int M : SubMask) {
55477 if (0 <= M)
55478 M += i * NumSrcElts;
55479 ConcatMask.push_back(M);
55480 }
55481 }
55482 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
55483 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
55484 Ops[1].getOperand(1), DAG, DL);
55485 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
55486 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
55487 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
55488 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
55489 }
55490 }
55491 break;
55492 case X86ISD::VPERMV3:
55493 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
55494 MVT OpVT = Op0.getSimpleValueType();
55495 int NumSrcElts = OpVT.getVectorNumElements();
55496 SmallVector<int, 64> ConcatMask;
55497 for (unsigned i = 0; i != NumOps; ++i) {
55498 SmallVector<int, 64> SubMask;
55499 SmallVector<SDValue, 2> SubOps;
55500 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
55501 break;
55502 for (int M : SubMask) {
55503 if (0 <= M) {
55504 M += M < NumSrcElts ? 0 : NumSrcElts;
55505 M += i * NumSrcElts;
55506 }
55507 ConcatMask.push_back(M);
55508 }
55509 }
55510 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
55511 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
55512 Ops[1].getOperand(0), DAG, DL);
55513 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
55514 Ops[1].getOperand(2), DAG, DL);
55515 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
55516 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
55517 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
55518 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
55519 }
55520 }
55521 break;
55522 case X86ISD::VPERM2X128: {
55523 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
55524 assert(NumOps == 2 && "Bad concat_vectors operands");
55525 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
55526 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
55527 // TODO: Handle zero'd subvectors.
55528 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
55529 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
55530 (int)((Imm1 >> 4) & 0x3)};
55531 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
55532 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
55533 Ops[0].getOperand(1), DAG, DL);
55534 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
55535 Ops[1].getOperand(1), DAG, DL);
55536 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
55537 DAG.getBitcast(ShuffleVT, LHS),
55538 DAG.getBitcast(ShuffleVT, RHS),
55539 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
55540 return DAG.getBitcast(VT, Res);
55541 }
55542 }
55543 break;
55544 }
55545 case X86ISD::SHUF128: {
55546 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
55547 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
55548 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
55549 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
55550 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
55551 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
55552 Ops[0].getOperand(1), DAG, DL);
55553 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
55554 Ops[1].getOperand(1), DAG, DL);
55555 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
55556 DAG.getTargetConstant(Imm, DL, MVT::i8));
55557 }
55558 break;
55559 }
55560 case ISD::TRUNCATE:
55561 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
55562 EVT SrcVT = Ops[0].getOperand(0).getValueType();
55563 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
55564 SrcVT == Ops[1].getOperand(0).getValueType() &&
55565 Subtarget.useAVX512Regs() &&
55566 Subtarget.getPreferVectorWidth() >= 512 &&
55567 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
55568 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
55569 return DAG.getNode(ISD::TRUNCATE, DL, VT,
55570 ConcatSubOperand(NewSrcVT, Ops, 0));
55571 }
55572 }
55573 break;
55574 case X86ISD::VSHLI:
55575 case X86ISD::VSRLI:
55576 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
55577 // TODO: Move this to LowerShiftByScalarImmediate?
55578 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
55579 llvm::all_of(Ops, [](SDValue Op) {
55580 return Op.getConstantOperandAPInt(1) == 32;
55581 })) {
55582 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
55583 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
55584 if (Op0.getOpcode() == X86ISD::VSHLI) {
55585 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
55586 {8, 0, 8, 2, 8, 4, 8, 6});
55587 } else {
55588 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
55589 {1, 8, 3, 8, 5, 8, 7, 8});
55590 }
55591 return DAG.getBitcast(VT, Res);
55592 }
55593 [[fallthrough]];
55594 case X86ISD::VSRAI:
55595 case X86ISD::VSHL:
55596 case X86ISD::VSRL:
55597 case X86ISD::VSRA:
55598 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
55599 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55600 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
55601 llvm::all_of(Ops, [Op0](SDValue Op) {
55602 return Op0.getOperand(1) == Op.getOperand(1);
55603 })) {
55604 return DAG.getNode(Op0.getOpcode(), DL, VT,
55605 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55606 }
55607 break;
55608 case X86ISD::VPERMI:
55609 case X86ISD::VROTLI:
55610 case X86ISD::VROTRI:
55611 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55612 llvm::all_of(Ops, [Op0](SDValue Op) {
55613 return Op0.getOperand(1) == Op.getOperand(1);
55614 })) {
55615 return DAG.getNode(Op0.getOpcode(), DL, VT,
55616 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55617 }
55618 break;
55619 case ISD::AND:
55620 case ISD::OR:
55621 case ISD::XOR:
55622 case X86ISD::ANDNP:
55623 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55624 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
55625 return DAG.getNode(Op0.getOpcode(), DL, VT,
55626 ConcatSubOperand(VT, Ops, 0),
55627 ConcatSubOperand(VT, Ops, 1));
55628 }
55629 break;
55630 case X86ISD::PCMPEQ:
55631 case X86ISD::PCMPGT:
55632 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256() &&
55633 (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
55634 return DAG.getNode(Op0.getOpcode(), DL, VT,
55635 ConcatSubOperand(VT, Ops, 0),
55636 ConcatSubOperand(VT, Ops, 1));
55637 }
55638 break;
55639 case ISD::CTPOP:
55640 case ISD::CTTZ:
55641 case ISD::CTLZ:
55642 case ISD::CTTZ_ZERO_UNDEF:
55643 case ISD::CTLZ_ZERO_UNDEF:
55644 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55645 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
55646 return DAG.getNode(Op0.getOpcode(), DL, VT,
55647 ConcatSubOperand(VT, Ops, 0));
55648 }
55649 break;
55650 case X86ISD::GF2P8AFFINEQB:
55651 if (!IsSplat &&
55652 (VT.is256BitVector() ||
55653 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55654 llvm::all_of(Ops, [Op0](SDValue Op) {
55655 return Op0.getOperand(2) == Op.getOperand(2);
55656 })) {
55657 return DAG.getNode(Op0.getOpcode(), DL, VT,
55658 ConcatSubOperand(VT, Ops, 0),
55659 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55660 }
55661 break;
55662 case ISD::ADD:
55663 case ISD::SUB:
55664 case ISD::MUL:
55665 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55666 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55667 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
55668 return DAG.getNode(Op0.getOpcode(), DL, VT,
55669 ConcatSubOperand(VT, Ops, 0),
55670 ConcatSubOperand(VT, Ops, 1));
55671 }
55672 break;
55673 // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
55674 // have short latency, only replace them here when doing so does not
55675 // introduce extra VINSERT instructions.
55676 case ISD::FADD:
55677 case ISD::FSUB:
55678 case ISD::FMUL:
55679 if (!IsSplat && (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1)) &&
55680 (VT.is256BitVector() ||
55681 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
55682 return DAG.getNode(Op0.getOpcode(), DL, VT,
55683 ConcatSubOperand(VT, Ops, 0),
55684 ConcatSubOperand(VT, Ops, 1));
55685 }
55686 break;
55687 case ISD::FDIV:
55688 if (!IsSplat && (VT.is256BitVector() ||
55689 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
55690 return DAG.getNode(Op0.getOpcode(), DL, VT,
55691 ConcatSubOperand(VT, Ops, 0),
55692 ConcatSubOperand(VT, Ops, 1));
55693 }
55694 break;
55695 case X86ISD::HADD:
55696 case X86ISD::HSUB:
55697 case X86ISD::FHADD:
55698 case X86ISD::FHSUB:
55699 if (!IsSplat && VT.is256BitVector() &&
55700 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
55701 return DAG.getNode(Op0.getOpcode(), DL, VT,
55702 ConcatSubOperand(VT, Ops, 0),
55703 ConcatSubOperand(VT, Ops, 1));
55704 }
55705 break;
55706 case X86ISD::PACKSS:
55707 case X86ISD::PACKUS:
55708 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55709 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
55710 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
55711 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
55712 NumOps * SrcVT.getVectorNumElements());
55713 return DAG.getNode(Op0.getOpcode(), DL, VT,
55714 ConcatSubOperand(SrcVT, Ops, 0),
55715 ConcatSubOperand(SrcVT, Ops, 1));
55716 }
55717 break;
55718 case X86ISD::PALIGNR:
55719 if (!IsSplat &&
55720 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55721 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
55722 llvm::all_of(Ops, [Op0](SDValue Op) {
55723 return Op0.getOperand(2) == Op.getOperand(2);
55724 })) {
55725 return DAG.getNode(Op0.getOpcode(), DL, VT,
55726 ConcatSubOperand(VT, Ops, 0),
55727 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55728 }
55729 break;
55730 case X86ISD::BLENDI:
55731 if (NumOps == 2 && VT.is512BitVector() && Subtarget.useBWIRegs()) {
55732 uint64_t Mask0 = Ops[0].getConstantOperandVal(2);
55733 uint64_t Mask1 = Ops[1].getConstantOperandVal(2);
55734 // MVT::v16i16 has repeated blend mask.
55735 if (Op0.getSimpleValueType() == MVT::v16i16) {
55736 Mask0 = (Mask0 << 8) | Mask0;
55737 Mask1 = (Mask1 << 8) | Mask1;
55738 }
55739 uint64_t Mask = (Mask1 << (VT.getVectorNumElements() / 2)) | Mask0;
55740 MVT MaskSVT = MVT::getIntegerVT(VT.getVectorNumElements());
55741 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
55742 SDValue Sel =
55743 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
55744 return DAG.getSelect(DL, VT, Sel, ConcatSubOperand(VT, Ops, 1),
55745 ConcatSubOperand(VT, Ops, 0));
55746 }
55747 break;
55748 case ISD::VSELECT:
55749 if (!IsSplat && Subtarget.hasAVX512() &&
55750 (VT.is256BitVector() ||
55751 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55752 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
55753 EVT SelVT = Ops[0].getOperand(0).getValueType();
55754 if (SelVT.getVectorElementType() == MVT::i1) {
55755 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
55756 NumOps * SelVT.getVectorNumElements());
55757 if (TLI.isTypeLegal(SelVT))
55758 return DAG.getNode(Op0.getOpcode(), DL, VT,
55759 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
55760 ConcatSubOperand(VT, Ops, 1),
55761 ConcatSubOperand(VT, Ops, 2));
55762 }
55763 }
55764 [[fallthrough]];
55765 case X86ISD::BLENDV:
55766 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
55767 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
55768 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
55769 EVT SelVT = Ops[0].getOperand(0).getValueType();
55770 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
55771 if (TLI.isTypeLegal(SelVT))
55772 return DAG.getNode(Op0.getOpcode(), DL, VT,
55773 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
55774 ConcatSubOperand(VT, Ops, 1),
55775 ConcatSubOperand(VT, Ops, 2));
55776 }
55777 break;
55778 }
55779 }
55780
55781 // Fold subvector loads into one.
55782 // If needed, look through bitcasts to get to the load.
55783 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
55784 unsigned Fast;
55785 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
55786 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
55787 *FirstLd->getMemOperand(), &Fast) &&
55788 Fast) {
55789 if (SDValue Ld =
55790 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
55791 return Ld;
55792 }
55793 }
55794
55795 // Attempt to fold target constant loads.
55796 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
55797 SmallVector<APInt> EltBits;
55798 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
55799 for (unsigned I = 0; I != NumOps; ++I) {
55800 APInt OpUndefElts;
55801 SmallVector<APInt> OpEltBits;
55802 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
55803 OpEltBits, /*AllowWholeUndefs*/ true,
55804 /*AllowPartialUndefs*/ false))
55805 break;
55806 EltBits.append(OpEltBits);
55807 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
55808 }
55809 if (EltBits.size() == VT.getVectorNumElements()) {
55810 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
55811 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
55812 SDValue CV = DAG.getConstantPool(C, PVT);
55813 MachineFunction &MF = DAG.getMachineFunction();
55814 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
55815 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
55816 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
55817 DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
55818 return Ld;
55819 }
55820 }
55821
55822 // If this simple subvector or scalar/subvector broadcast_load is inserted
55823 // into both halves, use a larger broadcast_load. Update other uses to use
55824 // an extracted subvector.
55825 if (IsSplat &&
55826 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
55827 if (ISD::isNormalLoad(Op0.getNode()) ||
55828 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
55829 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
55830 auto *Mem = cast<MemSDNode>(Op0);
55831 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
55832 ? X86ISD::VBROADCAST_LOAD
55833 : X86ISD::SUBV_BROADCAST_LOAD;
55834 if (SDValue BcastLd =
55835 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
55836 SDValue BcastSrc =
55837 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
55838 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
55839 return BcastLd;
55840 }
55841 }
55842 }
55843
55844 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
55845 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
55846 Subtarget.useAVX512Regs()) {
55847 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
55848 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
55849 Res = DAG.getBitcast(ShuffleVT, Res);
55850 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
55851 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
55852 return DAG.getBitcast(VT, Res);
55853 }
55854
55855 return SDValue();
55856}
55857
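// Combine CONCAT_VECTORS: constant fold concatenations of vXi1 vectors into a
// single integer constant, otherwise defer to combineConcatVectorOps once both
// types are legal on AVX targets.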
55858 static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
55859 TargetLowering::DAGCombinerInfo &DCI,
55860 const X86Subtarget &Subtarget) {
55861 EVT VT = N->getValueType(0);
55862 EVT SrcVT = N->getOperand(0).getValueType();
55863 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55864 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
55865
55866 if (VT.getVectorElementType() == MVT::i1) {
55867 // Attempt to constant fold.
55868 unsigned SubSizeInBits = SrcVT.getSizeInBits();
55869 APInt Constant = APInt::getZero(VT.getSizeInBits());
55870 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
55871 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
55872 if (!C) break;
55873 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
55874 if (I == (E - 1)) {
55875 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
55876 if (TLI.isTypeLegal(IntVT))
55877 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
55878 }
55879 }
55880
55881 // Don't do anything else for i1 vectors.
55882 return SDValue();
55883 }
55884
55885 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
55886 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
55887 DCI, Subtarget))
55888 return R;
55889 }
55890
55891 return SDValue();
55892}
55893
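// Combine INSERT_SUBVECTOR: fold inserts of undef/zero subvectors, turn an
// insert of an extract into a shuffle, handle concat_vectors style patterns
// and widen broadcasts inserted into an undef upper half.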
55894 static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
55895 TargetLowering::DAGCombinerInfo &DCI,
55896 const X86Subtarget &Subtarget) {
55897 if (DCI.isBeforeLegalizeOps())
55898 return SDValue();
55899
55900 MVT OpVT = N->getSimpleValueType(0);
55901
55902 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
55903
55904 SDLoc dl(N);
55905 SDValue Vec = N->getOperand(0);
55906 SDValue SubVec = N->getOperand(1);
55907
55908 uint64_t IdxVal = N->getConstantOperandVal(2);
55909 MVT SubVecVT = SubVec.getSimpleValueType();
55910
55911 if (Vec.isUndef() && SubVec.isUndef())
55912 return DAG.getUNDEF(OpVT);
55913
55914 // Inserting undefs/zeros into zeros/undefs is a zero vector.
55915 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
55916 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
55917 return getZeroVector(OpVT, Subtarget, DAG, dl);
55918
55919 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
55920 // If we're inserting into a zero vector and then into a larger zero vector,
55921 // just insert into the larger zero vector directly.
55922 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
55923 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
55924 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
55925 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55926 getZeroVector(OpVT, Subtarget, DAG, dl),
55927 SubVec.getOperand(1),
55928 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
55929 }
55930
55931 // If we're inserting into a zero vector and our input was extracted from an
55932 // insert into a zero vector of the same type, and the extraction was at
55933 // least as large as the original insertion, just insert the original
55934 // subvector into a zero vector.
55935 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
55936 isNullConstant(SubVec.getOperand(1)) &&
55937 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
55938 SDValue Ins = SubVec.getOperand(0);
55939 if (isNullConstant(Ins.getOperand(2)) &&
55940 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
55941 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
55942 SubVecVT.getFixedSizeInBits())
55943 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55944 getZeroVector(OpVT, Subtarget, DAG, dl),
55945 Ins.getOperand(1), N->getOperand(2));
55946 }
55947 }
55948
55949 // Stop here if this is an i1 vector.
55950 if (IsI1Vector)
55951 return SDValue();
55952
55953 // Eliminate an intermediate vector widening:
55954 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
55955 // insert_subvector X, Y, Idx
55956 // TODO: This is a more general version of a DAGCombiner fold, can we move it
55957 // there?
55958 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
55959 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
55960 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
55961 SubVec.getOperand(1), N->getOperand(2));
55962
55963 // If this is an insert of an extract, combine to a shuffle. Don't do this
55964 // if the insert or extract can be represented with a subregister operation.
55965 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55966 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
55967 (IdxVal != 0 ||
55968 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
55969 int ExtIdxVal = SubVec.getConstantOperandVal(1);
55970 if (ExtIdxVal != 0) {
55971 int VecNumElts = OpVT.getVectorNumElements();
55972 int SubVecNumElts = SubVecVT.getVectorNumElements();
55973 SmallVector<int, 64> Mask(VecNumElts);
55974 // First create an identity shuffle mask.
55975 for (int i = 0; i != VecNumElts; ++i)
55976 Mask[i] = i;
55977 // Now insert the extracted portion.
55978 for (int i = 0; i != SubVecNumElts; ++i)
55979 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
55980
55981 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
55982 }
55983 }
55984
55985 // Match concat_vector style patterns.
55986 SmallVector<SDValue, 2> SubVectorOps;
55987 if (collectConcatOps(N, SubVectorOps, DAG)) {
55988 if (SDValue Fold =
55989 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
55990 return Fold;
55991
55992 // If we're inserting all zeros into the upper half, change this to
55993 // a concat with zero. We will match this to a move
55994 // with implicit upper bit zeroing during isel.
55995 // We do this here because we don't want combineConcatVectorOps to
55996 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
55997 if (SubVectorOps.size() == 2 &&
55998 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
55999 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56000 getZeroVector(OpVT, Subtarget, DAG, dl),
56001 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
56002
56003 // Attempt to recursively combine to a shuffle.
56004 if (all_of(SubVectorOps, [](SDValue SubOp) {
56005 return isTargetShuffle(SubOp.getOpcode());
56006 })) {
56007 SDValue Op(N, 0);
56008 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
56009 return Res;
56010 }
56011 }
56012
56013 // If this is a broadcast insert into an upper undef, use a larger broadcast.
56014 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
56015 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
56016
56017 // If this is a broadcast load inserted into an upper undef, use a larger
56018 // broadcast load.
56019 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
56020 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
56021 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
56022 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
56023 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
56024 SDValue BcastLd =
56025 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
56026 MemIntr->getMemoryVT(),
56027 MemIntr->getMemOperand());
56028 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
56029 return BcastLd;
56030 }
56031
56032 // If we're splatting the lower half subvector of a full vector load into the
56033 // upper half, attempt to create a subvector broadcast.
56034 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
56035 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
56036 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
56037 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
56038 if (VecLd && SubLd &&
56039 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
56040 SubVec.getValueSizeInBits() / 8, 0))
56041 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
56042 SubLd, 0, DAG);
56043 }
56044
56045 return SDValue();
56046}
56047
56048/// If we are extracting a subvector of a vector select and the select condition
56049/// is composed of concatenated vectors, try to narrow the select width. This
56050/// is a common pattern for AVX1 integer code because 256-bit selects may be
56051/// legal, but there is almost no integer math/logic available for 256-bit.
56052/// This function should only be called with legal types (otherwise, the calls
56053/// to get simple value types will assert).
56054 static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
56055 SelectionDAG &DAG) {
56056 SDValue Sel = Ext->getOperand(0);
56057 if (Sel.getOpcode() != ISD::VSELECT ||
56058 !isFreeToSplitVector(Sel.getOperand(0).getNode(), DAG))
56059 return SDValue();
56060
56061 // Note: We assume simple value types because this should only be called with
56062 // legal operations/types.
56063 // TODO: This can be extended to handle extraction to 256-bits.
56064 MVT VT = Ext->getSimpleValueType(0);
56065 if (!VT.is128BitVector())
56066 return SDValue();
56067
56068 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
56069 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
56070 return SDValue();
56071
56072 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
56073 MVT SelVT = Sel.getSimpleValueType();
56074 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
56075 "Unexpected vector type with legal operations");
56076
56077 unsigned SelElts = SelVT.getVectorNumElements();
56078 unsigned CastedElts = WideVT.getVectorNumElements();
56079 unsigned ExtIdx = Ext->getConstantOperandVal(1);
56080 if (SelElts % CastedElts == 0) {
56081 // The select has the same or more (narrower) elements than the extract
56082 // operand. The extraction index gets scaled by that factor.
56083 ExtIdx *= (SelElts / CastedElts);
56084 } else if (CastedElts % SelElts == 0) {
56085 // The select has less (wider) elements than the extract operand. Make sure
56086 // that the extraction index can be divided evenly.
56087 unsigned IndexDivisor = CastedElts / SelElts;
56088 if (ExtIdx % IndexDivisor != 0)
56089 return SDValue();
56090 ExtIdx /= IndexDivisor;
56091 } else {
56092 llvm_unreachable("Element count of simple vector types are not divisible?");
56093 }
56094
56095 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
56096 unsigned NarrowElts = SelElts / NarrowingFactor;
56097 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
56098 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
56099 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
56100 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
56101 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
56102 return DAG.getBitcast(VT, NarrowSel);
56103}
56104
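// Combine EXTRACT_SUBVECTOR: narrow selects and AVX1 and+not patterns,
// extract through broadcasts, shuffles and inserts, and shrink wide ops when
// only the lower subvector is used.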
56105 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56106 TargetLowering::DAGCombinerInfo &DCI,
56107 const X86Subtarget &Subtarget) {
56108 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
56109 // eventually get combined/lowered into ANDNP) with a concatenated operand,
56110 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
56111 // We let generic combining take over from there to simplify the
56112 // insert/extract and 'not'.
56113 // This pattern emerges during AVX1 legalization. We handle it before lowering
56114 // to avoid complications like splitting constant vector loads.
56115
56116 // Capture the original wide type in the likely case that we need to bitcast
56117 // back to this type.
56118 if (!N->getValueType(0).isSimple())
56119 return SDValue();
56120
56121 MVT VT = N->getSimpleValueType(0);
56122 SDValue InVec = N->getOperand(0);
56123 unsigned IdxVal = N->getConstantOperandVal(1);
56124 SDValue InVecBC = peekThroughBitcasts(InVec);
56125 EVT InVecVT = InVec.getValueType();
56126 unsigned SizeInBits = VT.getSizeInBits();
56127 unsigned InSizeInBits = InVecVT.getSizeInBits();
56128 unsigned NumSubElts = VT.getVectorNumElements();
56129 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56130 SDLoc DL(N);
56131
56132 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
56133 TLI.isTypeLegal(InVecVT) &&
56134 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
56135 auto isConcatenatedNot = [](SDValue V) {
56136 V = peekThroughBitcasts(V);
56137 if (!isBitwiseNot(V))
56138 return false;
56139 SDValue NotOp = V->getOperand(0);
56140 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
56141 };
56142 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
56143 isConcatenatedNot(InVecBC.getOperand(1))) {
56144 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
56145 SDValue Concat = splitVectorIntBinary(InVecBC, DAG, SDLoc(InVecBC));
56146 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
56147 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
56148 }
56149 }
56150
56151 if (DCI.isBeforeLegalizeOps())
56152 return SDValue();
56153
56154 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
56155 return V;
56156
56157 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
56158 return getZeroVector(VT, Subtarget, DAG, DL);
56159
56160 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
56161 if (VT.getScalarType() == MVT::i1)
56162 return DAG.getConstant(1, DL, VT);
56163 return getOnesVector(VT, DAG, DL);
56164 }
56165
56166 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
56167 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
56168
56169 // If we are extracting from an insert into a larger vector, replace with a
56170 // smaller insert if we don't access less than the original subvector. Don't
56171 // do this for i1 vectors.
56172 // TODO: Relax the matching indices requirement?
56173 if (VT.getVectorElementType() != MVT::i1 &&
56174 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
56175 IdxVal == InVec.getConstantOperandVal(2) &&
56176 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
56177 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
56178 InVec.getOperand(0), N->getOperand(1));
56179 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
56180 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
56181 InVec.getOperand(1),
56182 DAG.getVectorIdxConstant(NewIdxVal, DL));
56183 }
56184
56185 // If we're extracting an upper subvector from a broadcast, just extract the
56186 // lowest subvector instead, which should allow SimplifyDemandedVectorElts
56187 // to perform more simplifications.
56188 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
56189 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56190 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
56191 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
56192
56193 // If we're extracting a broadcasted subvector, just use the lowest subvector.
56194 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
56195 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
56196 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
56197
56198 // Attempt to extract from the source of a shuffle vector.
56199 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
56200 SmallVector<int, 32> ShuffleMask;
56201 SmallVector<int, 32> ScaledMask;
56202 SmallVector<SDValue, 2> ShuffleInputs;
56203 unsigned NumSubVecs = InSizeInBits / SizeInBits;
56204 // Decode the shuffle mask and scale it so it's shuffling subvectors.
56205 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
56206 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
56207 unsigned SubVecIdx = IdxVal / NumSubElts;
56208 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
56209 return DAG.getUNDEF(VT);
56210 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
56211 return getZeroVector(VT, Subtarget, DAG, DL);
56212 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
56213 if (Src.getValueSizeInBits() == InSizeInBits) {
56214 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
56215 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
56216 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
56217 DL, SizeInBits);
56218 }
56219 }
56220 }
56221
56222 auto IsExtractFree = [](SDValue V) {
56223 V = peekThroughBitcasts(V);
56224 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
56225 return true;
56226 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
56227 return true;
56228 return V.isUndef();
56229 };
56230
56231 // If we're extracting the lowest subvector and we're the only user,
56232 // we may be able to perform this with a smaller vector width.
56233 unsigned InOpcode = InVec.getOpcode();
56234 if (InVec.hasOneUse()) {
56235 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
56236 // v2f64 CVTDQ2PD(v4i32).
56237 if (InOpcode == ISD::SINT_TO_FP &&
56238 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56239 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
56240 }
56241 // v2f64 CVTUDQ2PD(v4i32).
56242 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
56243 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56244 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
56245 }
56246 // v2f64 CVTPS2PD(v4f32).
56247 if (InOpcode == ISD::FP_EXTEND &&
56248 InVec.getOperand(0).getValueType() == MVT::v4f32) {
56249 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
56250 }
56251 }
56252 // v4i32 CVTPS2DQ(v4f32).
56253 if (InOpcode == ISD::FP_TO_SINT && VT == MVT::v4i32) {
56254 SDValue Src = InVec.getOperand(0);
56255 if (Src.getValueType().getScalarType() == MVT::f32)
56256 return DAG.getNode(InOpcode, DL, VT,
56257 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
56258 }
56259 if (IdxVal == 0 &&
56260 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
56261 (SizeInBits == 128 || SizeInBits == 256) &&
56262 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
56263 SDValue Ext = InVec.getOperand(0);
56264 if (Ext.getValueSizeInBits() > SizeInBits)
56265 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
56266 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
56267 return DAG.getNode(ExtOp, DL, VT, Ext);
56268 }
56269 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
56270 InVec.getOperand(0).getValueType().is256BitVector() &&
56271 InVec.getOperand(1).getValueType().is256BitVector() &&
56272 InVec.getOperand(2).getValueType().is256BitVector()) {
56273 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
56274 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
56275 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
56276 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
56277 }
56278 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
56279 (SizeInBits == 128 || SizeInBits == 256)) {
56280 SDValue InVecSrc = InVec.getOperand(0);
56281 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
56282 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
56283 return DAG.getNode(InOpcode, DL, VT, Ext);
56284 }
56285 if ((InOpcode == X86ISD::CMPP || InOpcode == X86ISD::PCMPEQ ||
56286 InOpcode == X86ISD::PCMPGT) &&
56287 (IsExtractFree(InVec.getOperand(0)) ||
56288 IsExtractFree(InVec.getOperand(1))) &&
56289 SizeInBits == 128) {
56290 SDValue Ext0 =
56291 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56292 SDValue Ext1 =
56293 extractSubVector(InVec.getOperand(1), IdxVal, DAG, DL, SizeInBits);
56294 if (InOpcode == X86ISD::CMPP)
56295 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, InVec.getOperand(2));
56296 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1);
56297 }
56298 if (InOpcode == X86ISD::MOVDDUP &&
56299 (SizeInBits == 128 || SizeInBits == 256)) {
56300 SDValue Ext0 =
56301 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56302 return DAG.getNode(InOpcode, DL, VT, Ext0);
56303 }
56304 }
56305
56306 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
56307 // as this is very likely to fold into a shuffle/truncation.
56308 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
56309 InVecVT.getScalarSizeInBits() == 64 &&
56310 InVec.getConstantOperandAPInt(1) == 32) {
56311 SDValue Ext =
56312 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56313 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
56314 }
56315
56316 return SDValue();
56317}
56318
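// Combine SCALAR_TO_VECTOR: bypass a redundant 'and with 1' for v1i1, reduce
// v2i64 inserts of extended scalars to v4i32, and reuse an existing broadcast
// of the same scalar.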
56319 static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG) {
56320 EVT VT = N->getValueType(0);
56321 SDValue Src = N->getOperand(0);
56322 SDLoc DL(N);
56323
56324 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
56325 // This occurs frequently in our masked scalar intrinsic code and our
56326 // floating point select lowering with AVX512.
56327 // TODO: SimplifyDemandedBits instead?
56328 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
56329 isOneConstant(Src.getOperand(1)))
56330 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
56331
56332 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
56333 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56334 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
56335 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
56336 isNullConstant(Src.getOperand(1)))
56337 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
56338 Src.getOperand(1));
56339
56340 // Reduce v2i64 to v4i32 if the upper bits aren't needed or are known zero.
56341 // TODO: Move to DAGCombine/SimplifyDemandedBits?
56342 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
56343 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
56344 if (Op.getValueType() != MVT::i64)
56345 return SDValue();
56346 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
56347 if (Op.getOpcode() == Opc &&
56348 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
56349 return Op.getOperand(0);
56350 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
56351 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
56352 if (Ld->getExtensionType() == Ext &&
56353 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
56354 return Op;
56355 if (IsZeroExt) {
56356 KnownBits Known = DAG.computeKnownBits(Op);
56357 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
56358 return Op;
56359 }
56360 return SDValue();
56361 };
56362
56363 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
56364 return DAG.getBitcast(
56365 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
56366 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
56367
56368 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
56369 return DAG.getBitcast(
56370 VT,
56371 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
56372 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
56373 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
56374 }
56375
56376 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
56377 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
56378 Src.getOperand(0).getValueType() == MVT::x86mmx)
56379 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
56380
56381 // See if we're broadcasting the scalar value, in which case just reuse that.
56382 // Ensure the same SDValue from the SDNode use is being used.
56383 if (VT.getScalarType() == Src.getValueType())
56384 for (SDNode *User : Src->uses())
56385 if (User->getOpcode() == X86ISD::VBROADCAST &&
56386 Src == User->getOperand(0)) {
56387 unsigned SizeInBits = VT.getFixedSizeInBits();
56388 unsigned BroadcastSizeInBits =
56389 User->getValueSizeInBits(0).getFixedValue();
56390 if (BroadcastSizeInBits == SizeInBits)
56391 return SDValue(User, 0);
56392 if (BroadcastSizeInBits > SizeInBits)
56393 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
56394 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
56395 // coverage.
56396 }
56397
56398 return SDValue();
56399}
56400
56401// Simplify PMULDQ and PMULUDQ operations.
56402 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
56403 TargetLowering::DAGCombinerInfo &DCI,
56404 const X86Subtarget &Subtarget) {
56405 SDValue LHS = N->getOperand(0);
56406 SDValue RHS = N->getOperand(1);
56407
56408 // Canonicalize constant to RHS.
56409 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
56410 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
56411 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
56412
56413 // Multiply by zero.
56414 // Don't return RHS as it may contain UNDEFs.
56415 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
56416 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
56417
56418 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
56419 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56420 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
56421 return SDValue(N, 0);
56422
56423 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
56424 // convert it to any_extend_invec, due to the LegalOperations check, do the
56425 // conversion directly to a vector shuffle manually. This exposes combine
56426 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
56427 // combineX86ShufflesRecursively on SSE4.1 targets.
56428 // FIXME: This is basically a hack around several other issues related to
56429 // ANY_EXTEND_VECTOR_INREG.
56430 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
56431 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
56432 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
56433 LHS.getOperand(0).getValueType() == MVT::v4i32) {
56434 SDLoc dl(N);
56435 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
56436 LHS.getOperand(0), { 0, -1, 1, -1 });
56437 LHS = DAG.getBitcast(MVT::v2i64, LHS);
56438 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
56439 }
56440 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
56441 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
56442 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
56443 RHS.getOperand(0).getValueType() == MVT::v4i32) {
56444 SDLoc dl(N);
56445 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
56446 RHS.getOperand(0), { 0, -1, 1, -1 });
56447 RHS = DAG.getBitcast(MVT::v2i64, RHS);
56448 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
56449 }
56450
56451 return SDValue();
56452}
56453
56454// Simplify VPMADDUBSW/VPMADDWD operations.
56455 static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
56456 TargetLowering::DAGCombinerInfo &DCI) {
56457 EVT VT = N->getValueType(0);
56458 SDValue LHS = N->getOperand(0);
56459 SDValue RHS = N->getOperand(1);
56460
56461 // Multiply by zero.
56462 // Don't return LHS/RHS as it may contain UNDEFs.
56463 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
56464 ISD::isBuildVectorAllZeros(RHS.getNode()))
56465 return DAG.getConstant(0, SDLoc(N), VT);
56466
56467 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56468 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
56469 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
56470 return SDValue(N, 0);
56471
56472 return SDValue();
56473}
56474
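// Combine *_EXTEND_VECTOR_INREG: fold extending loads, remove redundant
// double extensions, expand zext of build_vectors, and try shuffle combining
// on SSE4.1+ targets.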
56475 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
56476 TargetLowering::DAGCombinerInfo &DCI,
56477 const X86Subtarget &Subtarget) {
56478 EVT VT = N->getValueType(0);
56479 SDValue In = N->getOperand(0);
56480 unsigned Opcode = N->getOpcode();
56481 unsigned InOpcode = In.getOpcode();
56482 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56483 SDLoc DL(N);
56484
56485 // Try to merge vector loads and extend_inreg to an extload.
56486 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
56487 In.hasOneUse()) {
56488 auto *Ld = cast<LoadSDNode>(In);
56489 if (Ld->isSimple()) {
56490 MVT SVT = In.getSimpleValueType().getVectorElementType();
56491 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
56492 ? ISD::SEXTLOAD
56493 : ISD::ZEXTLOAD;
56494 EVT MemVT = VT.changeVectorElementType(SVT);
56495 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
56496 SDValue Load = DAG.getExtLoad(
56497 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
56498 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
56499 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
56500 return Load;
56501 }
56502 }
56503 }
56504
56505 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
56506 if (Opcode == InOpcode)
56507 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
56508
56509 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
56510 // -> EXTEND_VECTOR_INREG(X).
56511 // TODO: Handle non-zero subvector indices.
56512 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
56513 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
56514 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
56515 In.getValueSizeInBits())
56516 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
56517
56518 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
56519 // TODO: Move to DAGCombine?
56520 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
56521 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
56522 In.getValueSizeInBits() == VT.getSizeInBits()) {
56523 unsigned NumElts = VT.getVectorNumElements();
56524 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
56525 EVT EltVT = In.getOperand(0).getValueType();
56526 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
56527 for (unsigned I = 0; I != NumElts; ++I)
56528 Elts[I * Scale] = In.getOperand(I);
56529 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
56530 }
56531
56532 // Attempt to combine as a shuffle on SSE41+ targets.
56533 if (Subtarget.hasSSE41()) {
56534 SDValue Op(N, 0);
56535 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
56536 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
56537 return Res;
56538 }
56539
56540 return SDValue();
56541}
56542
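// Combine mask shift nodes: a shift of an all-zeros vector is zero, and
// unused result elements can be simplified away.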
56543 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
56544 TargetLowering::DAGCombinerInfo &DCI) {
56545 EVT VT = N->getValueType(0);
56546
56547 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
56548 return DAG.getConstant(0, SDLoc(N), VT);
56549
56550 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56551 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
56552 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
56553 return SDValue(N, 0);
56554
56555 return SDValue();
56556}
56557
56558// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
56559 // Done as a combine because the fp16_to_fp and fp_to_fp16 lowerings produce
56560 // extra instructions between the conversions due to going to scalar and back.
56561 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
56562 const X86Subtarget &Subtarget) {
56563 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
56564 return SDValue();
56565
56566 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
56567 return SDValue();
56568
56569 if (N->getValueType(0) != MVT::f32 ||
56570 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
56571 return SDValue();
56572
56573 SDLoc dl(N);
56574 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
56575 N->getOperand(0).getOperand(0));
56576 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
56577 DAG.getTargetConstant(4, dl, MVT::i32));
56578 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
56579 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
56580 DAG.getIntPtrConstant(0, dl));
56581}
56582
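// Combine FP_EXTEND: extend bf16 sources via an integer shift into the high
// half of an i32, and lower f16 vector extensions through CVTPH2PS on F16C
// targets that lack AVX512-FP16.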
56583 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
56584 const X86Subtarget &Subtarget) {
56585 EVT VT = N->getValueType(0);
56586 bool IsStrict = N->isStrictFPOpcode();
56587 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
56588 EVT SrcVT = Src.getValueType();
56589
56590 SDLoc dl(N);
56591 if (SrcVT.getScalarType() == MVT::bf16) {
56592 if (!IsStrict && Src.getOpcode() == ISD::FP_ROUND &&
56593 Src.getOperand(0).getValueType() == VT)
56594 return Src.getOperand(0);
56595
56596 if (!SrcVT.isVector())
56597 return SDValue();
56598
56599 assert(!IsStrict && "Strict FP doesn't support BF16");
56600 if (VT.getVectorElementType() == MVT::f64) {
56601 MVT TmpVT = VT.getSimpleVT().changeVectorElementType(MVT::f32);
56602 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
56603 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
56604 }
56605 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
56606 MVT NVT = SrcVT.getSimpleVT().changeVectorElementType(MVT::i32);
56607 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
56608 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
56609 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
56610 return DAG.getBitcast(VT, Src);
56611 }
56612
56613 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
56614 return SDValue();
56615
56616 if (Subtarget.hasFP16())
56617 return SDValue();
56618
56619 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
56620 return SDValue();
56621
56622 if (VT.getVectorElementType() != MVT::f32 &&
56623 VT.getVectorElementType() != MVT::f64)
56624 return SDValue();
56625
56626 unsigned NumElts = VT.getVectorNumElements();
56627 if (NumElts == 1 || !isPowerOf2_32(NumElts))
56628 return SDValue();
56629
56630 // Convert the input to vXi16.
56631 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
56632 Src = DAG.getBitcast(IntVT, Src);
56633
56634 // Widen to at least 8 input elements.
56635 if (NumElts < 8) {
56636 unsigned NumConcats = 8 / NumElts;
56637 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
56638 : DAG.getConstant(0, dl, IntVT);
56639 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
56640 Ops[0] = Src;
56641 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
56642 }
56643
56644 // Destination is vXf32 with at least 4 elements.
56645 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
56646 std::max(4U, NumElts));
56647 SDValue Cvt, Chain;
56648 if (IsStrict) {
56649 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
56650 {N->getOperand(0), Src});
56651 Chain = Cvt.getValue(1);
56652 } else {
56653 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
56654 }
56655
56656 if (NumElts < 4) {
56657 assert(NumElts == 2 && "Unexpected size");
56658 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
56659 DAG.getIntPtrConstant(0, dl));
56660 }
56661
56662 if (IsStrict) {
56663 // Extend to the original VT if necessary.
56664 if (Cvt.getValueType() != VT) {
56665 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
56666 {Chain, Cvt});
56667 Chain = Cvt.getValue(1);
56668 }
56669 return DAG.getMergeValues({Cvt, Chain}, dl);
56670 }
56671
56672 // Extend to the original VT if necessary.
56673 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
56674}
56675
56676// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
56677// from. Limit this to cases where the loads have the same input chain and the
56678// output chains are unused. This avoids any memory ordering issues.
56679 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
56680 TargetLowering::DAGCombinerInfo &DCI) {
56681 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
56682 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
56683 "Unknown broadcast load type");
56684
56685 // Only do this if the chain result is unused.
56686 if (N->hasAnyUseOfValue(1))
56687 return SDValue();
56688
56689 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
56690
56691 SDValue Ptr = MemIntrin->getBasePtr();
56692 SDValue Chain = MemIntrin->getChain();
56693 EVT VT = N->getSimpleValueType(0);
56694 EVT MemVT = MemIntrin->getMemoryVT();
56695
56696 // Look at other users of our base pointer and try to find a wider broadcast.
56697 // The input chain and the size of the memory VT must match.
56698 for (SDNode *User : Ptr->uses())
56699 if (User != N && User->getOpcode() == N->getOpcode() &&
56700 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
56701 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
56702 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
56703 MemVT.getSizeInBits() &&
56704 !User->hasAnyUseOfValue(1) &&
56705 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
56706 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
56707 VT.getSizeInBits());
56708 Extract = DAG.getBitcast(VT, Extract);
56709 return DCI.CombineTo(N, Extract, SDValue(User, 1));
56710 }
56711
56712 return SDValue();
56713}
56714
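// Combine FP_ROUND: on AVX512-FP16 targets fuse v4i64 int-to-fp conversions
// feeding a v8f16 round, otherwise lower f32->f16 vector rounds through
// CVTPS2PH on F16C targets.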
56715 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
56716 const X86Subtarget &Subtarget) {
56717 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
56718 return SDValue();
56719
56720 bool IsStrict = N->isStrictFPOpcode();
56721 EVT VT = N->getValueType(0);
56722 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
56723 EVT SrcVT = Src.getValueType();
56724
56725 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
56726 SrcVT.getVectorElementType() != MVT::f32)
56727 return SDValue();
56728
56729 SDLoc dl(N);
56730
56731 SDValue Cvt, Chain;
56732 unsigned NumElts = VT.getVectorNumElements();
56733 if (Subtarget.hasFP16()) {
56734 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
56735 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
56736 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
56737 SDValue Cvt0, Cvt1;
56738 SDValue Op0 = Src.getOperand(0);
56739 SDValue Op1 = Src.getOperand(1);
56740 bool IsOp0Strict = Op0->isStrictFPOpcode();
56741 if (Op0.getOpcode() != Op1.getOpcode() ||
56742 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
56743 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
56744 return SDValue();
56745 }
56746 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
56747 if (IsStrict) {
56748 assert(IsOp0Strict && "Op0 must be strict node");
56749 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
56750 ? X86ISD::STRICT_CVTSI2P
56751 : X86ISD::STRICT_CVTUI2P;
56752 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
56753 {Op0.getOperand(0), Op0.getOperand(1)});
56754 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
56755 {Op1.getOperand(0), Op1.getOperand(1)});
56756 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
56757 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
56758 }
56759 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
56760 : X86ISD::CVTUI2P;
56761 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
56762 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
56763 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
56764 }
56765 return SDValue();
56766 }
56767
56768 if (NumElts == 1 || !isPowerOf2_32(NumElts))
56769 return SDValue();
56770
56771 // Widen to at least 4 input elements.
56772 if (NumElts < 4)
56773 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
56774 DAG.getConstantFP(0.0, dl, SrcVT));
56775
56776 // Destination is v8i16 with at least 8 elements.
56777 EVT CvtVT =
56778 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
56779 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
56780 if (IsStrict) {
56781 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
56782 {N->getOperand(0), Src, Rnd});
56783 Chain = Cvt.getValue(1);
56784 } else {
56785 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
56786 }
56787
56788 // Extract down to real number of elements.
56789 if (NumElts < 8) {
56790 EVT IntVT = VT.changeVectorElementTypeToInteger();
56791 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
56792 DAG.getIntPtrConstant(0, dl));
56793 }
56794
56795 Cvt = DAG.getBitcast(VT, Cvt);
56796
56797 if (IsStrict)
56798 return DAG.getMergeValues({Cvt, Chain}, dl);
56799
56800 return Cvt;
56801}
56802
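// Combine MOVDQ2Q: fold a simple vector load feeding MOVDQ2Q into a direct
// x86mmx load.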
56803 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
56804 SDValue Src = N->getOperand(0);
56805
56806 // Turn MOVDQ2Q+simple_load into an mmx load.
56807 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
56808 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
56809
56810 if (LN->isSimple()) {
56811 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
56812 LN->getBasePtr(),
56813 LN->getPointerInfo(),
56814 LN->getOriginalAlign(),
56815 LN->getMemOperand()->getFlags());
56816 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
56817 return NewLd;
56818 }
56819 }
56820
56821 return SDValue();
56822}
56823
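// Simplify PDEP operations using the demanded bits of the result.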
56824 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
56825 TargetLowering::DAGCombinerInfo &DCI) {
56826 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
56827 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56828 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
56829 return SDValue(N, 0);
56830
56831 return SDValue();
56832}
56833
56834 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
56835 DAGCombinerInfo &DCI) const {
56836 SelectionDAG &DAG = DCI.DAG;
56837 switch (N->getOpcode()) {
56838 // clang-format off
56839 default: break;
56840 case ISD::SCALAR_TO_VECTOR:
56841 return combineScalarToVector(N, DAG);
56842 case ISD::EXTRACT_VECTOR_ELT:
56843 case X86ISD::PEXTRW:
56844 case X86ISD::PEXTRB:
56845 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
56846 case ISD::CONCAT_VECTORS:
56847 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
56848 case ISD::INSERT_SUBVECTOR:
56849 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
56850 case ISD::EXTRACT_SUBVECTOR:
56851 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
56852 case ISD::VSELECT:
56853 case ISD::SELECT:
56854 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
56855 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
56856 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
56857 case X86ISD::CMP: return combineCMP(N, DAG, Subtarget);
56858 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
56859 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
56860 case X86ISD::ADD:
56861 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
56862 case X86ISD::SBB: return combineSBB(N, DAG);
56863 case X86ISD::ADC: return combineADC(N, DAG, DCI);
56864 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
56865 case ISD::SHL: return combineShiftLeft(N, DAG);
56866 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
56867 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
56868 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
56869 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
56870 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
56871 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
56872 case X86ISD::BEXTR:
56873 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
56874 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
56875 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
56876 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
56877 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
56878 case X86ISD::VEXTRACT_STORE:
56879 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
56880 case ISD::SINT_TO_FP:
56881 case ISD::STRICT_SINT_TO_FP:
56882 return combineSIntToFP(N, DAG, DCI, Subtarget);
56883 case ISD::UINT_TO_FP:
56884 case ISD::STRICT_UINT_TO_FP:
56885 return combineUIntToFP(N, DAG, Subtarget);
56886 case ISD::LRINT:
56887 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
56888 case ISD::FADD:
56889 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
56890 case X86ISD::VFCMULC:
56891 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
56892 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
56893 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
56894 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
56895 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
56896 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
56897 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
56898 case X86ISD::FXOR:
56899 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
56900 case X86ISD::FMIN:
56901 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
56902 case ISD::FMINNUM:
56903 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
56904 case X86ISD::CVTSI2P:
56905 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
56906 case X86ISD::CVTP2SI:
56907 case X86ISD::CVTP2UI:
56908 case X86ISD::STRICT_CVTTP2SI:
56909 case X86ISD::CVTTP2SI:
56910 case X86ISD::STRICT_CVTTP2UI:
56911 case X86ISD::CVTTP2UI:
56912 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
56913 case X86ISD::STRICT_CVTPH2PS:
56914 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
56915 case X86ISD::BT: return combineBT(N, DAG, DCI);
56916 case ISD::ANY_EXTEND:
56917 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
56918 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
56919 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
56920 case ISD::ANY_EXTEND_VECTOR_INREG:
56921 case ISD::SIGN_EXTEND_VECTOR_INREG:
56922 case ISD::ZERO_EXTEND_VECTOR_INREG:
56923 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
56924 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
56925 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
56926 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
56927 case X86ISD::PACKSS:
56928 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
56929 case X86ISD::HADD:
56930 case X86ISD::HSUB:
56931 case X86ISD::FHADD:
56932 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
56933 case X86ISD::VSHL:
56934 case X86ISD::VSRA:
56935 case X86ISD::VSRL:
56936 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
56937 case X86ISD::VSHLI:
56938 case X86ISD::VSRAI:
56939 case X86ISD::VSRLI:
56940 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
56941 case ISD::INSERT_VECTOR_ELT:
56942 case X86ISD::PINSRB:
56943 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
56944 case X86ISD::SHUFP: // Handle all target specific shuffles
56945 case X86ISD::INSERTPS:
56946 case X86ISD::EXTRQI:
56947 case X86ISD::INSERTQI:
56948 case X86ISD::VALIGN:
56949 case X86ISD::PALIGNR:
56950 case X86ISD::VSHLDQ:
56951 case X86ISD::VSRLDQ:
56952 case X86ISD::BLENDI:
56953 case X86ISD::UNPCKH:
56954 case X86ISD::UNPCKL:
56955 case X86ISD::MOVHLPS:
56956 case X86ISD::MOVLHPS:
56957 case X86ISD::PSHUFB:
56958 case X86ISD::PSHUFD:
56959 case X86ISD::PSHUFHW:
56960 case X86ISD::PSHUFLW:
56961 case X86ISD::MOVSHDUP:
56962 case X86ISD::MOVSLDUP:
56963 case X86ISD::MOVDDUP:
56964 case X86ISD::MOVSS:
56965 case X86ISD::MOVSD:
56966 case X86ISD::MOVSH:
56967 case X86ISD::VBROADCAST:
56968 case X86ISD::VPPERM:
56969 case X86ISD::VPERMI:
56970 case X86ISD::VPERMV:
56971 case X86ISD::VPERMV3:
56972 case X86ISD::VPERMIL2:
56973 case X86ISD::VPERMILPI:
56974 case X86ISD::VPERMILPV:
56975 case X86ISD::VPERM2X128:
56976 case X86ISD::SHUF128:
56977 case X86ISD::VZEXT_MOVL:
56978 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
56979 case X86ISD::FMADD_RND:
56980 case X86ISD::FMSUB:
56981 case X86ISD::STRICT_FMSUB:
56982 case X86ISD::FMSUB_RND:
56983 case X86ISD::FNMADD:
56984 case X86ISD::STRICT_FNMADD:
56985 case X86ISD::FNMADD_RND:
56986 case X86ISD::FNMSUB:
56987 case X86ISD::STRICT_FNMSUB:
56988 case X86ISD::FNMSUB_RND:
56989 case ISD::FMA:
56990 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
56991 case X86ISD::FMADDSUB_RND:
56992 case X86ISD::FMSUBADD_RND:
56993 case X86ISD::FMADDSUB:
56994 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
56995 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
56996 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
56997 case X86ISD::MGATHER:
56998 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
56999 case ISD::MGATHER:
57000 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
57001 case X86ISD::PCMPEQ:
57002 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
57003 case X86ISD::PMULDQ:
57004 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
57005 case X86ISD::VPMADDUBSW:
57006 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
57007 case X86ISD::KSHIFTL:
57008 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
57009 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
57010 case ISD::STRICT_FP_EXTEND:
57011 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
57012 case ISD::STRICT_FP_ROUND:
57013 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
57014 case X86ISD::VBROADCAST_LOAD:
57015 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
57016 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
57017 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
57018 // clang-format on
57019 }
57020
57021 return SDValue();
57022}
57023
57024 bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
57025 return false;
57026}
57027
57028// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
57029 bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
57030 EVT ExtVT) const {
57031 return Subtarget.hasAVX512() || !VT.isVector();
57032}
57033
57034bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
57035 if (!isTypeLegal(VT))
57036 return false;
57037
57038 // There are no vXi8 shifts.
57039 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
57040 return false;
57041
57042 // TODO: Almost no 8-bit ops are desirable because they have no actual
57043 // size/speed advantages vs. 32-bit ops, but they do have a major
57044 // potential disadvantage by causing partial register stalls.
57045 //
57046 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
57047 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
57048 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
57049 // check for a constant operand to the multiply.
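 // Illustrative example: a 'mul i8 %x, 3' is usually better promoted to i32
 // and lowered via LEA (e.g. lea (%reg,%reg,2)) than kept as an 8-bit
 // multiply; the exact lowering depends on the constant and subtarget.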
57050 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
57051 return false;
57052
57053 // i16 instruction encodings are longer and some i16 instructions are slow,
57054 // so those are not desirable.
57055 if (VT == MVT::i16) {
57056 switch (Opc) {
57057 default:
57058 break;
57059 case ISD::LOAD:
57060 case ISD::SIGN_EXTEND:
57061 case ISD::ZERO_EXTEND:
57062 case ISD::ANY_EXTEND:
57063 case ISD::SHL:
57064 case ISD::SRA:
57065 case ISD::SRL:
57066 case ISD::SUB:
57067 case ISD::ADD:
57068 case ISD::MUL:
57069 case ISD::AND:
57070 case ISD::OR:
57071 case ISD::XOR:
57072 return false;
57073 }
57074 }
57075
57076 // Any legal type not explicitly accounted for above here is desirable.
57077 return true;
57078}
57079
57080 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
57081 SDValue Value, SDValue Addr,
57082 int JTI,
57083 SelectionDAG &DAG) const {
57084 const Module *M = DAG.getMachineFunction().getMMI().getModule();
57085 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
57086 if (IsCFProtectionSupported) {
57087 // In case control-flow branch protection is enabled, we need to add
57088 // notrack prefix to the indirect branch.
57089 // In order to do that we create NT_BRIND SDNode.
57090 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
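 // Illustrative effect: the jump-table dispatch is then emitted as something
 // like 'notrack jmpq *%rax' instead of a plain 'jmpq *%rax', so the CET
 // indirect-branch-tracking check is suppressed for this compiler-generated
 // branch (sketch; the exact register and syntax depend on selection).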
57091 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Value, dl);
57092 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, JTInfo, Addr);
57093 }
57094
57095 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
57096}
57097
57098 TargetLowering::AndOrSETCCFoldKind
57099 X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
57100 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
57102 EVT VT = LogicOp->getValueType(0);
57103 EVT OpVT = SETCC0->getOperand(0).getValueType();
57104 if (!VT.isInteger())
57105 return AndOrSETCCFoldKind::None;
57106
57107 if (VT.isVector())
57112
57113 // Don't use `NotAnd` as even though `not` is generally shorter code size than
57114 // `add`, `add` can lower to LEA which can save moves / spills. Any case where
57115 // `NotAnd` applies, `AddAnd` does as well.
57116 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
57117 // if we change that to `andn Y, X` it may be worth prefering `NotAnd` here.
57118 return AndOrSETCCFoldKind::AddAnd;
57119}
57120
57121 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
57122 EVT VT = Op.getValueType();
57123 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
57124 isa<ConstantSDNode>(Op.getOperand(1));
57125
57126 // i16 is legal, but undesirable since i16 instruction encodings are longer
57127 // and some i16 instructions are slow.
57128 // 8-bit multiply-by-constant can usually be expanded to something cheaper
57129 // using LEA and/or other ALU ops.
57130 if (VT != MVT::i16 && !Is8BitMulByConstant)
57131 return false;
57132
57133 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
57134 if (!Op.hasOneUse())
57135 return false;
57136 SDNode *User = *Op->use_begin();
57137 if (User->getOpcode() != ISD::STORE)
57138 return false;
57139 auto *Ld = cast<LoadSDNode>(Load);
57140 auto *St = cast<StoreSDNode>(User);
57141 return Ld->getBasePtr() == St->getBasePtr();
57142 };
57143
57144 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
57145 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
57146 return false;
57147 if (!Op.hasOneUse())
57148 return false;
57149 SDNode *User = *Op->use_begin();
57150 if (User->getOpcode() != ISD::ATOMIC_STORE)
57151 return false;
57152 auto *Ld = cast<AtomicSDNode>(Load);
57153 auto *St = cast<AtomicSDNode>(User);
57154 return Ld->getBasePtr() == St->getBasePtr();
57155 };
57156
57157 bool Commute = false;
57158 switch (Op.getOpcode()) {
57159 default: return false;
57160 case ISD::SIGN_EXTEND:
57161 case ISD::ZERO_EXTEND:
57162 case ISD::ANY_EXTEND:
57163 break;
57164 case ISD::SHL:
57165 case ISD::SRA:
57166 case ISD::SRL: {
57167 SDValue N0 = Op.getOperand(0);
57168 // Look out for (store (shl (load), x)).
57169 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
57170 return false;
57171 break;
57172 }
57173 case ISD::ADD:
57174 case ISD::MUL:
57175 case ISD::AND:
57176 case ISD::OR:
57177 case ISD::XOR:
57178 Commute = true;
57179 [[fallthrough]];
57180 case ISD::SUB: {
57181 SDValue N0 = Op.getOperand(0);
57182 SDValue N1 = Op.getOperand(1);
57183 // Avoid disabling potential load folding opportunities.
57184 if (X86::mayFoldLoad(N1, Subtarget) &&
57185 (!Commute || !isa<ConstantSDNode>(N0) ||
57186 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
57187 return false;
57188 if (X86::mayFoldLoad(N0, Subtarget) &&
57189 ((Commute && !isa<ConstantSDNode>(N1)) ||
57190 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
57191 return false;
57192 if (IsFoldableAtomicRMW(N0, Op) ||
57193 (Commute && IsFoldableAtomicRMW(N1, Op)))
57194 return false;
57195 }
57196 }
57197
57198 PVT = MVT::i32;
57199 return true;
57200}
57201
57202//===----------------------------------------------------------------------===//
57203// X86 Inline Assembly Support
57204//===----------------------------------------------------------------------===//
57205
57206// Helper to match a string separated by whitespace.
57207 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
57208 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
57209
57210 for (StringRef Piece : Pieces) {
57211 if (!S.starts_with(Piece)) // Check if the piece matches.
57212 return false;
57213
57214 S = S.substr(Piece.size());
57215 StringRef::size_type Pos = S.find_first_not_of(" \t");
57216 if (Pos == 0) // We matched a prefix.
57217 return false;
57218
57219 S = S.substr(Pos);
57220 }
57221
57222 return S.empty();
57223}
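// Usage sketch: matchAsm("bswap $0", {"bswap", "$0"}) and
// matchAsm("  bswapl\t$0", {"bswapl", "$0"}) both return true, while a piece
// that only covers a prefix of a token (e.g. {"bswap"} against "bswapl $0")
// is rejected by the Pos == 0 check above.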
57224
57225 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
57226
57227 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
57228 if (llvm::is_contained(AsmPieces, "~{cc}") &&
57229 llvm::is_contained(AsmPieces, "~{flags}") &&
57230 llvm::is_contained(AsmPieces, "~{fpsr}")) {
57231
57232 if (AsmPieces.size() == 3)
57233 return true;
57234 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
57235 return true;
57236 }
57237 }
57238 return false;
57239}
57240
57241 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
57242 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
57243
57244 const std::string &AsmStr = IA->getAsmString();
57245
57246 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
57247 if (!Ty || Ty->getBitWidth() % 16 != 0)
57248 return false;
57249
57250 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
57251 SmallVector<StringRef, 4> AsmPieces;
57252 SplitString(AsmStr, AsmPieces, ";\n");
57253
57254 switch (AsmPieces.size()) {
57255 default: return false;
57256 case 1:
57257 // FIXME: this should verify that we are targeting a 486 or better. If not,
57258 // we will turn this bswap into something that will be lowered to logical
57259 // ops instead of emitting the bswap asm. For now, we don't support 486 or
57260 // lower so don't worry about this.
57261 // bswap $0
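 // Roughly, this matches user code such as
 //   asm("bswap $0" : "=r"(x) : "0"(x));
 // on a 32/64-bit integer and replaces the whole asm call with the
 // llvm.bswap intrinsic (illustrative example; only the spellings checked
 // below are accepted).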
57262 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
57263 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
57264 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
57265 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
57266 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
57267 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
57268 // No need to check constraints, nothing other than the equivalent of
57269 // "=r,0" would be valid here.
57270 return IntrinsicLowering::LowerToByteSwap(CI);
57271 }
57272
57273 // rorw $$8, ${0:w} --> llvm.bswap.i16
57274 if (CI->getType()->isIntegerTy(16) &&
57275 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57276 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
57277 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
57278 AsmPieces.clear();
57279 StringRef ConstraintsStr = IA->getConstraintString();
57280 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57281 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57282 if (clobbersFlagRegisters(AsmPieces))
57283 return IntrinsicLowering::LowerToByteSwap(CI);
57284 }
57285 break;
57286 case 3:
57287 if (CI->getType()->isIntegerTy(32) &&
57288 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57289 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
57290 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
57291 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
57292 AsmPieces.clear();
57293 StringRef ConstraintsStr = IA->getConstraintString();
57294 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57295 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57296 if (clobbersFlagRegisters(AsmPieces))
57297 return IntrinsicLowering::LowerToByteSwap(CI);
57298 }
57299
57300 if (CI->getType()->isIntegerTy(64)) {
57301 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
57302 if (Constraints.size() >= 2 &&
57303 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
57304 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
57305 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
57306 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
57307 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
57308 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
57309 return IntrinsicLowering::LowerToByteSwap(CI);
57310 }
57311 }
57312 break;
57313 }
57314 return false;
57315}
57316
57317 static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
57318 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
57319 .Case("{@cca}", X86::COND_A)
57320 .Case("{@ccae}", X86::COND_AE)
57321 .Case("{@ccb}", X86::COND_B)
57322 .Case("{@ccbe}", X86::COND_BE)
57323 .Case("{@ccc}", X86::COND_B)
57324 .Case("{@cce}", X86::COND_E)
57325 .Case("{@ccz}", X86::COND_E)
57326 .Case("{@ccg}", X86::COND_G)
57327 .Case("{@ccge}", X86::COND_GE)
57328 .Case("{@ccl}", X86::COND_L)
57329 .Case("{@ccle}", X86::COND_LE)
57330 .Case("{@ccna}", X86::COND_BE)
57331 .Case("{@ccnae}", X86::COND_B)
57332 .Case("{@ccnb}", X86::COND_AE)
57333 .Case("{@ccnbe}", X86::COND_A)
57334 .Case("{@ccnc}", X86::COND_AE)
57335 .Case("{@ccne}", X86::COND_NE)
57336 .Case("{@ccnz}", X86::COND_NE)
57337 .Case("{@ccng}", X86::COND_LE)
57338 .Case("{@ccnge}", X86::COND_L)
57339 .Case("{@ccnl}", X86::COND_GE)
57340 .Case("{@ccnle}", X86::COND_G)
57341 .Case("{@ccno}", X86::COND_NO)
57342 .Case("{@ccnp}", X86::COND_NP)
57343 .Case("{@ccns}", X86::COND_NS)
57344 .Case("{@cco}", X86::COND_O)
57345 .Case("{@ccp}", X86::COND_P)
57346 .Case("{@ccs}", X86::COND_S)
57347 .Default(X86::COND_INVALID);
57348 return Cond;
57349}
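// For reference (illustrative), these strings come from GCC-style flag-output
// constraints, e.g.
//   asm("cmpq %2, %1" : "=@ccl"(lt) : "r"(a), "r"(b));
// which requests the signed less-than condition of EFLAGS as an output; note
// that synonyms such as @ccc and @ccnae fold onto the same X86::CondCode.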
57350
57351/// Given a constraint letter, return the type of constraint for this target.
57352 X86TargetLowering::ConstraintType
57353 X86TargetLowering::getConstraintType(StringRef Constraint) const {
57354 if (Constraint.size() == 1) {
57355 switch (Constraint[0]) {
57356 case 'R':
57357 case 'q':
57358 case 'Q':
57359 case 'f':
57360 case 't':
57361 case 'u':
57362 case 'y':
57363 case 'x':
57364 case 'v':
57365 case 'l':
57366 case 'k': // AVX512 masking registers.
57367 return C_RegisterClass;
57368 case 'a':
57369 case 'b':
57370 case 'c':
57371 case 'd':
57372 case 'S':
57373 case 'D':
57374 case 'A':
57375 return C_Register;
57376 case 'I':
57377 case 'J':
57378 case 'K':
57379 case 'N':
57380 case 'G':
57381 case 'L':
57382 case 'M':
57383 return C_Immediate;
57384 case 'C':
57385 case 'e':
57386 case 'Z':
57387 return C_Other;
57388 default:
57389 break;
57390 }
57391 }
57392 else if (Constraint.size() == 2) {
57393 switch (Constraint[0]) {
57394 default:
57395 break;
57396 case 'W':
57397 if (Constraint[1] != 's')
57398 break;
57399 return C_Other;
57400 case 'Y':
57401 switch (Constraint[1]) {
57402 default:
57403 break;
57404 case 'z':
57405 return C_Register;
57406 case 'i':
57407 case 'm':
57408 case 'k':
57409 case 't':
57410 case '2':
57411 return C_RegisterClass;
57412 }
57413 }
57414 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
57415 return C_Other;
57416 return TargetLowering::getConstraintType(Constraint);
57417}
57418
57419/// Examine constraint type and operand type and determine a weight value.
57420/// This object must already have been set up with the operand type
57421/// and the current alternative constraint selected.
57422 TargetLowering::ConstraintWeight
57423 X86TargetLowering::getSingleConstraintMatchWeight(
57424 AsmOperandInfo &Info, const char *Constraint) const {
57425 ConstraintWeight Wt = CW_Invalid;
57426 Value *CallOperandVal = Info.CallOperandVal;
57427 // If we don't have a value, we can't do a match,
57428 // but allow it at the lowest weight.
57429 if (!CallOperandVal)
57430 return CW_Default;
57431 Type *Ty = CallOperandVal->getType();
57432 // Look at the constraint type.
57433 switch (*Constraint) {
57434 default:
57435 Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
57436 [[fallthrough]];
57437 case 'R':
57438 case 'q':
57439 case 'Q':
57440 case 'a':
57441 case 'b':
57442 case 'c':
57443 case 'd':
57444 case 'S':
57445 case 'D':
57446 case 'A':
57447 if (CallOperandVal->getType()->isIntegerTy())
57448 Wt = CW_SpecificReg;
57449 break;
57450 case 'f':
57451 case 't':
57452 case 'u':
57453 if (Ty->isFloatingPointTy())
57454 Wt = CW_SpecificReg;
57455 break;
57456 case 'y':
57457 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
57458 Wt = CW_SpecificReg;
57459 break;
57460 case 'Y':
57461 if (StringRef(Constraint).size() != 2)
57462 break;
57463 switch (Constraint[1]) {
57464 default:
57465 return CW_Invalid;
57466 // XMM0
57467 case 'z':
57468 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
57469 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
57470 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
57471 return CW_SpecificReg;
57472 return CW_Invalid;
57473 // Conditional OpMask regs (AVX512)
57474 case 'k':
57475 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
57476 return CW_Register;
57477 return CW_Invalid;
57478 // Any MMX reg
57479 case 'm':
57480 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
57481 return Wt;
57482 return CW_Invalid;
57483 // Any SSE reg when ISA >= SSE2, same as 'x'
57484 case 'i':
57485 case 't':
57486 case '2':
57487 if (!Subtarget.hasSSE2())
57488 return CW_Invalid;
57489 break;
57490 }
57491 break;
57492 case 'v':
57493 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
57494 Wt = CW_Register;
57495 [[fallthrough]];
57496 case 'x':
57497 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
57498 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
57499 Wt = CW_Register;
57500 break;
57501 case 'k':
57502 // Enable conditional vector operations using %k<#> registers.
57503 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
57504 Wt = CW_Register;
57505 break;
57506 case 'I':
57507 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
57508 if (C->getZExtValue() <= 31)
57509 Wt = CW_Constant;
57510 break;
57511 case 'J':
57512 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57513 if (C->getZExtValue() <= 63)
57514 Wt = CW_Constant;
57515 break;
57516 case 'K':
57517 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57518 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
57519 Wt = CW_Constant;
57520 break;
57521 case 'L':
57522 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57523 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
57524 Wt = CW_Constant;
57525 break;
57526 case 'M':
57527 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57528 if (C->getZExtValue() <= 3)
57529 Wt = CW_Constant;
57530 break;
57531 case 'N':
57532 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57533 if (C->getZExtValue() <= 0xff)
57534 Wt = CW_Constant;
57535 break;
57536 case 'G':
57537 case 'C':
57538 if (isa<ConstantFP>(CallOperandVal))
57539 Wt = CW_Constant;
57540 break;
57541 case 'e':
57542 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57543 if ((C->getSExtValue() >= -0x80000000LL) &&
57544 (C->getSExtValue() <= 0x7fffffffLL))
57545 Wt = CW_Constant;
57546 break;
57547 case 'Z':
57548 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57549 if (C->getZExtValue() <= 0xffffffff)
57550 Wt = CW_Constant;
57551 break;
57552 }
57553 return Wt;
57554}
57555
57556/// Try to replace an X constraint, which matches anything, with another that
57557/// has more specific requirements based on the type of the corresponding
57558/// operand.
57559 const char *X86TargetLowering::
57560 LowerXConstraint(EVT ConstraintVT) const {
57561 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
57562 // 'f' like normal targets.
57563 if (ConstraintVT.isFloatingPoint()) {
57564 if (Subtarget.hasSSE1())
57565 return "x";
57566 }
57567
57568 return TargetLowering::LowerXConstraint(ConstraintVT);
57569}
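// Sketch of the effect: with SSE1 available, an "=X" output of float/double
// type is handled as if it had been written with the 'x' (XMM) constraint;
// otherwise the generic lowering (typically x87 'f') is used.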
57570
57571// Lower @cc targets via setcc.
57572 SDValue X86TargetLowering::LowerAsmOutputForConstraint(
57573 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
57574 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
57575 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
57576 if (Cond == X86::COND_INVALID)
57577 return SDValue();
57578 // Check that return type is valid.
57579 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
57580 OpInfo.ConstraintVT.getSizeInBits() < 8)
57581 report_fatal_error("Glue output operand is of invalid type");
57582
57583 // Get EFLAGS register. Only update chain when copyfrom is glued.
57584 if (Glue.getNode()) {
57585 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
57586 Chain = Glue.getValue(1);
57587 } else
57588 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
57589 // Extract CC code.
57590 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
57591 // Extend to 32-bits
57592 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
57593
57594 return Result;
57595}
57596
57597/// Lower the specified operand into the Ops vector.
57598/// If it is invalid, don't add anything to Ops.
57599 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
57600 StringRef Constraint,
57601 std::vector<SDValue> &Ops,
57602 SelectionDAG &DAG) const {
57603 SDValue Result;
57604 char ConstraintLetter = Constraint[0];
57605 switch (ConstraintLetter) {
57606 default: break;
57607 case 'I':
57608 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57609 if (C->getZExtValue() <= 31) {
57610 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57611 Op.getValueType());
57612 break;
57613 }
57614 }
57615 return;
57616 case 'J':
57617 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57618 if (C->getZExtValue() <= 63) {
57619 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57620 Op.getValueType());
57621 break;
57622 }
57623 }
57624 return;
57625 case 'K':
57626 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57627 if (isInt<8>(C->getSExtValue())) {
57628 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57629 Op.getValueType());
57630 break;
57631 }
57632 }
57633 return;
57634 case 'L':
57635 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57636 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
57637 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
57638 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
57639 Op.getValueType());
57640 break;
57641 }
57642 }
57643 return;
57644 case 'M':
57645 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57646 if (C->getZExtValue() <= 3) {
57647 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57648 Op.getValueType());
57649 break;
57650 }
57651 }
57652 return;
57653 case 'N':
57654 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57655 if (C->getZExtValue() <= 255) {
57656 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57657 Op.getValueType());
57658 break;
57659 }
57660 }
57661 return;
57662 case 'O':
57663 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57664 if (C->getZExtValue() <= 127) {
57665 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57666 Op.getValueType());
57667 break;
57668 }
57669 }
57670 return;
57671 case 'e': {
57672 // 32-bit signed value
57673 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57674 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
57675 C->getSExtValue())) {
57676 // Widen to 64 bits here to get it sign extended.
57677 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
57678 break;
57679 }
57680 // FIXME gcc accepts some relocatable values here too, but only in certain
57681 // memory models; it's complicated.
57682 }
57683 return;
57684 }
57685 case 'W': {
57686 assert(Constraint[1] == 's');
57687 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
57688 // offset.
57689 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
57690 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
57691 BA->getValueType(0)));
57692 } else {
57693 int64_t Offset = 0;
57694 if (Op->getOpcode() == ISD::ADD &&
57695 isa<ConstantSDNode>(Op->getOperand(1))) {
57696 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
57697 Op = Op->getOperand(0);
57698 }
57699 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
57700 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
57701 GA->getValueType(0), Offset));
57702 }
57703 return;
57704 }
57705 case 'Z': {
57706 // 32-bit unsigned value
57707 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57708 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
57709 C->getZExtValue())) {
57710 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57711 Op.getValueType());
57712 break;
57713 }
57714 }
57715 // FIXME gcc accepts some relocatable values here too, but only in certain
57716 // memory models; it's complicated.
57717 return;
57718 }
57719 case 'i': {
57720 // Literal immediates are always ok.
57721 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
57722 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
57723 BooleanContent BCont = getBooleanContents(MVT::i64);
57724 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
57725 : ISD::SIGN_EXTEND;
57726 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
57727 : CST->getSExtValue();
57728 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
57729 break;
57730 }
57731
57732 // In any sort of PIC mode addresses need to be computed at runtime by
57733 // adding in a register or some sort of table lookup. These can't
57734 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
57735 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
57736 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
57737 return;
57738
57739 // If we are in non-pic codegen mode, we allow the address of a global (with
57740 // an optional displacement) to be used with 'i'.
57741 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
57742 // If we require an extra load to get this address, as in PIC mode, we
57743 // can't accept it.
57744 if (isGlobalStubReference(
57745 Subtarget.classifyGlobalReference(GA->getGlobal())))
57746 return;
57747 break;
57748 }
57749 }
57750
57751 if (Result.getNode()) {
57752 Ops.push_back(Result);
57753 return;
57754 }
57755 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
57756}
57757
57758/// Check if \p RC is a general purpose register class.
57759/// I.e., GR* or one of their variant.
57760static bool isGRClass(const TargetRegisterClass &RC) {
57761 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
57762 RC.hasSuperClassEq(&X86::GR16RegClass) ||
57763 RC.hasSuperClassEq(&X86::GR32RegClass) ||
57764 RC.hasSuperClassEq(&X86::GR64RegClass) ||
57765 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
57766}
57767
57768/// Check if \p RC is a vector register class.
57769/// I.e., FR* / VR* or one of their variant.
57770static bool isFRClass(const TargetRegisterClass &RC) {
57771 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
57772 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
57773 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
57774 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
57775 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
57776 RC.hasSuperClassEq(&X86::VR512RegClass);
57777}
57778
57779/// Check if \p RC is a mask register class.
57780/// I.e., VK* or one of their variant.
57781static bool isVKClass(const TargetRegisterClass &RC) {
57782 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
57783 RC.hasSuperClassEq(&X86::VK2RegClass) ||
57784 RC.hasSuperClassEq(&X86::VK4RegClass) ||
57785 RC.hasSuperClassEq(&X86::VK8RegClass) ||
57786 RC.hasSuperClassEq(&X86::VK16RegClass) ||
57787 RC.hasSuperClassEq(&X86::VK32RegClass) ||
57788 RC.hasSuperClassEq(&X86::VK64RegClass);
57789}
57790
57791std::pair<unsigned, const TargetRegisterClass *>
57792 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
57793 StringRef Constraint,
57794 MVT VT) const {
57795 // First, see if this is a constraint that directly corresponds to an LLVM
57796 // register class.
57797 if (Constraint.size() == 1) {
57798 // GCC Constraint Letters
57799 switch (Constraint[0]) {
57800 default: break;
57801 // 'A' means [ER]AX + [ER]DX.
57802 case 'A':
57803 if (Subtarget.is64Bit())
57804 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
57805 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
57806 "Expecting 64, 32 or 16 bit subtarget");
57807 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
57808
57809 // TODO: Slight differences here in allocation order and leaving
57810 // RIP in the class. Do they matter any more here than they do
57811 // in the normal allocation?
57812 case 'k':
57813 if (Subtarget.hasAVX512()) {
57814 if (VT == MVT::v1i1 || VT == MVT::i1)
57815 return std::make_pair(0U, &X86::VK1RegClass);
57816 if (VT == MVT::v8i1 || VT == MVT::i8)
57817 return std::make_pair(0U, &X86::VK8RegClass);
57818 if (VT == MVT::v16i1 || VT == MVT::i16)
57819 return std::make_pair(0U, &X86::VK16RegClass);
57820 }
57821 if (Subtarget.hasBWI()) {
57822 if (VT == MVT::v32i1 || VT == MVT::i32)
57823 return std::make_pair(0U, &X86::VK32RegClass);
57824 if (VT == MVT::v64i1 || VT == MVT::i64)
57825 return std::make_pair(0U, &X86::VK64RegClass);
57826 }
57827 break;
57828 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
57829 if (Subtarget.is64Bit()) {
57830 if (VT == MVT::i8 || VT == MVT::i1)
57831 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
57832 if (VT == MVT::i16)
57833 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
57834 if (VT == MVT::i32 || VT == MVT::f32)
57835 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
57836 if (VT != MVT::f80 && !VT.isVector())
57837 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
57838 break;
57839 }
57840 [[fallthrough]];
57841 // 32-bit fallthrough
57842 case 'Q': // Q_REGS
57843 if (VT == MVT::i8 || VT == MVT::i1)
57844 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
57845 if (VT == MVT::i16)
57846 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
57847 if (VT == MVT::i32 || VT == MVT::f32 ||
57848 (!VT.isVector() && !Subtarget.is64Bit()))
57849 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
57850 if (VT != MVT::f80 && !VT.isVector())
57851 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
57852 break;
57853 case 'r': // GENERAL_REGS
57854 case 'l': // INDEX_REGS
57855 if (VT == MVT::i8 || VT == MVT::i1)
57856 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
57857 if (VT == MVT::i16)
57858 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
57859 if (VT == MVT::i32 || VT == MVT::f32 ||
57860 (!VT.isVector() && !Subtarget.is64Bit()))
57861 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
57862 if (VT != MVT::f80 && !VT.isVector())
57863 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
57864 break;
57865 case 'R': // LEGACY_REGS
57866 if (VT == MVT::i8 || VT == MVT::i1)
57867 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
57868 if (VT == MVT::i16)
57869 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
57870 if (VT == MVT::i32 || VT == MVT::f32 ||
57871 (!VT.isVector() && !Subtarget.is64Bit()))
57872 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
57873 if (VT != MVT::f80 && !VT.isVector())
57874 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
57875 break;
57876 case 'f': // FP Stack registers.
57877 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
57878 // value to the correct fpstack register class.
57879 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
57880 return std::make_pair(0U, &X86::RFP32RegClass);
57881 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
57882 return std::make_pair(0U, &X86::RFP64RegClass);
57883 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
57884 return std::make_pair(0U, &X86::RFP80RegClass);
57885 break;
57886 case 'y': // MMX_REGS if MMX allowed.
57887 if (!Subtarget.hasMMX()) break;
57888 return std::make_pair(0U, &X86::VR64RegClass);
57889 case 'v':
57890 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
57891 if (!Subtarget.hasSSE1()) break;
57892 bool VConstraint = (Constraint[0] == 'v');
57893
57894 switch (VT.SimpleTy) {
57895 default: break;
57896 // Scalar SSE types.
57897 case MVT::f16:
57898 if (VConstraint && Subtarget.hasFP16())
57899 return std::make_pair(0U, &X86::FR16XRegClass);
57900 break;
57901 case MVT::f32:
57902 case MVT::i32:
57903 if (VConstraint && Subtarget.hasVLX())
57904 return std::make_pair(0U, &X86::FR32XRegClass);
57905 return std::make_pair(0U, &X86::FR32RegClass);
57906 case MVT::f64:
57907 case MVT::i64:
57908 if (VConstraint && Subtarget.hasVLX())
57909 return std::make_pair(0U, &X86::FR64XRegClass);
57910 return std::make_pair(0U, &X86::FR64RegClass);
57911 case MVT::i128:
57912 if (Subtarget.is64Bit()) {
57913 if (VConstraint && Subtarget.hasVLX())
57914 return std::make_pair(0U, &X86::VR128XRegClass);
57915 return std::make_pair(0U, &X86::VR128RegClass);
57916 }
57917 break;
57918 // Vector types and fp128.
57919 case MVT::v8f16:
57920 if (!Subtarget.hasFP16())
57921 break;
57922 if (VConstraint)
57923 return std::make_pair(0U, &X86::VR128XRegClass);
57924 return std::make_pair(0U, &X86::VR128RegClass);
57925 case MVT::v8bf16:
57926 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
57927 break;
57928 if (VConstraint)
57929 return std::make_pair(0U, &X86::VR128XRegClass);
57930 return std::make_pair(0U, &X86::VR128RegClass);
57931 case MVT::f128:
57932 case MVT::v16i8:
57933 case MVT::v8i16:
57934 case MVT::v4i32:
57935 case MVT::v2i64:
57936 case MVT::v4f32:
57937 case MVT::v2f64:
57938 if (VConstraint && Subtarget.hasVLX())
57939 return std::make_pair(0U, &X86::VR128XRegClass);
57940 return std::make_pair(0U, &X86::VR128RegClass);
57941 // AVX types.
57942 case MVT::v16f16:
57943 if (!Subtarget.hasFP16())
57944 break;
57945 if (VConstraint)
57946 return std::make_pair(0U, &X86::VR256XRegClass);
57947 return std::make_pair(0U, &X86::VR256RegClass);
57948 case MVT::v16bf16:
57949 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
57950 break;
57951 if (VConstraint)
57952 return std::make_pair(0U, &X86::VR256XRegClass);
57953 return std::make_pair(0U, &X86::VR256RegClass);
57954 case MVT::v32i8:
57955 case MVT::v16i16:
57956 case MVT::v8i32:
57957 case MVT::v4i64:
57958 case MVT::v8f32:
57959 case MVT::v4f64:
57960 if (VConstraint && Subtarget.hasVLX())
57961 return std::make_pair(0U, &X86::VR256XRegClass);
57962 if (Subtarget.hasAVX())
57963 return std::make_pair(0U, &X86::VR256RegClass);
57964 break;
57965 case MVT::v32f16:
57966 if (!Subtarget.hasFP16())
57967 break;
57968 if (VConstraint)
57969 return std::make_pair(0U, &X86::VR512RegClass);
57970 return std::make_pair(0U, &X86::VR512_0_15RegClass);
57971 case MVT::v32bf16:
57972 if (!Subtarget.hasBF16())
57973 break;
57974 if (VConstraint)
57975 return std::make_pair(0U, &X86::VR512RegClass);
57976 return std::make_pair(0U, &X86::VR512_0_15RegClass);
57977 case MVT::v64i8:
57978 case MVT::v32i16:
57979 case MVT::v8f64:
57980 case MVT::v16f32:
57981 case MVT::v16i32:
57982 case MVT::v8i64:
57983 if (!Subtarget.hasAVX512()) break;
57984 if (VConstraint)
57985 return std::make_pair(0U, &X86::VR512RegClass);
57986 return std::make_pair(0U, &X86::VR512_0_15RegClass);
57987 }
57988 break;
57989 }
57990 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
57991 switch (Constraint[1]) {
57992 default:
57993 break;
57994 case 'i':
57995 case 't':
57996 case '2':
57997 return getRegForInlineAsmConstraint(TRI, "x", VT);
57998 case 'm':
57999 if (!Subtarget.hasMMX()) break;
58000 return std::make_pair(0U, &X86::VR64RegClass);
58001 case 'z':
58002 if (!Subtarget.hasSSE1()) break;
58003 switch (VT.SimpleTy) {
58004 default: break;
58005 // Scalar SSE types.
58006 case MVT::f16:
58007 if (!Subtarget.hasFP16())
58008 break;
58009 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
58010 case MVT::f32:
58011 case MVT::i32:
58012 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
58013 case MVT::f64:
58014 case MVT::i64:
58015 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
58016 case MVT::v8f16:
58017 if (!Subtarget.hasFP16())
58018 break;
58019 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58020 case MVT::v8bf16:
58021 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58022 break;
58023 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58024 case MVT::f128:
58025 case MVT::v16i8:
58026 case MVT::v8i16:
58027 case MVT::v4i32:
58028 case MVT::v2i64:
58029 case MVT::v4f32:
58030 case MVT::v2f64:
58031 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58032 // AVX types.
58033 case MVT::v16f16:
58034 if (!Subtarget.hasFP16())
58035 break;
58036 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58037 case MVT::v16bf16:
58038 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58039 break;
58040 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58041 case MVT::v32i8:
58042 case MVT::v16i16:
58043 case MVT::v8i32:
58044 case MVT::v4i64:
58045 case MVT::v8f32:
58046 case MVT::v4f64:
58047 if (Subtarget.hasAVX())
58048 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58049 break;
58050 case MVT::v32f16:
58051 if (!Subtarget.hasFP16())
58052 break;
58053 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58054 case MVT::v32bf16:
58055 if (!Subtarget.hasBF16())
58056 break;
58057 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58058 case MVT::v64i8:
58059 case MVT::v32i16:
58060 case MVT::v8f64:
58061 case MVT::v16f32:
58062 case MVT::v16i32:
58063 case MVT::v8i64:
58064 if (Subtarget.hasAVX512())
58065 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58066 break;
58067 }
58068 break;
58069 case 'k':
58070 // This register class doesn't allocate k0 for masked vector operation.
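 // (Background, for clarity: in EVEX encodings a mask-register field of
 // k0 means "no masking", so k0 cannot act as a real write mask; hence
 // the WM register classes below exclude it.)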
58071 if (Subtarget.hasAVX512()) {
58072 if (VT == MVT::v1i1 || VT == MVT::i1)
58073 return std::make_pair(0U, &X86::VK1WMRegClass);
58074 if (VT == MVT::v8i1 || VT == MVT::i8)
58075 return std::make_pair(0U, &X86::VK8WMRegClass);
58076 if (VT == MVT::v16i1 || VT == MVT::i16)
58077 return std::make_pair(0U, &X86::VK16WMRegClass);
58078 }
58079 if (Subtarget.hasBWI()) {
58080 if (VT == MVT::v32i1 || VT == MVT::i32)
58081 return std::make_pair(0U, &X86::VK32WMRegClass);
58082 if (VT == MVT::v64i1 || VT == MVT::i64)
58083 return std::make_pair(0U, &X86::VK64WMRegClass);
58084 }
58085 break;
58086 }
58087 }
58088
58089 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58090 return std::make_pair(0U, &X86::GR32RegClass);
58091
58092 // Use the default implementation in TargetLowering to convert the register
58093 // constraint into a member of a register class.
58094 std::pair<Register, const TargetRegisterClass*> Res;
58095 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
58096
58097 // Not found as a standard register?
58098 if (!Res.second) {
58099 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
58100 // to/from f80.
58101 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
58102 // Map st(0) -> st(7) -> ST0
58103 if (Constraint.size() == 7 && Constraint[0] == '{' &&
58104 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
58105 Constraint[3] == '(' &&
58106 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
58107 Constraint[5] == ')' && Constraint[6] == '}') {
58108 // st(7) is not allocatable and thus not a member of RFP80. Return
58109 // singleton class in cases where we have a reference to it.
58110 if (Constraint[4] == '7')
58111 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
58112 return std::make_pair(X86::FP0 + Constraint[4] - '0',
58113 &X86::RFP80RegClass);
58114 }
58115
58116 // GCC allows "st(0)" to be called just plain "st".
58117 if (StringRef("{st}").equals_insensitive(Constraint))
58118 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
58119 }
58120
58121 // flags -> EFLAGS
58122 if (StringRef("{flags}").equals_insensitive(Constraint))
58123 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
58124
58125 // dirflag -> DF
58126 // Only allow for clobber.
58127 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
58128 VT == MVT::Other)
58129 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
58130
58131 // fpsr -> FPSW
58132 // Only allow for clobber.
58133 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
58134 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
58135
58136 return Res;
58137 }
58138
58139 // Make sure it isn't a register that requires 64-bit mode.
58140 if (!Subtarget.is64Bit() &&
58141 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
58142 TRI->getEncodingValue(Res.first) >= 8) {
58143 // Register requires REX prefix, but we're in 32-bit mode.
58144 return std::make_pair(0, nullptr);
58145 }
58146
58147 // Make sure it isn't a register that requires AVX512.
58148 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
58149 TRI->getEncodingValue(Res.first) & 0x10) {
58150 // Register requires EVEX prefix.
58151 return std::make_pair(0, nullptr);
58152 }
58153
58154 // Otherwise, check to see if this is a register class of the wrong value
58155 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
58156 // turn into {ax},{dx}.
58157 // MVT::Other is used to specify clobber names.
58158 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
58159 return Res; // Correct type already, nothing to do.
58160
58161 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
58162 // return "eax". This should even work for things like getting 64bit integer
58163 // registers when given an f64 type.
58164 const TargetRegisterClass *Class = Res.second;
58165 // The generic code will match the first register class that contains the
58166 // given register. Thus, based on the ordering of the tablegened file,
58167 // the "plain" GR classes might not come first.
58168 // Therefore, use a helper method.
58169 if (isGRClass(*Class)) {
58170 unsigned Size = VT.getSizeInBits();
58171 if (Size == 1) Size = 8;
58172 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
58173 return std::make_pair(0, nullptr);
58174 Register DestReg = getX86SubSuperRegister(Res.first, Size);
58175 if (DestReg.isValid()) {
58176 bool is64Bit = Subtarget.is64Bit();
58177 const TargetRegisterClass *RC =
58178 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
58179 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
58180 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
58181 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
58182 if (Size == 64 && !is64Bit) {
58183 // Model GCC's behavior here and select a fixed pair of 32-bit
58184 // registers.
58185 switch (DestReg) {
58186 case X86::RAX:
58187 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58188 case X86::RDX:
58189 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
58190 case X86::RCX:
58191 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
58192 case X86::RBX:
58193 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
58194 case X86::RSI:
58195 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
58196 case X86::RDI:
58197 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
58198 case X86::RBP:
58199 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
58200 default:
58201 return std::make_pair(0, nullptr);
58202 }
58203 }
58204 if (RC && RC->contains(DestReg))
58205 return std::make_pair(DestReg, RC);
58206 return Res;
58207 }
58208 // No register found/type mismatch.
58209 return std::make_pair(0, nullptr);
58210 } else if (isFRClass(*Class)) {
58211 // Handle references to XMM physical registers that got mapped into the
58212 // wrong class. This can happen with constraints like {xmm0} where the
58213 // target independent register mapper will just pick the first match it can
58214 // find, ignoring the required type.
58215
58216 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
58217 if (VT == MVT::f16)
58218 Res.second = &X86::FR16XRegClass;
58219 else if (VT == MVT::f32 || VT == MVT::i32)
58220 Res.second = &X86::FR32XRegClass;
58221 else if (VT == MVT::f64 || VT == MVT::i64)
58222 Res.second = &X86::FR64XRegClass;
58223 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
58224 Res.second = &X86::VR128XRegClass;
58225 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
58226 Res.second = &X86::VR256XRegClass;
58227 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
58228 Res.second = &X86::VR512RegClass;
58229 else {
58230 // Type mismatch and not a clobber: Return an error;
58231 Res.first = 0;
58232 Res.second = nullptr;
58233 }
58234 } else if (isVKClass(*Class)) {
58235 if (VT == MVT::v1i1 || VT == MVT::i1)
58236 Res.second = &X86::VK1RegClass;
58237 else if (VT == MVT::v8i1 || VT == MVT::i8)
58238 Res.second = &X86::VK8RegClass;
58239 else if (VT == MVT::v16i1 || VT == MVT::i16)
58240 Res.second = &X86::VK16RegClass;
58241 else if (VT == MVT::v32i1 || VT == MVT::i32)
58242 Res.second = &X86::VK32RegClass;
58243 else if (VT == MVT::v64i1 || VT == MVT::i64)
58244 Res.second = &X86::VK64RegClass;
58245 else {
58246 // Type mismatch and not a clobber: Return an error;
58247 Res.first = 0;
58248 Res.second = nullptr;
58249 }
58250 }
58251
58252 return Res;
58253}
58254
58255 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
58256 // Integer division on x86 is expensive. However, when aggressively optimizing
58257 // for code size, we prefer to use a div instruction, as it is usually smaller
58258 // than the alternative sequence.
58259 // The exception to this is vector division. Since x86 doesn't have vector
58260 // integer division, leaving the division as-is is a loss even in terms of
58261 // size, because it will have to be scalarized, while the alternative code
58262 // sequence can be performed in vector form.
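 // Illustrative example: at minsize a scalar 'udiv i32 %x, 10' is kept as a
 // single DIV instruction, whereas a v4i32 divide by constant is still
 // expanded, since scalarized DIVs would be larger than the multiply-based
 // expansion.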
58263 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
58264 return OptSize && !VT.isVector();
58265}
58266
58267void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
58268 if (!Subtarget.is64Bit())
58269 return;
58270
58271 // Update IsSplitCSR in X86MachineFunctionInfo.
58272 X86MachineFunctionInfo *AFI =
58273 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
58274 AFI->setIsSplitCSR(true);
58275}
58276
58277void X86TargetLowering::insertCopiesSplitCSR(
58278 MachineBasicBlock *Entry,
58279 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
58280 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
58281 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
58282 if (!IStart)
58283 return;
58284
58285 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
58286 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
58287 MachineBasicBlock::iterator MBBI = Entry->begin();
58288 for (const MCPhysReg *I = IStart; *I; ++I) {
58289 const TargetRegisterClass *RC = nullptr;
58290 if (X86::GR64RegClass.contains(*I))
58291 RC = &X86::GR64RegClass;
58292 else
58293 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
58294
58295 Register NewVR = MRI->createVirtualRegister(RC);
58296 // Create copy from CSR to a virtual register.
58297 // FIXME: this currently does not emit CFI pseudo-instructions, it works
58298 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
58299 // nounwind. If we want to generalize this later, we may need to emit
58300 // CFI pseudo-instructions.
58301 assert(
58302 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
58303 "Function should be nounwind in insertCopiesSplitCSR!");
58304 Entry->addLiveIn(*I);
58305 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
58306 .addReg(*I);
58307
58308 // Insert the copy-back instructions right before the terminator.
58309 for (auto *Exit : Exits)
58310 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
58311 TII->get(TargetOpcode::COPY), *I)
58312 .addReg(NewVR);
58313 }
58314}
58315
58316 bool X86TargetLowering::supportSwiftError() const {
58317 return Subtarget.is64Bit();
58318}
58319
58320 MachineInstr *
58321 X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
58322 MachineBasicBlock::iterator &MBBI,
58323 const TargetInstrInfo *TII) const {
58324 assert(MBBI->isCall() && MBBI->getCFIType() &&
58325 "Invalid call instruction for a KCFI check");
58326
58327 MachineFunction &MF = *MBB.getParent();
58328 // If the call target is a memory operand, unfold it and use R11 for the
58329 // call, so KCFI_CHECK won't have to recompute the address.
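 // Sketch: a memory-form call such as 'callq *8(%rdi)' becomes roughly
 //   movq 8(%rdi), %r11
 //   KCFI_CHECK %r11
 //   callq *%r11
 // so the checked value and the called value are the same register
 // (illustrative; the real sequence is built below via unfoldMemoryOperand).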
58330 switch (MBBI->getOpcode()) {
58331 case X86::CALL64m:
58332 case X86::CALL64m_NT:
58333 case X86::TAILJMPm64:
58334 case X86::TAILJMPm64_REX: {
58335 MachineBasicBlock::iterator OrigCall = MBBI;
58336 SmallVector<MachineInstr *, 2> NewMIs;
58337 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
58338 /*UnfoldStore=*/false, NewMIs))
58339 report_fatal_error("Failed to unfold memory operand for a KCFI check");
58340 for (auto *NewMI : NewMIs)
58341 MBBI = MBB.insert(OrigCall, NewMI);
58342 assert(MBBI->isCall() &&
58343 "Unexpected instruction after memory operand unfolding");
58344 if (OrigCall->shouldUpdateCallSiteInfo())
58345 MF.moveCallSiteInfo(&*OrigCall, &*MBBI);
58346 MBBI->setCFIType(MF, OrigCall->getCFIType());
58347 OrigCall->eraseFromParent();
58348 break;
58349 }
58350 default:
58351 break;
58352 }
58353
58354 MachineOperand &Target = MBBI->getOperand(0);
58355 Register TargetReg;
58356 switch (MBBI->getOpcode()) {
58357 case X86::CALL64r:
58358 case X86::CALL64r_NT:
58359 case X86::TAILJMPr64:
58360 case X86::TAILJMPr64_REX:
58361 assert(Target.isReg() && "Unexpected target operand for an indirect call");
58362 Target.setIsRenamable(false);
58363 TargetReg = Target.getReg();
58364 break;
58365 case X86::CALL64pcrel32:
58366 case X86::TAILJMPd64:
58367 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
58368 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
58369 // 64-bit indirect thunk calls.
58370 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
58371 "Unexpected register for an indirect thunk call");
58372 TargetReg = X86::R11;
58373 break;
58374 default:
58375 llvm_unreachable("Unexpected CFI call opcode");
58376 break;
58377 }
58378
58379 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
58380 .addReg(TargetReg)
58381 .addImm(MBBI->getCFIType())
58382 .getInstr();
58383}
58384
58385/// Returns true if stack probing through a function call is requested.
58386 bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
58387 return !getStackProbeSymbolName(MF).empty();
58388}
58389
58390/// Returns true if stack probing through inline assembly is requested.
58391 bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
58392
58393 // No inline stack probe for Windows, they have their own mechanism.
58394 if (Subtarget.isOSWindows() ||
58395 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58396 return false;
58397
58398 // If the function specifically requests inline stack probes, emit them.
58399 if (MF.getFunction().hasFnAttribute("probe-stack"))
58400 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
58401 "inline-asm";
58402
58403 return false;
58404}
58405
58406/// Returns the name of the symbol used to emit stack probes or the empty
58407/// string if not applicable.
58408 StringRef
58409 X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
58410 // Inline Stack probes disable stack probe call
58411 if (hasInlineStackProbe(MF))
58412 return "";
58413
58414 // If the function specifically requests stack probes, emit them.
58415 if (MF.getFunction().hasFnAttribute("probe-stack"))
58416 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
58417
58418 // Generally, if we aren't on Windows, the platform ABI does not include
58419 // support for stack probes, so don't emit them.
58420 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
58421 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58422 return "";
58423
58424 // We need a stack probe to conform to the Windows ABI. Choose the right
58425 // symbol.
58426 if (Subtarget.is64Bit())
58427 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
58428 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
58429}
58430
58431unsigned
58432 X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
58433 // The default stack probe size is 4096 if the function has no stackprobesize
58434 // attribute.
58435 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
58436 4096);
58437}
58438
58439 Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
58440 if (ML && ML->isInnermost() &&
58441 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
58442 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
58443 return TargetLowering::getPrefLoopAlignment();
58444}
Live Register Matrix
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes each of which has no other use...
static bool hasNonFlagsUse(SDValue Op)
return true if Op has a use that doesn't just read flags.
static bool isCMOVPseudo(MachineInstr &MI)
static SDValue combineCarryThroughADD(SDValue CCR)
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
#define R2(n)
#define T1
uint64_t High
LLVMContext & Context
This file defines ARC utility functions which are used by various parts of the compiler.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(VerifyEach)
const char LLVMTargetMachineRef TM
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSimple(Instruction *I)
unsigned OpIndex
static StringRef substr(StringRef Str, uint64_t Len)
This file implements the SmallBitVector class.
This file defines the SmallSet class.
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
This file describes how to lower LLVM code to machine code.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, const SDValue &Zext1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) ...
static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG)
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed ...
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
@ ShiftBit
@ UndefBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineBitOpWithPACK(SDNode *N, SelectionDAG &DAG)
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting an PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static bool isX86CCSigned(unsigned X86CC)
Return true if the condition is an signed comparison operation.
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a 128-bit shuffles.
static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue V)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISD::P...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Dispatching routine to lower various 128-bit x86 vector shuffles.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG)
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips sign of FP value.
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LocalDynamic=false)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as a zero or any extension.
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to map a 128-bit or larger integer comparison to vector instructions before type legalization spl...
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
This function detects the AVG pattern between vectors of unsigned i8/i16, which is c = (a + b + 1) / ...
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to get compute inlane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If a vector select has an operand that is -1 or 0, try to simplify the select to a bitwise logic oper...
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG)
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static bool matchAsm(StringRef S, ArrayRef< const char * > Pieces)
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, SDValue And1_L, SDValue And1_R, const SDLoc &DL, SelectionDAG &DAG)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, SDLoc DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, unsigned Reg)
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit to mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is a X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, unsigned Depth)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, bool AllowTruncate)
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static bool isFreeToSplitVector(SDNode *N, SelectionDAG &DAG)
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isLegalConversion(MVT VT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 256-bit x86 vector shuffles.
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static bool clobbersFlagRegisters(const SmallVector< StringRef, 4 > &AsmPieces)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
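For orientation only (a scalar sketch, not taken from the source; vector widths and lane interleaving are omitted), each PACKSS/PACKUS step halves the element width with signed or unsigned saturation, roughly per element:
  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  static int8_t PackSSElt(int16_t X) {                   // PACKSS-style element
    return int8_t(std::clamp<int16_t>(X, -128, 127));
  }
  static uint8_t PackUSElt(int16_t X) {                  // PACKUS-style element
    return uint8_t(std::clamp<int16_t>(X, 0, 255));
  }

  int main() {
    assert(PackSSElt(300) == 127 && PackSSElt(-300) == -128);
    assert(PackUSElt(300) == 255 && PackUSElt(-5) == 0);
    return 0;
  }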
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector and a zero or undef vector.

static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesConstants(ArrayRef< SDValue > Ops, ArrayRef< int > Mask, SDValue Root, bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext) zext(add_nuw(x, C)) --> add(zext(x), C_zext) Promoting a...
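A small numeric check of the stated rewrite (an illustration, not from the source; it assumes an i8 add carrying the nsw guarantee, widened to i32):
  #include <cassert>
  #include <cstdint>

  int main() {
    int8_t X = 100, C = 20;                      // 100 + 20 stays in i8 range, so nsw holds
    int32_t SextOfAdd = int32_t(int8_t(X + C));  // sext(add_nsw(x, C))
    int32_t AddOfSext = int32_t(X) + int32_t(C); // add(sext(x), C_sext)
    assert(SextOfAdd == AddOfSext);
    return 0;
  }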
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
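The same ties-away-from-zero behavior is what C's round() provides, so a quick sanity check of the stated semantics (illustrative only, not part of the generated docs) is:
  #include <cassert>
  #include <cmath>

  int main() {
    assert(std::round(0.5) == 1.0);    // tie rounds away from zero
    assert(std::round(-0.5) == -1.0);
    assert(std::round(2.4) == 2.0);    // otherwise round to nearest
    return 0;
  }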
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
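As a scalar sketch of the idiom being matched (not part of the generated docs; it assumes an i32 -> i8 truncation), the pattern is a clamp to the destination's unsigned maximum followed by a truncate:
  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  static uint8_t TruncUSat8(uint32_t X) {
    return uint8_t(std::min<uint32_t>(X, 255u));  // umin(x, unsigned max of i8), then trunc
  }

  int main() {
    assert(TruncUSat8(42) == 42);
    assert(TruncUSat8(1000) == 255);              // saturates instead of wrapping
    return 0;
  }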
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of a ISD::CondCode to the X86-specific condition code,...
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget)
Try to lower a vector shuffle as a bit shift (shifts in zeros).
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a 256-bit integer VSETCC into two new 128-bit ones and then concatenate the result back.
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, SDValue Root, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a unary integer operation into 2 half-sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
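Illustrative check of the underlying boolean identity (not from the source): XOR-ing a 0/1 setcc result with 1 is the same as evaluating the inverted condition, here SETLT versus SETGE:
  #include <cassert>

  int main() {
    const int A[] = {-3, 0, 7};
    const int B[] = {-1, 0, 7};
    for (int a : A)
      for (int b : B) {
        int SetCC = (a < b) ? 1 : 0;                // setcc SETLT
        assert((SetCC ^ 1) == ((a >= b) ? 1 : 0));  // setcc SETGE, the inverted condition
      }
    return 0;
  }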
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to lower a vector shuffle as a byte rotation.
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, const APInt &Zeroable, ArrayRef< int > Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half-sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true)
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, MachineBasicBlock *BB)
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using the natively supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from a mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG)
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG)
static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG)
Fold "masked merge" expressions like (m & x) | (~m & y) into the equivalent ((x ^ y) & m) ^ y) patter...
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1)
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
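For illustration (not from the source), the fold is just the and-not identity that ANDNP/PANDN implement per lane:
  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t X = 0x00FF00FFu, Y = 0x12345678u;
    uint32_t AndOfXor = (X ^ 0xFFFFFFFFu) & Y;  // and(xor(X, -1), Y)
    uint32_t AndNot   = ~X & Y;                 // the andnp form
    assert(AndOfXor == AndNot);
    return 0;
  }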
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code? Current x86 isa includes the foll...
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
auto IsFreeTruncation
static const unsigned FramePtr
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5196
static APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcasted from an all one value int.
Definition: APFloat.cpp:5221
void clearSign()
Definition: APFloat.h:1159
opStatus next(bool nextDown)
Definition: APFloat.h:1115
void changeSign()
Definition: APFloat.h:1158
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:957
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1385
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:207
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:401
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1620
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1364
uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition: APInt.cpp:489
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1463
APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:906
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition: APInt.h:184
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
APInt abs() const
Get the absolute value.
Definition: APInt.h:1737
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:349
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:358
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:444
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
void setSignBit()
Set the sign bit to 1.
Definition: APInt.h:1318
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1089
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:187
bool isMinValue() const
Determine if this is the smallest unsigned value.
Definition: APInt.h:395
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition: APInt.h:194
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:307
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition: APInt.h:1227
bool eq(const APInt &RHS) const
Equality comparison.
Definition: APInt.h:1057
int32_t exactLogBase2() const
Definition: APInt.h:1725
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1375
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:812
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1589
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:413
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition: APInt.h:1578
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition: APInt.h:1548
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:620
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:197
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1482
void flipAllBits()
Toggle every bit to its opposite value.
Definition: APInt.h:1405
unsigned countl_one() const
Count the number of leading one bits.
Definition: APInt.h:1565
void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition: APInt.cpp:368
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition: APInt.h:1395
unsigned logBase2() const
Definition: APInt.h:1703
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1297
bool getBoolValue() const
Convert APInt to a boolean value.
Definition: APInt.h:449
bool isMask(unsigned numBits) const
Definition: APInt.h:466
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition: APInt.h:383
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:312
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1128
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1345
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition: APInt.h:851
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1235
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:319
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1367
APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:453
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:410
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:367
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:264
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:217
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:836
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:829
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1606
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1199
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition: APInt.h:377
APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition: APInt.cpp:942
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:647
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:696
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:867
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:760
@ Add
*p = old + v
Definition: Instructions.h:764
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ Sub
*p = old - v
Definition: Instructions.h:766
@ And
*p = old & v
Definition: Instructions.h:768
@ Xor
*p = old ^ v
Definition: Instructions.h:774
@ FSub
*p = old - v
Definition: Instructions.h:788
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:800
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:804
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
Value * getPointerOperand()
Definition: Instructions.h:910
BinOp getOperation() const
Definition: Instructions.h:845
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:901
Value * getValOperand()
Definition: Instructions.h:914
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:887
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:349
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:188
The address of a basic block.
Definition: Constants.h:889
A "pseudo-class" with methods for operating on BUILD_VECTORs.
bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
Value * getCalledOperand() const
Definition: InstrTypes.h:1735
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ ICMP_EQ
equal
Definition: InstrTypes.h:1014
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:1105
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1291
static Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
Definition: Constants.cpp:2897
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1588
bool isMachineConstantPoolEntry() const
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
Definition: Constants.cpp:400
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
Definition: Constants.cpp:432
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Tagged union holding either a T or a Error.
Definition: Error.h:474
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
Definition: DerivedTypes.h:126
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:685
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:703
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:715
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:682
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition: Function.h:264
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition: Function.h:855
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1909
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:340
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:677
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
Definition: GlobalValue.h:567
bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition: Globals.cpp:380
ThreadLocalMode getThreadLocalMode() const
Definition: GlobalValue.h:271
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
std::vector< ConstraintInfo > ConstraintInfoVector
Definition: InlineAsm.h:121
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:83
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:149
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:87
Class to represent integer types.
Definition: DerivedTypes.h:40
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:184
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:266
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
bool usesWindowsCFI() const
Definition: MCAsmInfo.h:799
MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
Definition: MCContext.cpp:220
MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
Definition: MCContext.cpp:225
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:40
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
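A hedged sketch of how the MVT queries and factories above are typically combined; the helper name and the widening policy are assumptions for illustration, not code from this file:

// Illustrative only: widen the elements of an integer vector type while
// keeping the total bit width unchanged (e.g. v8i16 -> v4i32).
#include <cassert>
#include "llvm/CodeGen/ValueTypes.h" // also provides MVT

static llvm::MVT widenVectorElementsSketch(llvm::MVT VT) {
  using namespace llvm;
  assert(VT.isVector() && VT.isInteger() && "expected an integer vector type");
  MVT WideElt = MVT::getIntegerVT(unsigned(VT.getScalarSizeInBits()) * 2);
  MVT WideVT = MVT::getVectorVT(WideElt, VT.getVectorNumElements() / 2);
  // Double-width elements, half as many of them: same total register width.
  assert(WideVT.getSizeInBits() == VT.getSizeInBits());
  return WideVT;
}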
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
bool isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
unsigned succ_size() const
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
succ_reverse_iterator succ_rbegin()
void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
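These MachineBasicBlock operations are the building blocks of the block splitting done by custom inserters. A hedged sketch of the usual pattern; the function name and locals are illustrative assumptions:

// Sketch only: split the block after MI into a fall-through successor.
#include <iterator>
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"

static llvm::MachineBasicBlock *splitAfterSketch(llvm::MachineInstr &MI) {
  using namespace llvm;
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();

  // New block mapped to the same IR block, inserted right after MBB.
  MachineBasicBlock *TailMBB =
      MF->CreateMachineBasicBlock(MBB->getBasicBlock());
  MF->insert(std::next(MBB->getIterator()), TailMBB);

  // Move everything after MI into the tail block and fix up the CFG/PHIs.
  TailMBB->splice(TailMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  TailMBB->transferSuccessorsAndUpdatePHIs(MBB);
  MBB->addSuccessor(TailMBB);
  return TailMBB;
}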
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
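A small, hedged example of creating stack objects through the MachineFrameInfo interface above; the sizes, offsets, and helper name are made-up illustration values:

// Sketch only: one aligned spill slot plus one fixed incoming-argument slot.
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Support/Alignment.h"

static void frameObjectsSketch(llvm::MachineFunction &MF) {
  using namespace llvm;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  // Statically sized 16-byte slot, 16-byte aligned; returns an index >= 0.
  int SpillFI = MFI.CreateStackObject(/*Size=*/16, Align(16),
                                      /*isSpillSlot=*/true);
  // Fixed object at SP+8 (e.g. an incoming stack argument); fixed objects get
  // negative indices, which isFixedObjectIndex() recognizes.
  int ArgFI = MFI.CreateFixedObject(/*Size=*/8, /*SPOffset=*/8,
                                    /*IsImmutable=*/true);
  (void)SpillFI;
  (void)ArgFI;
}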
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
MachineModuleInfo & getMMI() const
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
void moveCallSiteInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New call site info.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
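The MachineInstrBuilder add* methods above are normally chained off BuildMI(); a hedged sketch in which the opcode and operands are arbitrary placeholders:

// Sketch only: emit one instruction with a def, a use, and an immediate.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"

static void buildInstrSketch(llvm::MachineBasicBlock &MBB,
                             llvm::MachineBasicBlock::iterator I,
                             const llvm::DebugLoc &DL,
                             const llvm::TargetInstrInfo &TII,
                             unsigned Opcode, llvm::Register Dst,
                             llvm::Register Src) {
  using namespace llvm;
  // Each add* call appends one operand and returns the builder, so the
  // operand list reads left to right: def Dst, use Src, immediate 0.
  BuildMI(MBB, I, DL, TII.get(Opcode), Dst).addReg(Src).addImm(0);
}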
Representation of each machine instruction.
Definition: MachineInstr.h:69
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:568
unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
const Value * getValue() const
Return the base address of the memory access.
const MCContext & getContext() const
const Module * getModule() const
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
bool readMem() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
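A hedged example of querying a memory node's properties with the accessors listed above; the particular predicate (simple and naturally aligned) is an illustration, not this file's policy:

// Sketch only: true for non-volatile, non-atomic accesses whose alignment is
// at least the store size of the in-memory type.
#include "llvm/CodeGen/SelectionDAGNodes.h"

static bool isPlainAlignedAccessSketch(const llvm::MemSDNode *N) {
  llvm::EVT MemVT = N->getMemoryVT();
  return N->isSimple() &&
         N->getAlign() >= MemVT.getStoreSize().getFixedValue();
}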
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition: Module.cpp:333
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any uses of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if the node is an UNDEF value.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
void setFlags(SDNodeFlags NewFlags)
op_iterator op_end() const
op_iterator op_begin() const
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
Get the index which selects a specific result in the SDNode.
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
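A minimal hedged example of pattern matching through SDValue with the accessors above; the shape being matched is arbitrary:

// Sketch only: match (ADD x, (SHL y, c)) where the shift has a single use.
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"

static bool isAddOfOneUseShlSketch(llvm::SDValue V) {
  using namespace llvm;
  if (V.getOpcode() != ISD::ADD)
    return false;
  SDValue RHS = V.getOperand(1);
  // Only interesting when the shift feeds nothing else and can be absorbed.
  return RHS.getOpcode() == ISD::SHL && RHS.hasOneUse();
}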
Help to insert SDNodeFlags automatically in transforming.
Definition: SelectionDAG.h:361
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
Definition: SelectionDAG.h:924
SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:954
SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:474
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
Definition: SelectionDAG.h:448
SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:732
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:659
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
const APInt * getValidShiftAmountConstant(SDValue V, const APInt &DemandedElts) const
If a SHL/SRA/SRL node V has a constant or splat constant shift amount that is less than the element b...
bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:773
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:845
SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops)
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:739
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
Definition: SelectionDAG.h:908
SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
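A hedged sketch of composing the SelectionDAG node factories above; the expression being built, (X << 1) + 1, is an arbitrary example:

// Sketch only: build (X << 1) + 1 in X's value type.
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAG.h"

static llvm::SDValue buildShiftAddSketch(llvm::SelectionDAG &DAG,
                                         const llvm::SDLoc &DL,
                                         llvm::SDValue X) {
  using namespace llvm;
  EVT VT = X.getValueType();
  // getShiftAmountConstant yields a constant in the target's shift-amount
  // type; getConstant/getNode are the generic constant and node factories.
  SDValue ShAmt = DAG.getShiftAmountConstant(1, VT, DL);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, ShAmt);
  return DAG.getNode(ISD::ADD, DL, VT, Shl, DAG.getConstant(1, DL, VT));
}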
static bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
iterator erase(const_iterator CI)
Definition: SmallVector.h:750
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:591
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:563
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:257
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
size_t size_type
Definition: StringRef.h:56
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:269
static constexpr size_t npos
Definition: StringRef.h:52
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition: StringRef.h:170
size_t find_first_not_of(char C, size_t From=0) const
Find the first character in the string that is not C, or return npos if not found.
Definition: StringRef.cpp:251
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
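StringSwitch gives a compact literal-to-value mapping; a hedged example with a made-up mapping:

// Sketch only: map a handful of register-like names to small integers.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

static int classifyNameSketch(llvm::StringRef Name) {
  // Cases are tried in order; Default() supplies the fallback value.
  return llvm::StringSwitch<int>(Name)
      .Case("eax", 0)
      .Case("ebx", 1)
      .Case("ecx", 2)
      .Default(-1);
}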
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
Information about stack frame layout on the target.
virtual bool hasFP(const MachineFunction &MF) const =0
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC)
Set the CallingConv that should be used for the specified libcall.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set in interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
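The configuration setters above (setOperationAction, setLoadExtAction, setTruncStoreAction, and friends) are protected and are invoked from a target's TargetLowering constructor. A hedged, hypothetical subclass showing where such calls live; the specific opcode/type choices are examples only:

// Illustration-only subclass; a real target configures far more than this.
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Target/TargetMachine.h"

namespace {
class SketchTargetLowering : public llvm::TargetLowering {
public:
  explicit SketchTargetLowering(const llvm::TargetMachine &TM)
      : TargetLowering(TM) {
    using namespace llvm;
    // Tell the legalizer how to handle a few operations and memory forms.
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    setTruncStoreAction(MVT::i64, MVT::i32, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i32, Legal);
    setMaxAtomicSizeInBitsSupported(64);
    setPrefLoopAlignment(Align(16));
  }
};
} // end anonymous namespace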
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:662
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:553
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
static IntegerType * getInt1Ty(LLVMContext &C)
Type * getArrayElementType() const
Definition: Type.h:404
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
uint64_t getArrayNumElements() const
bool isX86_MMXTy() const
Return true if this is X86 MMX.
Definition: Type.h:201
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
use_iterator use_begin()
Definition: Value.h:360
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:683
Type * getElementType() const
Definition: DerivedTypes.h:436
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
unsigned getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const
bool hasBasePointer(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getDarwinTLSCallPreservedMask() const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
Definition: X86Subtarget.h:250
bool hasAnyFMA() const
Definition: X86Subtarget.h:213
bool isOSWindows() const
Definition: X86Subtarget.h:336
bool isTargetMachO() const
Definition: X86Subtarget.h:302
bool useIndirectThunkBranches() const
Definition: X86Subtarget.h:235
bool hasSSE1() const
Definition: X86Subtarget.h:200
bool hasThreeDNow() const
Definition: X86Subtarget.h:211
bool isPICStyleGOT() const
Definition: X86Subtarget.h:342
bool hasSSE42() const
Definition: X86Subtarget.h:205
const X86TargetLowering * getTargetLowering() const override
Definition: X86Subtarget.h:125
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
Definition: X86Subtarget.h:290
bool canUseCMOV() const
Definition: X86Subtarget.h:199
bool isPICStyleStubPIC() const
Definition: X86Subtarget.h:345
bool isTargetWindowsMSVC() const
Definition: X86Subtarget.h:314
bool canUseCMPXCHG8B() const
Definition: X86Subtarget.h:192
bool isTargetDarwin() const
Definition: X86Subtarget.h:294
bool isTargetWin64() const
Definition: X86Subtarget.h:338
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
Definition: X86Subtarget.h:185
const Triple & getTargetTriple() const
Definition: X86Subtarget.h:292
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:129
bool useAVX512Regs() const
Definition: X86Subtarget.h:267
bool hasSSE3() const
Definition: X86Subtarget.h:202
bool isCallingConvWin64(CallingConv::ID CC) const
Definition: X86Subtarget.h:351
bool hasAVX512() const
Definition: X86Subtarget.h:208
bool canExtendTo512DQ() const
Definition: X86Subtarget.h:246
bool hasSSE41() const
Definition: X86Subtarget.h:204
bool hasMMX() const
Definition: X86Subtarget.h:210
bool isTargetELF() const
Definition: X86Subtarget.h:300
bool hasSSEPrefetch() const
Definition: X86Subtarget.h:221
bool canUseCMPXCHG16B() const
Definition: X86Subtarget.h:193
unsigned char classifyGlobalReference(const GlobalValue *GV, const Module &M) const
bool hasSSE2() const
Definition: X86Subtarget.h:201
bool hasSSSE3() const
Definition: X86Subtarget.h:203
bool hasInt256() const
Definition: X86Subtarget.h:209
bool isPICStyleRIPRel() const
Definition: X86Subtarget.h:343
bool isTargetCygMing() const
Definition: X86Subtarget.h:334
unsigned char classifyLocalReference(const GlobalValue *GV) const
Classify a global variable reference for the current subtarget according to how we should reference i...
unsigned char classifyBlockAddressReference() const
Classify a blockaddress reference for the current subtarget according to how we should reference it i...
bool isTargetPS() const
Definition: X86Subtarget.h:298
const X86RegisterInfo * getRegisterInfo() const override
Definition: X86Subtarget.h:139
bool hasAVX() const
Definition: X86Subtarget.h:206
bool isTargetWindowsGNU() const
Definition: X86Subtarget.h:326
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:239
bool isTargetWindowsItanium() const
Definition: X86Subtarget.h:330
bool isTargetNaCl64() const
Definition: X86Subtarget.h:310
const X86FrameLowering * getFrameLowering() const override
Definition: X86Subtarget.h:131
bool useBWIRegs() const
Definition: X86Subtarget.h:276
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, const Module &M) const
Classify a global function reference for the current subtarget.
bool hasAVX2() const
Definition: X86Subtarget.h:207
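A hedged sketch of how these X86Subtarget predicates are typically queried to pick a lowering strategy; vectorSupportTier and the 512/256/128 tiers are illustrative, not taken from this file:

#include "X86Subtarget.h"

using namespace llvm;

// Illustrative helper: classify the widest comfortably usable vector width
// using the same predicates listed above.
static unsigned vectorSupportTier(const X86Subtarget &Subtarget) {
  if (Subtarget.hasAVX512() && Subtarget.useAVX512Regs())
    return 512; // 512-bit EVEX registers are usable
  if (Subtarget.hasAVX2())
    return 256; // 256-bit integer ops are available
  if (Subtarget.hasSSE2())
    return 128; // baseline 128-bit XMM support
  return 0;     // scalar / x87 only
}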
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool isVectorShiftByScalarCheap(Type *Ty) const override
This is used to enable splatted operand transforms for vector shifts and vector funnel shifts.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Return true if sinking I's operands to the same basic block as I is profitable, e....
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y -> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set in interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return preferred fold type, Abs if this is a vector, AddAnd if it's an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the targets addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
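A sketch of the dispatch shape behind hooks such as LowerOperation and ReplaceNodeResults, under the assumption that only ISD::SELECT is custom-lowered; dispatchCustomLowering and lowerSelectSketch are hypothetical helpers, and the real switch in this file covers far more opcodes:

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/ErrorHandling.h"

using namespace llvm;

// Hypothetical helper: re-emit SELECT unchanged; a real hook would build
// target-specific nodes here instead.
static SDValue lowerSelectSketch(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  return DAG.getSelect(DL, Op.getValueType(), Op.getOperand(0),
                       Op.getOperand(1), Op.getOperand(2));
}

// Hypothetical dispatcher mirroring the LowerOperation switch pattern.
static SDValue dispatchCustomLowering(SDValue Op, SelectionDAG &DAG) {
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Should not custom lower this!");
  case ISD::SELECT:
    return lowerSelectSketch(Op, DAG);
  }
}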
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
self_iterator getIterator()
Definition: ilist_node.h:109
#define INT64_MIN
Definition: DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2978
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ X86_ThisCall
Similar to X86_StdCall.
Definition: CallingConv.h:122
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition: CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
Definition: CallingConv.h:103
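A minimal sketch, assuming llvm/IR headers, of testing a function's calling convention against the enumerators above; isX86SpecificCC and the particular grouping are illustrative only:

#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"

using namespace llvm;

// Illustrative helper: report whether F uses one of the 32-bit x86-specific
// calling conventions listed above.
static bool isX86SpecificCC(const Function &F) {
  switch (F.getCallingConv()) {
  case CallingConv::X86_StdCall:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_ThisCall:
    return true;
  default:
    return false;
  }
}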
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:751
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:237
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1133
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1129
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:724
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:477
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition: ISDOpcodes.h:147
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition: ISDOpcodes.h:498
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:251
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1276
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:560
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:715
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1162
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1278
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1248
@ STRICT_FCEIL
Definition: ISDOpcodes.h:427
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1279
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition: ISDOpcodes.h:124
@ RESET_FPENV
Set floating-point environment to default state.
Definition: ISDOpcodes.h:1009
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:488
@ FMAXNUM_IEEE
Definition: ISDOpcodes.h:986
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:240
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1038
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:784
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:484
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition: ISDOpcodes.h:151
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1261
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:437
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:791
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:544
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:391
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:689
@ MEMBARRIER
MEMBARRIER - Compiler barrier only; generate a no-op.
Definition: ISDOpcodes.h:1235
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1240
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition: ISDOpcodes.h:821
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:256
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:478
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:914
@ STRICT_FLOG2
Definition: ISDOpcodes.h:422
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1274
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:904
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:230
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1275
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1206
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:940
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:412
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1407
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ EH_LABEL
EH_LABEL - Represents a label in mid basic block used to track locations needed for debug and excepti...
Definition: ISDOpcodes.h:1109
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition: ISDOpcodes.h:135
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:886
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:775
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:451
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:621
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:101
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1054
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:723
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1228
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:995
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:759
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:931
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1084
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:328
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1277
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1063
@ GC_TRANSITION_START
GC_TRANSITION_START/GC_TRANSITION_END - These operators mark the beginning and end of GC transition s...
Definition: ISDOpcodes.h:1320
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:350
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:728
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1244
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:212
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:628
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1158
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:209
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:324
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:431
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:881
@ STRICT_FP_TO_FP16
Definition: ISDOpcodes.h:917
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:652
@ STRICT_FP16_TO_FP
Definition: ISDOpcodes.h:916
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:706
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:601
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1272
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:574
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:985
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:436
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:425
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:536
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:781
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1218
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:857
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:426
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:743
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1280
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition: ISDOpcodes.h:114
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:972
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1222
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:332
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1048
@ ConstantPool
Definition: ISDOpcodes.h:82
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition: ISDOpcodes.h:810
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:799
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:675
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:889
@ STRICT_FROUND
Definition: ISDOpcodes.h:429
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:737
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:304
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:450
@ STRICT_BF16_TO_FP
Definition: ISDOpcodes.h:925
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:428
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:430
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:923
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:94
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1270
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:444
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:466
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:443
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:991
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1271
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:837
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1189
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:158
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:471
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:681
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1215
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ GET_FPENV_MEM
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1014
@ STRICT_FP_TO_BF16
Definition: ISDOpcodes.h:926
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:401
@ STRICT_FLOG10
Definition: ISDOpcodes.h:421
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:525
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_FEXP2
Definition: ISDOpcodes.h:419
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1269
@ ExternalSymbol
Definition: ISDOpcodes.h:83
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:870
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:415
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:832
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:908
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:424
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:856
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition: ISDOpcodes.h:141
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:787
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1153
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1077
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:764
@ GC_TRANSITION_END
Definition: ISDOpcodes.h:1321
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:494
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:341
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:423
@ SET_FPENV_MEM
Sets the current floating point environment.
Definition: ISDOpcodes.h:1019
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1212
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:314
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:516
bool isExtVecInRegOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1606
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false)
Hook for matching ConstantSDNode predicate.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1601
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
Definition: ISDOpcodes.h:1422
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
Definition: ISDOpcodes.h:1588
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1563
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1530
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1510
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
Definition: ISDOpcodes.h:1569
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
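A hedged sketch combining a few of the ISD helpers above inside a SelectionDAG context; invertSetCC is a hypothetical helper and assumes SetCC really is an ISD::SETCC node:

#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

// Hypothetical helper: rebuild a SETCC node with the inverted condition,
// i.e. produce !(LHS cc RHS) as (LHS !cc RHS).
static SDValue invertSetCC(SelectionDAG &DAG, const SDLoc &DL, SDValue SetCC) {
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  ISD::CondCode InvCC =
      ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
  return DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0),
                      SetCC.getOperand(1), InvCC);
}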
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1471
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition: PatternMatch.h:518
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
Definition: PatternMatch.h:658
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:966
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:869
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:593
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate, true > m_c_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
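A minimal sketch using the PatternMatch helpers above to recognise an IR test of the form (X & SignMask) == 0; isSignBitClearTest is a hypothetical helper:

#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Hypothetical helper: match "icmp eq (and X, SignMask), 0" with the And
// operands accepted in either order; binds X on success.
static bool isSignBitClearTest(Value *V, Value *&X) {
  ICmpInst::Predicate Pred;
  return match(V, m_ICmp(Pred, m_c_And(m_Value(X), m_SignMask()),
                         m_ZeroInt())) &&
         Pred == ICmpInst::ICMP_EQ;
}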
Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
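A short sketch of how the RTLIB getters above are used to pick a runtime library call when no native conversion exists; fpToSIntLibcall is a hypothetical helper and the header path is an assumption that may differ between LLVM versions:

#include "llvm/CodeGen/RuntimeLibcalls.h" // header path is an assumption
#include "llvm/CodeGen/ValueTypes.h"
#include <cassert>

using namespace llvm;

// Hypothetical helper: map an FP-to-signed-int conversion to its libcall.
static RTLIB::Libcall fpToSIntLibcall(EVT SrcVT, EVT DstVT) {
  RTLIB::Libcall LC = RTLIB::getFPTOSINT(SrcVT, DstVT);
  assert(LC != RTLIB::UNKNOWN_LIBCALL && "unsupported fp->int conversion");
  return LC;
}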
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition: LLVMContext.h:54
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
@ GeneralDynamic
Definition: CodeGen.h:46
@ X86
Windows x64, Windows Itanium (IA-64)
@ PTR32_UPTR
Definition: X86.h:209
@ FS
Definition: X86.h:206
@ PTR64
Definition: X86.h:210
@ PTR32_SPTR
Definition: X86.h:208
@ GS
Definition: X86.h:205
Reg
All possible values of the reg field in the ModR/M byte.
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:425
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
Definition: X86BaseInfo.h:405
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
Definition: X86BaseInfo.h:502
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
Definition: X86BaseInfo.h:464
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
Definition: X86BaseInfo.h:446
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
Definition: X86BaseInfo.h:470
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
Definition: X86BaseInfo.h:452
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
Definition: X86BaseInfo.h:490
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:417
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
Definition: X86BaseInfo.h:377
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
Definition: X86BaseInfo.h:486
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
Definition: X86BaseInfo.h:474
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
Definition: X86BaseInfo.h:439
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
Definition: X86BaseInfo.h:494
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:458
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:433
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
Definition: X86BaseInfo.h:401
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FCMP
X86 strict FP compare instructions.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and an FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of a MMX vector and zero out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
@ TO_NEAREST_INT
Definition: X86BaseInfo.h:42
@ CUR_DIRECTION
Definition: X86BaseInfo.h:46
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
@ AddrNumOperands
Definition: X86BaseInfo.h:36
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
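A trivial hedged sketch of one of the X86 helpers above; invertCond is a hypothetical wrapper around X86::GetOppositeBranchCondition:

#include "X86InstrInfo.h"

using namespace llvm;

// Hypothetical wrapper: invert an X86 condition code, e.g. when the arms of
// a conditional branch are swapped (COND_E <-> COND_NE, COND_L <-> COND_GE).
static X86::CondCode invertCond(X86::CondCode CC) {
  return X86::GetOppositeBranchCondition(CC);
}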
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
std::optional< const char * > toString(const std::optional< DWARFFormValue > &V)
Take an optional DWARFFormValue and try to extract a string value from it.
constexpr double e
Definition: MathExtras.h:31
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
Definition: X86InstrInfo.h:109
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:456
@ Length
Definition: DWP.cpp:456
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
Definition: X86InstrInfo.h:127
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition: Utils.cpp:1527
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
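A brief sketch of enumerate on a shuffle mask (the Mask contents are an assumed example; structured bindings over the (index, value) pairs are supported by this helper):
  SmallVector<int, 4> Mask = {3, -1, 1, 0};
  for (auto [Idx, M] : llvm::enumerate(Mask))
    if (M < 0)
      Mask[Idx] = 0;   // e.g. canonicalize undef (-1) lanes to select lane 0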
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:343
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
AddressSpace
Definition: NVPTXBaseInfo.h:21
@ SjLj
setjmp/longjmp based exceptions
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, unsigned Reg)
Replace the address used in the instruction with the direct memory reference.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
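A worked example of the blend decode above (a sketch; the expected output assumes the usual x86 convention that mask indices >= NumElts refer to the second operand):
  SmallVector<int, 4> Mask;
  DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b0101, Mask);
  // Bits 0 and 2 of the immediate are set, so those lanes come from operand 2:
  // Mask == {4, 1, 6, 3}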
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle of packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
@ SM_SentinelUndef
@ SM_SentinelZero
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1509
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1768
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:330
bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
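Several of the bit-math helpers in this index compose naturally; a small sketch with the results noted in comments (declarations come from llvm/Support/MathExtras.h and llvm/ADT/bit.h):
  uint64_t Up = llvm::PowerOf2Ceil(33);    // 64: next power of two >= 33
  bool Pow2   = llvm::isPowerOf2_64(Up);   // true
  unsigned Lg = llvm::Log2_64(Up);         // 6
  int TZ      = llvm::countr_zero(Up);     // 6: equals Log2_64 for a power of two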
unsigned M1(unsigned Val)
Definition: VE.h:376
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition: bit.h:281
bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, unsigned Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
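For instance, a sketch of the unpack-mask builder above for v4i32 (the expected contents assume the standard unpacklo interleaving of the two operands' low halves):
  SmallVector<int, 8> Mask;
  createUnpackShuffleMask(MVT::v4i32, Mask, /*Lo=*/true, /*Unary=*/false);
  // Mask == {0, 4, 1, 5}: lanes alternate between the low halves of operands 1 and 2.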
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
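A one-line worked example of the 64-bit splitting helpers above (Hi_32/Lo_32, from llvm/Support/MathExtras.h):
  uint64_t V  = 0x1122334455667788ULL;
  uint32_t Hi = llvm::Hi_32(V);   // 0x11223344
  uint32_t Lo = llvm::Lo_32(V);   // 0x55667788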
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition: DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1954
void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
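A sketch of mask narrowing for Scale == 2: each wide index expands to two consecutive narrow indices, and negative sentinel entries are replicated unchanged:
  SmallVector<int, 8> Scaled;
  narrowShuffleMaskElts(/*Scale=*/2, /*Mask=*/{0, 3}, Scaled);
  // Scaled == {0, 1, 6, 7}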
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Add
Sum of integers.
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
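A short worked example of commonAlignment (a sketch; per the documented semantics, the result is the alignment still guaranteed after adding Offset to a pointer aligned to A):
  Align A = llvm::commonAlignment(Align(16), /*Offset=*/4);   // Align(4)
  Align B = llvm::commonAlignment(Align(16), /*Offset=*/32);  // Align(16)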
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2039
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ TRUNCATE_TO_REG
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ FIXUPIMM_MASKZ
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition: STLExtras.h:1607
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition: MathExtras.h:203
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction – th...
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
#define EQ(a, b)
Definition: regexec.c:112
This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and the bit indexes (Mask) nee...
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:230
static constexpr roundingMode rmTowardZero
Definition: APFloat.h:234
static const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition: APFloat.cpp:263
static const fltSemantics & IEEEquad() LLVM_READNONE
Definition: APFloat.cpp:251
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:250
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
static const fltSemantics & BFloat() LLVM_READNONE
Definition: APFloat.cpp:248
opStatus
IEEE-754R 7: Default exception handling.
Definition: APFloat.h:246
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:628
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition: ValueTypes.h:213
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
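A sketch tying a few of the EVT queries above together (Ctx is an assumed LLVMContext; results noted in comments):
  EVT VT = EVT::getVectorVT(Ctx, MVT::f32, 8);        // v8f32
  bool Is256 = VT.is256BitVector();                   // true: 8 x 32 bits
  EVT IntVT  = VT.changeVectorElementTypeToInteger(); // v8i32
  EVT HalfVT = VT.getHalfNumVectorElementsVT(Ctx);    // v4f32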
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:297
static std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
Definition: KnownBits.cpp:494
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition: KnownBits.h:182
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:77
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:238
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:63
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
bool hasConflict() const
Returns true if there is conflicting information.
Definition: KnownBits.h:47
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition: KnownBits.h:285
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition: KnownBits.h:89
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:168
bool isConstant() const
Returns true if we know the value of all bits.
Definition: KnownBits.h:50
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:71
static KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
Definition: KnownBits.cpp:234
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition: KnownBits.h:221
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:292
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:307
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition: KnownBits.h:192
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:244
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:141
static KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition: KnownBits.cpp:57
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:101
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition: KnownBits.h:95
static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:777
static std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
Definition: KnownBits.cpp:532
bool isAllOnes() const
Returns true if value is all one bits.
Definition: KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition: KnownBits.h:57
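A brief sketch combining several KnownBits operations listed above (values in comments follow the documented semantics for an 8-bit constant 0x0F):
  KnownBits K = KnownBits::makeConstant(APInt(8, 0x0F));
  bool C      = K.isConstant();               // true: every bit is known
  unsigned LZ = K.countMinLeadingZeros();     // 4
  unsigned TZ = K.countMinTrailingZeros();    // 0
  KnownBits Z = K.zext(16);                   // still the constant 0x0F, now 16 bits wide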
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
MVT ConstraintVT
The ValueType for the operand value.
std::string ConstraintCode
This contains the actual string for the code, like "m".
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
X86AddressMode - This struct holds a generalized full x86 address mode.