X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Intrinsics.h"
54#include "llvm/MC/MCAsmInfo.h"
55#include "llvm/MC/MCContext.h"
56#include "llvm/MC/MCExpr.h"
57#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/Debug.h"
64#include <algorithm>
65#include <bitset>
66#include <cctype>
67#include <numeric>
68using namespace llvm;
69
70#define DEBUG_TYPE "x86-isel"
71
73 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
75 "Sets the preferable loop alignment for experiments (as log2 bytes) "
76 "for innermost loops only. If specified, this option overrides "
77 "alignment set by x86-experimental-pref-loop-alignment."),
79
81 "x86-br-merging-base-cost", cl::init(2),
83 "Sets the cost threshold for when multiple conditionals will be merged "
84 "into one branch versus be split in multiple branches. Merging "
85 "conditionals saves branches at the cost of additional instructions. "
86 "This value sets the instruction cost limit, below which conditionals "
87 "will be merged, and above which conditionals will be split. Set to -1 "
88 "to never merge branches."),
90
92 "x86-br-merging-likely-bias", cl::init(0),
93 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
94 "that all conditionals will be executed. For example for merging "
95 "the conditionals (a == b && c > d), if its known that a == b is "
96 "likely, then it is likely that if the conditionals are split "
97 "both sides will be executed, so it may be desirable to increase "
98 "the instruction cost threshold. Set to -1 to never merge likely "
99 "branches."),
100 cl::Hidden);
101
103 "x86-br-merging-unlikely-bias", cl::init(-1),
104 cl::desc(
105 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "unlikely, then it is unlikely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to decrease "
110 "the instruction cost threshold. Set to -1 to never merge unlikely "
111 "branches."),
112 cl::Hidden);
113
115 "mul-constant-optimization", cl::init(true),
116 cl::desc("Replace 'mul x, Const' with more effective instructions like "
117 "SHIFT, LEA, etc."),
118 cl::Hidden);
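// The experimental knobs above are ordinary cl::opt flags, so (assuming a
// tools build that links this backend) they can be exercised directly from
// llc for experiments; a sketch, with a placeholder input file:
//
//   llc -mtriple=x86_64-unknown-linux-gnu \
//       -x86-experimental-pref-innermost-loop-alignment=5 \
//       -x86-br-merging-base-cost=3 test.ll -o test.s
//
// The spellings follow the option strings registered above.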
119
121 const X86Subtarget &STI)
122 : TargetLowering(TM), Subtarget(STI) {
123 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
124 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
125
126 // Set up the TargetLowering object.
127
128 // X86 is weird. It always uses i8 for shift amounts and setcc results.
130 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
132
133 // For 64-bit, since we have so many registers, use the ILP scheduler.
134 // For 32-bit, use the register pressure specific scheduling.
135 // For Atom, always use ILP scheduling.
136 if (Subtarget.isAtom())
138 else if (Subtarget.is64Bit())
140 else
142 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
144
145 // Bypass expensive divides and use cheaper ones.
146 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
147 if (Subtarget.hasSlowDivide32())
148 addBypassSlowDiv(32, 8);
149 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
150 addBypassSlowDiv(64, 32);
151 }
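// A rough sketch (not code from this file) of what addBypassSlowDiv(32, 8)
// requests: the generic slow-division bypass guards a 32-bit divide with a
// runtime width check and uses the cheap 8-bit divider when both operands
// fit, approximately:
//
//   if (((x | y) & 0xffffff00) == 0)   // both operands fit in 8 bits
//     q = (uint8_t)x / (uint8_t)y;     // cheap 8-bit divide
//   else
//     q = x / y;                       // full 32-bit divide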
152
153 // Setup Windows compiler runtime calls.
154 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
155 static const struct {
156 const RTLIB::Libcall Op;
157 const char * const Name;
158 const CallingConv::ID CC;
159 } LibraryCalls[] = {
160 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
161 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
162 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
163 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
164 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
165 };
166
167 for (const auto &LC : LibraryCalls) {
168 setLibcallName(LC.Op, LC.Name);
169 setLibcallCallingConv(LC.Op, LC.CC);
170 }
171 }
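// Net effect of the table above (sketch): on these Windows targets a plain
// 64-bit division such as
//
//   %q = sdiv i64 %a, %b
//
// is lowered to a call to _alldiv using the __stdcall convention, instead of
// the default libgcc-style __divdi3 helper.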
172
173 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
174 // MSVCRT doesn't have powi; fall back to pow
175 setLibcallName(RTLIB::POWI_F32, nullptr);
176 setLibcallName(RTLIB::POWI_F64, nullptr);
177 }
178
179 if (Subtarget.canUseCMPXCHG16B())
181 else if (Subtarget.canUseCMPXCHG8B())
183 else
185
186 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
187
189
190 // Set up the register classes.
191 addRegisterClass(MVT::i8, &X86::GR8RegClass);
192 addRegisterClass(MVT::i16, &X86::GR16RegClass);
193 addRegisterClass(MVT::i32, &X86::GR32RegClass);
194 if (Subtarget.is64Bit())
195 addRegisterClass(MVT::i64, &X86::GR64RegClass);
196
197 for (MVT VT : MVT::integer_valuetypes())
199
200 // We don't accept any truncstore of integer registers.
201 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
202 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
203 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
204 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
205 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
206 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
207
208 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
209
210 // SETOEQ and SETUNE require checking two conditions.
211 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
214 }
215
216 // Integer absolute.
217 if (Subtarget.canUseCMOV()) {
218 setOperationAction(ISD::ABS , MVT::i16 , Custom);
219 setOperationAction(ISD::ABS , MVT::i32 , Custom);
220 if (Subtarget.is64Bit())
221 setOperationAction(ISD::ABS , MVT::i64 , Custom);
222 }
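// With CMOV available, scalar integer ABS can be lowered branchlessly;
// roughly (illustration only):
//
//   neg = 0 - x;                  // NEG, sets flags
//   result = (x < 0) ? neg : x;   // CMOV on the sign flag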
223
224 // Absolute difference.
225 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
226 setOperationAction(Op , MVT::i8 , Custom);
227 setOperationAction(Op , MVT::i16 , Custom);
228 setOperationAction(Op , MVT::i32 , Custom);
229 if (Subtarget.is64Bit())
230 setOperationAction(Op , MVT::i64 , Custom);
231 }
232
233 // Signed saturation subtraction.
237 if (Subtarget.is64Bit())
239
240 // Funnel shifts.
241 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
242 // For slow shld targets we only lower for code size.
243 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
244
245 setOperationAction(ShiftOp , MVT::i8 , Custom);
246 setOperationAction(ShiftOp , MVT::i16 , Custom);
247 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
248 if (Subtarget.is64Bit())
249 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
250 }
251
252 if (!Subtarget.useSoftFloat()) {
253 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
254 // operation.
259 // We have an algorithm for SSE2, and we turn this into a 64-bit
260 // FILD or VCVTUSI2SS/SD for other targets.
263 // We have an algorithm for SSE2->double, and we turn this into a
264 // 64-bit FILD followed by conditional FADD for other targets.
267
268 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
269 // this operation.
272 // SSE has no i16 to fp conversion, only i32. We promote in the handler
273 // to allow f80 to use i16 and f64 to use i16 with sse1 only
276 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
279 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
280 // are Legal, f80 is custom lowered.
283
284 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
285 // this operation.
287 // FIXME: This doesn't generate invalid exception when it should. PR44019.
293 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
294 // are Legal, f80 is custom lowered.
297
298 // Handle FP_TO_UINT by promoting the destination to a larger signed
299 // conversion.
301 // FIXME: This doesn't generate invalid exception when it should. PR44019.
304 // FIXME: This doesn't generate invalid exception when it should. PR44019.
310
315
316 if (!Subtarget.is64Bit()) {
319 }
320 }
321
322 if (Subtarget.hasSSE2()) {
323 // Custom lowering for saturating float to int conversions.
324 // We handle promotion to larger result types manually.
325 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
328 }
329 if (Subtarget.is64Bit()) {
332 }
333 }
334
335 // Handle address space casts between mixed sized pointers.
338
339 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
340 if (!Subtarget.hasSSE2()) {
343 if (Subtarget.is64Bit()) {
345 // Without SSE, i64->f64 goes through memory.
347 }
348 } else if (!Subtarget.is64Bit())
350
351 // Scalar integer divide and remainder are lowered to use operations that
352 // produce two results, to match the available instructions. This exposes
353 // the two-result form to trivial CSE, which is able to combine x/y and x%y
354 // into a single instruction.
355 //
356 // Scalar integer multiply-high is also lowered to use two-result
357 // operations, to match the available instructions. However, plain multiply
358 // (low) operations are left as Legal, as there are single-result
359 // instructions for this in x86. Using the two-result multiply instructions
360 // when both high and low results are needed must be arranged by dagcombine.
361 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
368 }
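// Sketch of the intent: once the two-result forms are exposed, trivial CSE
// lets a source pair like
//
//   q = x / y;
//   r = x % y;
//
// share a single hardware divide (x86 DIV/IDIV produces the quotient and the
// remainder together, e.g. in EAX/EDX for 32-bit operands) instead of
// emitting two divide instructions.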
369
370 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
372 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
373 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
376 }
377 if (Subtarget.is64Bit())
382
383 setOperationAction(ISD::FREM , MVT::f32 , Expand);
384 setOperationAction(ISD::FREM , MVT::f64 , Expand);
385 setOperationAction(ISD::FREM , MVT::f80 , Expand);
386 setOperationAction(ISD::FREM , MVT::f128 , Expand);
387
388 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
394 }
395
396 // Promote the i8 variants and force them on up to i32 which has a shorter
397 // encoding.
398 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
400 // Promote i16 too: tzcntw has a false dependency on Intel CPUs. For BSF, we
401 // emit a REP prefix to encode it as TZCNT on modern CPUs, so it makes sense
402 // to promote that as well.
403 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
405
406 if (!Subtarget.hasBMI()) {
407 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
409 if (Subtarget.is64Bit()) {
410 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
412 }
413 }
414
415 if (Subtarget.hasLZCNT()) {
416 // When promoting the i8 variants, force them to i32 for a shorter
417 // encoding.
418 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
420 } else {
421 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
422 if (VT == MVT::i64 && !Subtarget.is64Bit())
423 continue;
426 }
427 }
428
431 // Special handling for half-precision floating point conversions.
432 // If we don't have F16C support, then lower half float conversions
433 // into library calls.
435 Op, MVT::f32,
436 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
437 // There's never any support for operations beyond MVT::f32.
438 setOperationAction(Op, MVT::f64, Expand);
439 setOperationAction(Op, MVT::f80, Expand);
440 setOperationAction(Op, MVT::f128, Expand);
441 }
442
443 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
446 }
447
448 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
449 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
450 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
451 setTruncStoreAction(VT, MVT::f16, Expand);
452 setTruncStoreAction(VT, MVT::bf16, Expand);
453
456 }
457
461 if (Subtarget.is64Bit())
463 if (Subtarget.hasPOPCNT()) {
464 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
465 // popcntw is longer to encode than popcntl and also has a false dependency
466 // on the dest that popcntl hasn't had since Cannon Lake.
467 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
468 } else {
473 }
474
476
477 if (!Subtarget.hasMOVBE())
479
480 // X86 wants to expand cmov itself.
481 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
486 }
487 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
488 if (VT == MVT::i64 && !Subtarget.is64Bit())
489 continue;
492 }
493
494 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
497
499 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
500 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
504 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
505 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
506
507 // Darwin ABI issue.
508 for (auto VT : { MVT::i32, MVT::i64 }) {
509 if (VT == MVT::i64 && !Subtarget.is64Bit())
510 continue;
517 }
518
519 // 64-bit shl, sra, srl (iff 32-bit x86)
520 for (auto VT : { MVT::i32, MVT::i64 }) {
521 if (VT == MVT::i64 && !Subtarget.is64Bit())
522 continue;
526 }
527
528 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
530
532
533 // Expand certain atomics
534 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
542 }
543
544 if (!Subtarget.is64Bit())
546
547 if (Subtarget.canUseCMPXCHG16B())
549
550 // FIXME - use subtarget debug flags
551 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
552 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
553 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
555 }
556
559
562
563 setOperationAction(ISD::TRAP, MVT::Other, Legal);
565 if (Subtarget.isTargetPS())
567 else
569
570 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
572 setOperationAction(ISD::VAEND , MVT::Other, Expand);
573 bool Is64Bit = Subtarget.is64Bit();
574 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
575 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
576
579
581
582 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
585
587
588 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
589 setOperationAction(ISD::FABS, VT, Action);
590 setOperationAction(ISD::FNEG, VT, Action);
592 setOperationAction(ISD::FREM, VT, Action);
593 setOperationAction(ISD::FMA, VT, Action);
594 setOperationAction(ISD::FMINNUM, VT, Action);
595 setOperationAction(ISD::FMAXNUM, VT, Action);
598 setOperationAction(ISD::FSIN, VT, Action);
599 setOperationAction(ISD::FCOS, VT, Action);
600 setOperationAction(ISD::FSINCOS, VT, Action);
601 setOperationAction(ISD::FSQRT, VT, Action);
602 setOperationAction(ISD::FPOW, VT, Action);
603 setOperationAction(ISD::FLOG, VT, Action);
604 setOperationAction(ISD::FLOG2, VT, Action);
605 setOperationAction(ISD::FLOG10, VT, Action);
606 setOperationAction(ISD::FEXP, VT, Action);
607 setOperationAction(ISD::FEXP2, VT, Action);
608 setOperationAction(ISD::FEXP10, VT, Action);
609 setOperationAction(ISD::FCEIL, VT, Action);
610 setOperationAction(ISD::FFLOOR, VT, Action);
612 setOperationAction(ISD::FRINT, VT, Action);
613 setOperationAction(ISD::BR_CC, VT, Action);
614 setOperationAction(ISD::SETCC, VT, Action);
617 setOperationAction(ISD::FROUND, VT, Action);
619 setOperationAction(ISD::FTRUNC, VT, Action);
620 setOperationAction(ISD::FLDEXP, VT, Action);
621 };
622
623 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
624 // f16, f32 and f64 use SSE.
625 // Set up the FP register classes.
626 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
627 : &X86::FR16RegClass);
628 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
629 : &X86::FR32RegClass);
630 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
631 : &X86::FR64RegClass);
632
633 // Disable f32->f64 extload as we can only generate this in one instruction
634 // under optsize. So it's easier to pattern match (fpext (load)) for that
635 // case instead of needing to emit 2 instructions for extload in the
636 // non-optsize case.
637 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
638
639 for (auto VT : { MVT::f32, MVT::f64 }) {
640 // Use ANDPD to simulate FABS.
642
643 // Use XORP to simulate FNEG.
645
646 // Use ANDPD and ORPD to simulate FCOPYSIGN.
648
649 // These might be better off as horizontal vector ops.
652
653 // We don't support sin/cos/fmod
657 }
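// The "simulate FABS/FNEG" comments above boil down to sign-bit masking on
// the SSE register; conceptually, for f32 (illustration only):
//
//   fabs(x) == bit_cast<float>(bit_cast<uint32_t>(x) & 0x7fffffffu);  // ANDPS
//   fneg(x) == bit_cast<float>(bit_cast<uint32_t>(x) ^ 0x80000000u);  // XORPS
//
// FCOPYSIGN similarly combines the magnitude bits of one operand with the
// sign bit of the other using AND/ANDN/OR.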
658
659 // Half type will be promoted by default.
660 setF16Action(MVT::f16, Promote);
668
698
699 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
700 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
701
702 // Lower this to MOVMSK plus an AND.
705
706 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
707 (UseX87 || Is64Bit)) {
708 // Use SSE for f32, x87 for f64.
709 // Set up the FP register classes.
710 addRegisterClass(MVT::f32, &X86::FR32RegClass);
711 if (UseX87)
712 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
713
714 // Use ANDPS to simulate FABS.
716
717 // Use XORP to simulate FNEG.
719
720 if (UseX87)
722
723 // Use ANDPS and ORPS to simulate FCOPYSIGN.
724 if (UseX87)
727
728 // We don't support sin/cos/fmod
732
733 if (UseX87) {
734 // Always expand sin/cos functions even though x87 has an instruction.
738 }
739 } else if (UseX87) {
740 // f32 and f64 in x87.
741 // Set up the FP register classes.
742 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
743 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
744
745 for (auto VT : { MVT::f32, MVT::f64 }) {
748
749 // Always expand sin/cos functions even though x87 has an instruction.
753 }
754 }
755
756 // Expand FP32 immediates into loads from the stack, save special cases.
757 if (isTypeLegal(MVT::f32)) {
758 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
759 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
760 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
761 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
762 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
763 } else // SSE immediates.
764 addLegalFPImmediate(APFloat(+0.0f)); // xorps
765 }
766 // Expand FP64 immediates into loads from the stack, save special cases.
767 if (isTypeLegal(MVT::f64)) {
768 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
769 addLegalFPImmediate(APFloat(+0.0)); // FLD0
770 addLegalFPImmediate(APFloat(+1.0)); // FLD1
771 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
772 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
773 } else // SSE immediates.
774 addLegalFPImmediate(APFloat(+0.0)); // xorpd
775 }
776 // Support fp16 0 immediate.
777 if (isTypeLegal(MVT::f16))
778 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
779
780 // Handle constrained floating-point operations of scalar.
793
794 // We don't support FMA.
797
798 // f80 always uses X87.
799 if (UseX87) {
800 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
803 {
805 addLegalFPImmediate(TmpFlt); // FLD0
806 TmpFlt.changeSign();
807 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
808
809 bool ignored;
810 APFloat TmpFlt2(+1.0);
812 &ignored);
813 addLegalFPImmediate(TmpFlt2); // FLD1
814 TmpFlt2.changeSign();
815 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
816 }
817
818 // Always expand sin/cos functions even though x87 has an instruction.
822
834
835 // Handle constrained floating-point operations of scalar.
841 if (isTypeLegal(MVT::f16)) {
844 } else {
846 }
847 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
848 // as Custom.
850 }
851
852 // f128 uses xmm registers, but most operations require libcalls.
853 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
854 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
855 : &X86::VR128RegClass);
856
857 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
858
869
873
879 // No STRICT_FSINCOS
882
885 // We need to custom handle any FP_ROUND with an f128 input, but
886 // LegalizeDAG uses the result type to know when to run a custom handler.
887 // So we have to list all legal floating point result types here.
888 if (isTypeLegal(MVT::f32)) {
891 }
892 if (isTypeLegal(MVT::f64)) {
895 }
896 if (isTypeLegal(MVT::f80)) {
899 }
900
902
903 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
904 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
905 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
906 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
907 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
908 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
909 }
910
911 // Always use a library call for pow.
912 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
913 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
914 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
915 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
916
925
926 // Some FP actions are always expanded for vector types.
927 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
928 MVT::v4f32, MVT::v8f32, MVT::v16f32,
929 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
942 }
943
944 // First set operation action for all vector types to either promote
945 // (for widening) or expand (for scalarization). Then we will selectively
946 // turn on ones that can be effectively codegen'd.
986 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
987 setTruncStoreAction(InnerVT, VT, Expand);
988
991
992 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
993 // types, we have to deal with them whether we ask for Expansion or not.
994 // Setting Expand causes its own optimisation problems though, so leave
995 // them legal.
996 if (VT.getVectorElementType() == MVT::i1)
997 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
998
999 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1000 // split/scalarized right now.
1001 if (VT.getVectorElementType() == MVT::f16 ||
1002 VT.getVectorElementType() == MVT::bf16)
1003 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1004 }
1005 }
1006
1007 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1008 // with -msoft-float, disable use of MMX as well.
1009 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1010 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1011 // No operations on x86mmx supported, everything uses intrinsics.
1012 }
1013
1014 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1015 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1016 : &X86::VR128RegClass);
1017
1020
1021 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1022 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1029
1030 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1031 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1032
1038 }
1039
1040 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1041 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1042 : &X86::VR128RegClass);
1043
1044 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1045 // registers cannot be used even for integer operations.
1046 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1047 : &X86::VR128RegClass);
1048 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1049 : &X86::VR128RegClass);
1050 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1051 : &X86::VR128RegClass);
1052 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1053 : &X86::VR128RegClass);
1054 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1055 : &X86::VR128RegClass);
1056
1057 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1060 }
1061
1062 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1063 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1068 }
1069
1070 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1071 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1072 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1073
1074 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1075 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1076 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1077 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1078 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1079 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1080 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1081 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1082 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1083 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1086
1087 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1088 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1089 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1090
1091 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1092 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1094
1095 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1096 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1097 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1098 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1099 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1100 }
1101
1102 setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
1103 setOperationAction(ISD::ABDS, MVT::v16i8, Custom);
1104 setOperationAction(ISD::ABDU, MVT::v8i16, Custom);
1105 setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
1106 setOperationAction(ISD::ABDU, MVT::v4i32, Custom);
1107 setOperationAction(ISD::ABDS, MVT::v4i32, Custom);
1108
1119
1124
1125 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1129
1130 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1131 // setcc all the way to isel and prefer SETGT in some isel patterns.
1134 }
1135
1136 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1137 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1142
1143 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1149 }
1150
1151 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1155
1156 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1157 continue;
1158
1161 }
1162 setF16Action(MVT::v8f16, Expand);
1163 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1164 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1165 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1166 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1167 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1168 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1170
1171 // Custom lower v2i64 and v2f64 selects.
1178
1185
1186 // Custom legalize these to avoid over promotion or custom promotion.
1187 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1192 }
1193
1198
1201
1204
1205 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1210
1215
1216 // We want to legalize this to an f64 load rather than an i64 load on
1217 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1218 // store.
1219 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1220 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1221 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1222 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1223 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1225
1226 // Add 32-bit vector stores to help vectorization opportunities.
1227 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1229
1233 if (!Subtarget.hasAVX512())
1235
1239
1241
1258
1259 // In the customized shift lowering, the legal v4i32/v2i64 cases
1260 // in AVX2 will be recognized.
1261 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1265 if (VT == MVT::v2i64) continue;
1270 }
1271
1277 }
1278
1279 if (Subtarget.hasGFNI()) {
1284 }
1285
1286 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1287 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1288 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1289 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1290
1291 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1294 }
1295
1296 // These might be better off as horizontal vector ops.
1301 }
1302
1303 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1304 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1307 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1311 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1317
1319 }
1320
1321 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1322 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1323 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1324 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1325 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1326 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1327 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1328 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1329
1330 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1333 }
1334
1338
1339 // FIXME: Do we need to handle scalar-to-vector here?
1340 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1341 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1342
1343 // We directly match byte blends in the backend as they match the VSELECT
1344 // condition form.
1346
1347 // SSE41 brings specific instructions for doing vector sign extend even in
1348 // cases where we don't have SRA.
1349 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1352 }
1353
1354 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1355 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1356 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1357 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1358 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1359 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1360 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1361 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1362 }
1363
1364 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1365 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1366 // do the pre and post work in the vector domain.
1369 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1370 // so that DAG combine doesn't try to turn it into uint_to_fp.
1373 }
1374 }
1375
1376 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1378 }
1379
1380 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1381 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1382 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1385 }
1386
1387 // XOP can efficiently perform BITREVERSE with VPPERM.
1388 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1390 }
1391
1392 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1393 bool HasInt256 = Subtarget.hasInt256();
1394
1395 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1396 : &X86::VR256RegClass);
1397 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1398 : &X86::VR256RegClass);
1399 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1400 : &X86::VR256RegClass);
1401 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1402 : &X86::VR256RegClass);
1403 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1404 : &X86::VR256RegClass);
1405 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1406 : &X86::VR256RegClass);
1407 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1408 : &X86::VR256RegClass);
1409
1410 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1423
1425
1429
1432 }
1433
1434 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1435 // even though v8i16 is a legal type.
1436 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1437 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1438 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1439 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1443
1450
1462
1463 if (!Subtarget.hasAVX512())
1465
1466 // In the customized shift lowering, the legal v8i32/v4i64 cases
1467 // in AVX2 will be recognized.
1468 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1474 if (VT == MVT::v4i64) continue;
1479 }
1480
1481 // These types need custom splitting if their input is a 128-bit vector.
1486
1490 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1491 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1494
1495 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1499 }
1500
1505
1506 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1511
1512 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1513 // setcc all the way to isel and prefer SETGT in some isel patterns.
1516 }
1517
1518 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1519 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1524
1525 if (Subtarget.hasAnyFMA()) {
1526 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1527 MVT::v2f64, MVT::v4f64 }) {
1530 }
1531 }
1532
1533 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1534 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1535 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1536 }
1537
1538 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1539 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1540 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1541 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1542
1543 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1544 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1545 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1546 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1547 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1548 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1549 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1550 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1551
1552 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1553 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1554
1555 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1556 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1557 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1558 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1559 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1560
1561 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1562 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1563 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1564 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1565 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1566 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1567 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1568 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1573
1574 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1575 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1576 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1577 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1578 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1579 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1580 }
1581
1582 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1585 }
1586
1587 if (HasInt256) {
1588 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1589 // when we have a 256-bit-wide blend with immediate.
1592
1593 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1594 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1595 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1596 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1597 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1598 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1599 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1600 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1601 }
1602 }
1603
1604 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1605 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1606 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1608 }
1609
1610 // Extract subvector is special because the value type
1611 // (result) is 128-bit but the source is 256-bit wide.
1612 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1613 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1615 }
1616
1617 // Custom lower several nodes for 256-bit types.
1618 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1619 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1629 }
1630 setF16Action(MVT::v16f16, Expand);
1631 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1632 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1634 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1635 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1636 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1637 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1638
1639 if (HasInt256) {
1641
1642 // Custom legalize 2x32 to get a little better code.
1645
1646 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1647 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1649 }
1650 }
1651
1652 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1653 Subtarget.hasF16C()) {
1654 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1657 }
1658 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1661 }
1662 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1663 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1664 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1665 }
1666 }
1667
1668 // This block controls legalization of the mask vector sizes that are
1669 // available with AVX512. 512-bit vectors are in a separate block controlled
1670 // by useAVX512Regs.
1671 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1672 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1673 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1674 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1675 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1676 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1677
1681
1682 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1683 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1684 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1685 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1686 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1687 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1688 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1689 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1694
1695 // There is no byte sized k-register load or store without AVX512DQ.
1696 if (!Subtarget.hasDQI()) {
1697 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1698 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1699 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1700 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1701
1706 }
1707
1708 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1709 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1713 }
1714
1715 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1717
1718 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1722
1729 }
1730
1731 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1733 }
1734
1735 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1736 // elements. 512-bits can be disabled based on prefer-vector-width and
1737 // required-vector-width function attributes.
1738 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1739 bool HasBWI = Subtarget.hasBWI();
1740
1741 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1742 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1743 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1744 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1745 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1746 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1747 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1748
1749 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1750 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1751 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1752 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1753 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1754 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1755 if (HasBWI)
1756 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1757 }
1758
1759 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1767 }
1768
1769 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1774 }
1775
1776 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1781 }
1782
1789
1801
1802 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1803 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1804 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1805 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1806 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1807 if (HasBWI)
1808 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1809
1810 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1811 // to 512-bit rather than use the AVX2 instructions so that we can use
1812 // k-masks.
1813 if (!Subtarget.hasVLX()) {
1814 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1815 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1818 }
1819 }
1820
1822 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1823 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1833
1834 if (HasBWI) {
1835 // Extends from v64i1 masks to 512-bit vectors.
1839 }
1840
1841 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1854
1856 }
1857
1858 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1861 }
1862
1863 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1864 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1865 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1866 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1867
1868 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1869 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1870 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1871 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1872
1873 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1874 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1875 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1876 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1877 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1878 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1879 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1880 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1881
1882 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1883 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1884
1885 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1895
1896 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1897 // setcc all the way to isel and prefer SETGT in some isel patterns.
1900 }
1901
1902 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1903 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1908
1909 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1916 }
1917
1918 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1919 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1920 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1922 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1923 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1924 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1925 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1930 }
1931
1932 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1933 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1934 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1935 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1936 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1937 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1938
1939 if (Subtarget.hasDQI()) {
1943 setOperationAction(Opc, MVT::v8i64, Custom);
1944 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1945 }
1946
1947 if (Subtarget.hasCDI()) {
1948 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1949 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1951 }
1952 } // Subtarget.hasCDI()
1953
1954 if (Subtarget.hasVPOPCNTDQ()) {
1955 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1957 }
1958
1959 // Extract subvector is special because the value type
1960 // (result) is 256-bit but the source is 512-bit wide.
1961 // 128-bit was made Legal under AVX1.
1962 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1963 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1965
1966 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1967 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1977 }
1978 setF16Action(MVT::v32f16, Expand);
1983 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
1984 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
1985
1986 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1991 }
1992 if (HasBWI) {
1993 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1996 }
1997 } else {
1998 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1999 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2000 }
2001
2002 if (Subtarget.hasVBMI2()) {
2003 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2006 }
2007
2008 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2009 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2010 }
2011 }// useAVX512Regs
2012
2013 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2014 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2015 MVT::v4i64}) {
2018 }
2019 }
2020
2021 // This block controls legalization for operations that don't have
2022 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2023 // narrower widths.
2024 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2025 // These operations are handled on non-VLX by artificially widening in
2026 // isel patterns.
2027
2031
2032 if (Subtarget.hasDQI()) {
2033 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2034 // v2f32 UINT_TO_FP is already custom under SSE2.
2037 "Unexpected operation action!");
2038 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2043 }
2044
2045 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2051 }
2052
2053 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2056 }
2057
2058 // Custom legalize 2x32 to get a little better code.
2061
2062 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2063 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2065
2066 if (Subtarget.hasDQI()) {
2070 setOperationAction(Opc, MVT::v2i64, Custom);
2071 setOperationAction(Opc, MVT::v4i64, Custom);
2072 }
2073 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2074 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2075 }
2076
2077 if (Subtarget.hasCDI()) {
2078 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2080 }
2081 } // Subtarget.hasCDI()
2082
2083 if (Subtarget.hasVPOPCNTDQ()) {
2084 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2086 }
2087 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2088 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2090 }
2091
2092 // This block controls legalization of v32i1/v64i1, which are available with
2093 // AVX512BW.
2094 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2095 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2096 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2097
2098 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2109 }
2110
2111 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2113
2114 // Extends from v32i1 masks to 256-bit vectors.
2118
2119 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2120 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2121 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2122 }
2123
2124 // These operations are handled on non-VLX by artificially widening in
2125 // isel patterns.
2126 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2127
2128 if (Subtarget.hasBITALG()) {
2129 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2131 }
2132 }
2133
2134 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2135 auto setGroup = [&] (MVT VT) {
2146
2159
2161
2164
2170
2176
2180 };
2181
2182 // AVX512_FP16 scalar operations
2183 setGroup(MVT::f16);
2197
2200
2201 if (Subtarget.useAVX512Regs()) {
2202 setGroup(MVT::v32f16);
2208 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2215
2220 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2222 MVT::v32i16);
2223 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2225 MVT::v32i16);
2226 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2228 MVT::v32i16);
2229 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2231 MVT::v32i16);
2232
2236
2237 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2238 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2239 }
2240
2241 if (Subtarget.hasVLX()) {
2242 setGroup(MVT::v8f16);
2243 setGroup(MVT::v16f16);
2244
2255
2266
2267 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2270
2274
2275 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2276 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2277 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2278 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2279
2280 // Need to custom widen these to prevent scalarization.
2281 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2282 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2283 }
2284 }
2285
2286 if (!Subtarget.useSoftFloat() &&
2287 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2288 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2289 : &X86::VR128RegClass);
2290 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2291 : &X86::VR256RegClass);
2292 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2293 // provide a method to promote BUILD_VECTOR and INSERT_VECTOR_ELT, so set
2294 // their operation action to Custom and do the customization later.
2297 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2298 setF16Action(VT, Expand);
2303 }
2304 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2305 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2306 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2307 }
2309 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2310 }
2311
2312 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2313 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2314 setF16Action(MVT::v32bf16, Expand);
2315 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2316 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2318 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2322 }
2323
2324 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2325 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2326 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2327 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2328 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2329 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2330
2331 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2332 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2333 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2334 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2335 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2336
2337 if (Subtarget.hasBWI()) {
2338 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2339 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2340 }
2341
2342 if (Subtarget.hasFP16()) {
2343 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2352 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2361 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2366 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2371 }
2372 }
2373
2374 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2375 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2376 }
2377
2378 // We want to custom lower some of our intrinsics.
2382 if (!Subtarget.is64Bit()) {
2384 }
2385
2386 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2387 // handle type legalization for these operations here.
2388 //
2389 // FIXME: We really should do custom legalization for addition and
2390 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2391 // than generic legalization for 64-bit multiplication-with-overflow, though.
2392 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2393 if (VT == MVT::i64 && !Subtarget.is64Bit())
2394 continue;
2395 // Add/Sub/Mul with overflow operations are custom lowered.
2402
2403 // Support carry in as value rather than glue.
2409 }
2410
2411 if (!Subtarget.is64Bit()) {
2412 // These libcalls are not available in 32-bit.
2413 setLibcallName(RTLIB::SHL_I128, nullptr);
2414 setLibcallName(RTLIB::SRL_I128, nullptr);
2415 setLibcallName(RTLIB::SRA_I128, nullptr);
2416 setLibcallName(RTLIB::MUL_I128, nullptr);
2417 // The MULO libcall is not part of libgcc, only compiler-rt.
2418 setLibcallName(RTLIB::MULO_I64, nullptr);
2419 }
2420 // The MULO libcall is not part of libgcc, only compiler-rt.
2421 setLibcallName(RTLIB::MULO_I128, nullptr);
2422
2423 // Combine sin / cos into _sincos_stret if it is available.
2424 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2425 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2428 }
2429
2430 if (Subtarget.isTargetWin64()) {
2431 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2432 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2433 setOperationAction(ISD::SREM, MVT::i128, Custom);
2434 setOperationAction(ISD::UREM, MVT::i128, Custom);
2443 }
2444
2445 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2446 // is. We should promote the value to 64-bits to solve this.
2447 // This is what the CRT headers do - `fmodf` is an inline header
2448 // function casting to f64 and calling `fmod`.
2449 if (Subtarget.is32Bit() &&
2450 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2451 for (ISD::NodeType Op :
2461 if (isOperationExpand(Op, MVT::f32))
2462 setOperationAction(Op, MVT::f32, Promote);
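// i.e. roughly the same trick the MSVC CRT headers use (illustrative only):
//
//   float fmodf(float x, float y) { return (float)fmod((double)x, (double)y); }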
2463
2464 // We have target-specific dag combine patterns for the following nodes:
2475 ISD::SHL,
2476 ISD::SRA,
2477 ISD::SRL,
2478 ISD::OR,
2479 ISD::AND,
2481 ISD::ADD,
2482 ISD::FADD,
2483 ISD::FSUB,
2484 ISD::FNEG,
2485 ISD::FMA,
2489 ISD::SUB,
2490 ISD::LOAD,
2491 ISD::MLOAD,
2492 ISD::STORE,
2506 ISD::SETCC,
2507 ISD::MUL,
2508 ISD::XOR,
2516
2518
2519 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2521 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2523 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2525
2526 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2527 // that needs to be benchmarked and balanced with the potential use of vector
2528 // load/store types (PR33329, PR33914).
2531
2532 // Default loop alignment, which can be overridden by -align-loops.
2534
2535 // An out-of-order CPU can speculatively execute past a predictable branch,
2536 // but a conditional move could be stalled by an expensive earlier operation.
2537 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2538 EnableExtLdPromotion = true;
2540
2542
2543 // Default to having -disable-strictnode-mutation on
2544 IsStrictFPEnabled = true;
2545}
2546
2547// This has so far only been implemented for 64-bit MachO.
2549 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2550}
2551
2553 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2554 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2555}
2556
2558 const SDLoc &DL) const {
2559 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2560 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2561 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2562 return SDValue(Node, 0);
2563}
2564
2567 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2568 !Subtarget.hasBWI())
2569 return TypeSplitVector;
2570
2571 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2572 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2573 return TypeSplitVector;
2574
2575 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2576 VT.getVectorElementType() != MVT::i1)
2577 return TypeWidenVector;
2578
2580}
2581
2582FastISel *
2584 const TargetLibraryInfo *libInfo) const {
2585 return X86::createFastISel(funcInfo, libInfo);
2586}
2587
2588//===----------------------------------------------------------------------===//
2589// Other Lowering Hooks
2590//===----------------------------------------------------------------------===//
2591
2593 bool AssumeSingleUse) {
2594 if (!AssumeSingleUse && !Op.hasOneUse())
2595 return false;
2596 if (!ISD::isNormalLoad(Op.getNode()))
2597 return false;
2598
2599 // If this is an unaligned vector, make sure the target supports folding it.
2600 auto *Ld = cast<LoadSDNode>(Op.getNode());
2601 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2602 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2603 return false;
2604
2605 // TODO: If this is a non-temporal load and the target has an instruction
2606 // for it, it should not be folded. See "useNonTemporalLoad()".
2607
2608 return true;
2609}
2610
2612 const X86Subtarget &Subtarget,
2613 bool AssumeSingleUse) {
2614 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2615 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2616 return false;
2617
2618 // We can not replace a wide volatile load with a broadcast-from-memory,
2619 // because that would narrow the load, which isn't legal for volatiles.
2620 auto *Ld = cast<LoadSDNode>(Op.getNode());
2621 return !Ld->isVolatile() ||
2622 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2623}
2624
2626 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2627}
2628
2630 if (Op.hasOneUse()) {
2631 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
2632 return (ISD::ZERO_EXTEND == Opcode);
2633 }
2634 return false;
2635}
2636
2637static bool isLogicOp(unsigned Opcode) {
2638 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2639 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2640}
2641
2642static bool isTargetShuffle(unsigned Opcode) {
2643 switch(Opcode) {
2644 default: return false;
2645 case X86ISD::BLENDI:
2646 case X86ISD::PSHUFB:
2647 case X86ISD::PSHUFD:
2648 case X86ISD::PSHUFHW:
2649 case X86ISD::PSHUFLW:
2650 case X86ISD::SHUFP:
2651 case X86ISD::INSERTPS:
2652 case X86ISD::EXTRQI:
2653 case X86ISD::INSERTQI:
2654 case X86ISD::VALIGN:
2655 case X86ISD::PALIGNR:
2656 case X86ISD::VSHLDQ:
2657 case X86ISD::VSRLDQ:
2658 case X86ISD::MOVLHPS:
2659 case X86ISD::MOVHLPS:
2660 case X86ISD::MOVSHDUP:
2661 case X86ISD::MOVSLDUP:
2662 case X86ISD::MOVDDUP:
2663 case X86ISD::MOVSS:
2664 case X86ISD::MOVSD:
2665 case X86ISD::MOVSH:
2666 case X86ISD::UNPCKL:
2667 case X86ISD::UNPCKH:
2668 case X86ISD::VBROADCAST:
2669 case X86ISD::VPERMILPI:
2670 case X86ISD::VPERMILPV:
2671 case X86ISD::VPERM2X128:
2672 case X86ISD::SHUF128:
2673 case X86ISD::VPERMIL2:
2674 case X86ISD::VPERMI:
2675 case X86ISD::VPPERM:
2676 case X86ISD::VPERMV:
2677 case X86ISD::VPERMV3:
2678 case X86ISD::VZEXT_MOVL:
2679 return true;
2680 }
2681}
2682
2683static bool isTargetShuffleVariableMask(unsigned Opcode) {
2684 switch (Opcode) {
2685 default: return false;
2686 // Target Shuffles.
2687 case X86ISD::PSHUFB:
2688 case X86ISD::VPERMILPV:
2689 case X86ISD::VPERMIL2:
2690 case X86ISD::VPPERM:
2691 case X86ISD::VPERMV:
2692 case X86ISD::VPERMV3:
2693 return true;
2694 // 'Faux' Target Shuffles.
2695 case ISD::OR:
2696 case ISD::AND:
2697 case X86ISD::ANDNP:
2698 return true;
2699 }
2700}
2701
2704 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2706 int ReturnAddrIndex = FuncInfo->getRAIndex();
2707
2708 if (ReturnAddrIndex == 0) {
2709 // Set up a frame object for the return address.
2710 unsigned SlotSize = RegInfo->getSlotSize();
2711 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2712 -(int64_t)SlotSize,
2713 false);
2714 FuncInfo->setRAIndex(ReturnAddrIndex);
2715 }
2716
2717 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2718}
2719
2721 bool HasSymbolicDisplacement) {
2722  // The offset should fit into a 32-bit immediate field.
2723 if (!isInt<32>(Offset))
2724 return false;
2725
2726 // If we don't have a symbolic displacement - we don't have any extra
2727 // restrictions.
2728 if (!HasSymbolicDisplacement)
2729 return true;
2730
2731 // We can fold large offsets in the large code model because we always use
2732 // 64-bit offsets.
2733 if (CM == CodeModel::Large)
2734 return true;
2735
2736  // For the kernel code model we know that all objects reside in the negative
2737  // half of the 32-bit address space. We must not accept negative offsets, since
2738  // they may fall out of range, but we may accept pretty large positive ones.
2739 if (CM == CodeModel::Kernel)
2740 return Offset >= 0;
2741
2742  // For other non-large code models we assume that the last small object ends
2743  // at least 16MB before the 31-bit boundary. We may also accept pretty large
2744  // negative offsets, knowing that all objects are in the positive half of the
2745  // address space.
2746 return Offset < 16 * 1024 * 1024;
2747}
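// Illustrative example of the checks above: with a symbolic displacement, the
// kernel code model rejects an offset of -8 but accepts +8MB, while the
// small/medium models accept +8MB and reject +32MB, since the latter could
// cross the 16MB guard below the 31-bit boundary.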
2748
2749/// Return true if the condition is a signed comparison operation.
2750static bool isX86CCSigned(unsigned X86CC) {
2751 switch (X86CC) {
2752 default:
2753 llvm_unreachable("Invalid integer condition!");
2754 case X86::COND_E:
2755 case X86::COND_NE:
2756 case X86::COND_B:
2757 case X86::COND_A:
2758 case X86::COND_BE:
2759 case X86::COND_AE:
2760 return false;
2761 case X86::COND_G:
2762 case X86::COND_GE:
2763 case X86::COND_L:
2764 case X86::COND_LE:
2765 return true;
2766 }
2767}
2768
2770 switch (SetCCOpcode) {
2771 // clang-format off
2772 default: llvm_unreachable("Invalid integer condition!");
2773 case ISD::SETEQ: return X86::COND_E;
2774 case ISD::SETGT: return X86::COND_G;
2775 case ISD::SETGE: return X86::COND_GE;
2776 case ISD::SETLT: return X86::COND_L;
2777 case ISD::SETLE: return X86::COND_LE;
2778 case ISD::SETNE: return X86::COND_NE;
2779 case ISD::SETULT: return X86::COND_B;
2780 case ISD::SETUGT: return X86::COND_A;
2781 case ISD::SETULE: return X86::COND_BE;
2782 case ISD::SETUGE: return X86::COND_AE;
2783 // clang-format on
2784 }
2785}
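// For example, an unsigned (setult a, b) maps to COND_B and is selected as
// cmp + jb/setb, while the signed (setlt a, b) maps to COND_L (jl/setl).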
2786
2787/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
2788/// condition code, returning the condition code and the LHS/RHS of the
2789/// comparison to make.
2791 bool isFP, SDValue &LHS, SDValue &RHS,
2792 SelectionDAG &DAG) {
2793 if (!isFP) {
2794 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2795 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2796 // X > -1 -> X == 0, jump !sign.
2797 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2798 return X86::COND_NS;
2799 }
2800 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2801 // X < 0 -> X == 0, jump on sign.
2802 return X86::COND_S;
2803 }
2804 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
2805 // X >= 0 -> X == 0, jump on !sign.
2806 return X86::COND_NS;
2807 }
2808 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
2809 // X < 1 -> X <= 0
2810 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2811 return X86::COND_LE;
2812 }
2813 }
2814
2815 return TranslateIntegerX86CC(SetCCOpcode);
2816 }
2817
2818 // First determine if it is required or is profitable to flip the operands.
2819
2820 // If LHS is a foldable load, but RHS is not, flip the condition.
2821 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
2822 !ISD::isNON_EXTLoad(RHS.getNode())) {
2823 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2824 std::swap(LHS, RHS);
2825 }
2826
2827 switch (SetCCOpcode) {
2828 default: break;
2829 case ISD::SETOLT:
2830 case ISD::SETOLE:
2831 case ISD::SETUGT:
2832 case ISD::SETUGE:
2833 std::swap(LHS, RHS);
2834 break;
2835 }
2836
2837 // On a floating point condition, the flags are set as follows:
2838 // ZF PF CF op
2839 // 0 | 0 | 0 | X > Y
2840 // 0 | 0 | 1 | X < Y
2841 // 1 | 0 | 0 | X == Y
2842 // 1 | 1 | 1 | unordered
2843 switch (SetCCOpcode) {
2844 // clang-format off
2845 default: llvm_unreachable("Condcode should be pre-legalized away");
2846 case ISD::SETUEQ:
2847 case ISD::SETEQ: return X86::COND_E;
2848 case ISD::SETOLT: // flipped
2849 case ISD::SETOGT:
2850 case ISD::SETGT: return X86::COND_A;
2851 case ISD::SETOLE: // flipped
2852 case ISD::SETOGE:
2853 case ISD::SETGE: return X86::COND_AE;
2854 case ISD::SETUGT: // flipped
2855 case ISD::SETULT:
2856 case ISD::SETLT: return X86::COND_B;
2857 case ISD::SETUGE: // flipped
2858 case ISD::SETULE:
2859 case ISD::SETLE: return X86::COND_BE;
2860 case ISD::SETONE:
2861 case ISD::SETNE: return X86::COND_NE;
2862 case ISD::SETUO: return X86::COND_P;
2863 case ISD::SETO: return X86::COND_NP;
2864 case ISD::SETOEQ:
2865 case ISD::SETUNE: return X86::COND_INVALID;
2866 // clang-format on
2867 }
2868}
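// Worked example of the FP path above: (setcc olt %x, %y) first swaps the
// operands, so the compare is emitted as cmp(%y, %x) and COND_A (ja) is used;
// with the ucomiss/comiss flag layout shown above this is true exactly when
// the operands are ordered and %x < %y.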
2869
2870/// Is there a floating point cmov for the specific X86 condition code?
2871/// The current x86 ISA includes the following FP cmov instructions:
2872/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2873static bool hasFPCMov(unsigned X86CC) {
2874 switch (X86CC) {
2875 default:
2876 return false;
2877 case X86::COND_B:
2878 case X86::COND_BE:
2879 case X86::COND_E:
2880 case X86::COND_P:
2881 case X86::COND_A:
2882 case X86::COND_AE:
2883 case X86::COND_NE:
2884 case X86::COND_NP:
2885 return true;
2886 }
2887}
2888
2889static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
2890 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
2891 VT.is512BitVector();
2892}
2893
2895 const CallInst &I,
2896 MachineFunction &MF,
2897 unsigned Intrinsic) const {
2899 Info.offset = 0;
2900
2901 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
2902 if (!IntrData) {
2903 switch (Intrinsic) {
2904 case Intrinsic::x86_aesenc128kl:
2905 case Intrinsic::x86_aesdec128kl:
2907 Info.ptrVal = I.getArgOperand(1);
2908 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2909 Info.align = Align(1);
2911 return true;
2912 case Intrinsic::x86_aesenc256kl:
2913 case Intrinsic::x86_aesdec256kl:
2915 Info.ptrVal = I.getArgOperand(1);
2916 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2917 Info.align = Align(1);
2919 return true;
2920 case Intrinsic::x86_aesencwide128kl:
2921 case Intrinsic::x86_aesdecwide128kl:
2923 Info.ptrVal = I.getArgOperand(0);
2924 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2925 Info.align = Align(1);
2927 return true;
2928 case Intrinsic::x86_aesencwide256kl:
2929 case Intrinsic::x86_aesdecwide256kl:
2931 Info.ptrVal = I.getArgOperand(0);
2932 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2933 Info.align = Align(1);
2935 return true;
2936 case Intrinsic::x86_cmpccxadd32:
2937 case Intrinsic::x86_cmpccxadd64:
2938 case Intrinsic::x86_atomic_bts:
2939 case Intrinsic::x86_atomic_btc:
2940 case Intrinsic::x86_atomic_btr: {
2942 Info.ptrVal = I.getArgOperand(0);
2943 unsigned Size = I.getType()->getScalarSizeInBits();
2944 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2945 Info.align = Align(Size);
2948 return true;
2949 }
2950 case Intrinsic::x86_atomic_bts_rm:
2951 case Intrinsic::x86_atomic_btc_rm:
2952 case Intrinsic::x86_atomic_btr_rm: {
2954 Info.ptrVal = I.getArgOperand(0);
2955 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
2956 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2957 Info.align = Align(Size);
2960 return true;
2961 }
2962 case Intrinsic::x86_aadd32:
2963 case Intrinsic::x86_aadd64:
2964 case Intrinsic::x86_aand32:
2965 case Intrinsic::x86_aand64:
2966 case Intrinsic::x86_aor32:
2967 case Intrinsic::x86_aor64:
2968 case Intrinsic::x86_axor32:
2969 case Intrinsic::x86_axor64:
2970 case Intrinsic::x86_atomic_add_cc:
2971 case Intrinsic::x86_atomic_sub_cc:
2972 case Intrinsic::x86_atomic_or_cc:
2973 case Intrinsic::x86_atomic_and_cc:
2974 case Intrinsic::x86_atomic_xor_cc: {
2976 Info.ptrVal = I.getArgOperand(0);
2977 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
2978 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2979 Info.align = Align(Size);
2982 return true;
2983 }
2984 }
2985 return false;
2986 }
2987
2988 switch (IntrData->Type) {
2991 case TRUNCATE_TO_MEM_VI32: {
2993 Info.ptrVal = I.getArgOperand(0);
2994 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
2996 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
2997 ScalarVT = MVT::i8;
2998 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
2999 ScalarVT = MVT::i16;
3000 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3001 ScalarVT = MVT::i32;
3002
3003 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3004 Info.align = Align(1);
3006 break;
3007 }
3008 case GATHER:
3009 case GATHER_AVX2: {
3011 Info.ptrVal = nullptr;
3012 MVT DataVT = MVT::getVT(I.getType());
3013 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3014 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3015 IndexVT.getVectorNumElements());
3016 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3017 Info.align = Align(1);
3019 break;
3020 }
3021 case SCATTER: {
3023 Info.ptrVal = nullptr;
3024 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3025 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3026 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3027 IndexVT.getVectorNumElements());
3028 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3029 Info.align = Align(1);
3031 break;
3032 }
3033 default:
3034 return false;
3035 }
3036
3037 return true;
3038}
3039
3040/// Returns true if the target can instruction select the
3041/// specified FP immediate natively. If false, the legalizer will
3042/// materialize the FP immediate as a load from a constant pool.
3044 bool ForCodeSize) const {
3045 for (const APFloat &FPImm : LegalFPImmediates)
3046 if (Imm.bitwiseIsEqual(FPImm))
3047 return true;
3048 return false;
3049}
3050
3052 ISD::LoadExtType ExtTy,
3053 EVT NewVT) const {
3054 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3055
3056 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3057  // relocations must target a movq or addq instruction: don't let the load shrink.
3058 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3059 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3060 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3061 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3062
3063 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
3064 // those uses are extracted directly into a store, then the extract + store
3065 // can be store-folded. Therefore, it's probably not worth splitting the load.
3066 EVT VT = Load->getValueType(0);
3067 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3068 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
3069 // Skip uses of the chain value. Result 0 of the node is the load value.
3070 if (UI.getUse().getResNo() != 0)
3071 continue;
3072
3073 // If this use is not an extract + store, it's probably worth splitting.
3074 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
3075 UI->use_begin()->getOpcode() != ISD::STORE)
3076 return true;
3077 }
3078 // All non-chain uses are extract + store.
3079 return false;
3080 }
3081
3082 return true;
3083}
3084
3085/// Returns true if it is beneficial to convert a load of a constant
3086/// to just the constant itself.
3088 Type *Ty) const {
3089 assert(Ty->isIntegerTy());
3090
3091 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3092 if (BitSize == 0 || BitSize > 64)
3093 return false;
3094 return true;
3095}
3096
3098 // If we are using XMM registers in the ABI and the condition of the select is
3099 // a floating-point compare and we have blendv or conditional move, then it is
3100 // cheaper to select instead of doing a cross-register move and creating a
3101 // load that depends on the compare result.
3102 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3103 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3104}
3105
3107 // TODO: It might be a win to ease or lift this restriction, but the generic
3108 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3109 if (VT.isVector() && Subtarget.hasAVX512())
3110 return false;
3111
3112 return true;
3113}
3114
3116 SDValue C) const {
3117 // TODO: We handle scalars using custom code, but generic combining could make
3118 // that unnecessary.
3119 APInt MulC;
3120 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3121 return false;
3122
3123  // Find the type this will be legalized to. Otherwise we might prematurely
3124 // convert this to shl+add/sub and then still have to type legalize those ops.
3125 // Another choice would be to defer the decision for illegal types until
3126 // after type legalization. But constant splat vectors of i64 can't make it
3127 // through type legalization on 32-bit targets so we would need to special
3128 // case vXi64.
3129 while (getTypeAction(Context, VT) != TypeLegal)
3130 VT = getTypeToTransformTo(Context, VT);
3131
3132 // If vector multiply is legal, assume that's faster than shl + add/sub.
3133  // Multiply is a complex op with higher latency and lower throughput in
3134  // most implementations; sub-vXi32 vector multiplies are always fast,
3135  // vXi32 must not have a slow PMULLD implementation, and anything larger
3136  // (vXi64) is always going to be slow.
3137 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3138 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3139 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3140 return false;
3141
3142 // shl+add, shl+sub, shl+add+neg
3143 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3144 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3145}
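// For instance, when the legalized vector multiply is not considered cheap, a
// splat multiply by 17 is allowed to decompose into (shl x, 4) + x because
// 17 - 1 is a power of two, whereas a multiply by 22 is kept as a MUL since
// none of 23, 21, -21 or -23 is a power of two.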
3146
3148 unsigned Index) const {
3150 return false;
3151
3152 // Mask vectors support all subregister combinations and operations that
3153 // extract half of vector.
3154 if (ResVT.getVectorElementType() == MVT::i1)
3155 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3156 (Index == ResVT.getVectorNumElements()));
3157
3158 return (Index % ResVT.getVectorNumElements()) == 0;
3159}
3160
3162 unsigned Opc = VecOp.getOpcode();
3163
3164 // Assume target opcodes can't be scalarized.
3165 // TODO - do we have any exceptions?
3166 if (Opc >= ISD::BUILTIN_OP_END)
3167 return false;
3168
3169 // If the vector op is not supported, try to convert to scalar.
3170 EVT VecVT = VecOp.getValueType();
3171 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
3172 return true;
3173
3174 // If the vector op is supported, but the scalar op is not, the transform may
3175 // not be worthwhile.
3176 EVT ScalarVT = VecVT.getScalarType();
3177 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3178}
3179
3181 bool) const {
3182 // TODO: Allow vectors?
3183 if (VT.isVector())
3184 return false;
3185 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3186}
3187
3189 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
3190 return Subtarget.hasBMI() ||
3191 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
3192}
3193
3195 // Speculate ctlz only if we can directly use LZCNT.
3196 return Subtarget.hasLZCNT();
3197}
3198
3200 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3201 // expensive than a straight movsd. On the other hand, it's important to
3202 // shrink long double fp constant since fldt is very slow.
3203 return !Subtarget.hasSSE2() || VT == MVT::f80;
3204}
3205
3207 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3208 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3209}
3210
3212 const SelectionDAG &DAG,
3213 const MachineMemOperand &MMO) const {
3214 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3215 BitcastVT.getVectorElementType() == MVT::i1)
3216 return false;
3217
3218 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3219 return false;
3220
3221 // If both types are legal vectors, it's always ok to convert them.
3222 if (LoadVT.isVector() && BitcastVT.isVector() &&
3223 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3224 return true;
3225
3226 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3227}
3228
3230 const MachineFunction &MF) const {
3231  // Do not merge to float value size (128 bits) if no implicit
3232 // float attribute is set.
3233 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3234
3235 if (NoFloat) {
3236 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3237 return (MemVT.getSizeInBits() <= MaxIntSize);
3238 }
3239 // Make sure we don't merge greater than our preferred vector
3240 // width.
3241 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3242 return false;
3243
3244 return true;
3245}
3246
3248 return Subtarget.hasFastLZCNT();
3249}
3250
3252 const Instruction &AndI) const {
3253 return true;
3254}
3255
3257 EVT VT = Y.getValueType();
3258
3259 if (VT.isVector())
3260 return false;
3261
3262 if (!Subtarget.hasBMI())
3263 return false;
3264
3265 // There are only 32-bit and 64-bit forms for 'andn'.
3266 if (VT != MVT::i32 && VT != MVT::i64)
3267 return false;
3268
3269 return !isa<ConstantSDNode>(Y);
3270}
3271
3273 EVT VT = Y.getValueType();
3274
3275 if (!VT.isVector())
3276 return hasAndNotCompare(Y);
3277
3278 // Vector.
3279
3280 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3281 return false;
3282
3283 if (VT == MVT::v4i32)
3284 return true;
3285
3286 return Subtarget.hasSSE2();
3287}
3288
3290 return X.getValueType().isScalarInteger(); // 'bt'
3291}
3292
3296 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3297 SelectionDAG &DAG) const {
3298 // Does baseline recommend not to perform the fold by default?
3300 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3301 return false;
3302 // For scalars this transform is always beneficial.
3303 if (X.getValueType().isScalarInteger())
3304 return true;
3305 // If all the shift amounts are identical, then transform is beneficial even
3306 // with rudimentary SSE2 shifts.
3307 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3308 return true;
3309  // If we have AVX2 with its powerful shift operations, then it's also good.
3310 if (Subtarget.hasAVX2())
3311 return true;
3312  // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
3313 return NewShiftOpcode == ISD::SHL;
3314}
3315
3317 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3318 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3319 if (!VT.isInteger())
3320 return ShiftOpc;
3321
3322 bool PreferRotate = false;
3323 if (VT.isVector()) {
3324    // For vectors, if we have rotate instruction support, then it's definitely
3325    // best. Otherwise it's not clear what's best, so just don't make changes.
3326 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3327 VT.getScalarType() == MVT::i64);
3328 } else {
3329    // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3330 // rotate unless we have a zext mask+shr.
3331 PreferRotate = Subtarget.hasBMI2();
3332 if (!PreferRotate) {
3333 unsigned MaskBits =
3334 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3335 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3336 }
3337 }
3338
3339 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3340 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3341
3342 if (PreferRotate && MayTransformRotate)
3343 return ISD::ROTL;
3344
3345    // For vectors we don't really get much benefit from swapping around constants.
3346 // Maybe we could check if the DAG has the flipped node already in the
3347 // future.
3348 if (VT.isVector())
3349 return ShiftOpc;
3350
3351    // See if it's beneficial to swap the shift type.
3352 if (ShiftOpc == ISD::SHL) {
3353      // If the current setup has an imm64 mask, then the inverse will have
3354      // at least an imm32 mask (or be zext i32 -> i64).
3355 if (VT == MVT::i64)
3356 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3357 : ShiftOpc;
3358
3359      // We can only benefit if the mask requires at least 7 bits. We don't
3360      // want to replace shl by 1, 2 or 3, as those can be implemented with
3361      // lea/add.
3362 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3363 }
3364
3365 if (VT == MVT::i64)
3366 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3367 // extremely efficient.
3368 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3369
3370 // Keep small shifts as shl so we can generate add/lea.
3371 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3372 }
3373
3374  // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3375 // (PreferRotate will be set in the latter case).
3376 if (PreferRotate || VT.isVector())
3377 return ShiftOpc;
3378
3379 // Non-vector type and we have a zext mask with SRL.
3380 return ISD::SRL;
3381}
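// Rough illustration of the preferences above: for a scalar shift+mask where
// BMI2 is available and a rotate is allowed, ISD::ROTL is reported so rorx
// can be used, while for vectors a rotate is only preferred when AVX512
// rotates exist for the i32/i64 element type.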
3382
3385 const Value *Lhs,
3386 const Value *Rhs) const {
3387 using namespace llvm::PatternMatch;
3388 int BaseCost = BrMergingBaseCostThresh.getValue();
3389 // a == b && a == c is a fast pattern on x86.
3391 if (BaseCost >= 0 && Opc == Instruction::And &&
3392 match(Lhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3393 Pred == ICmpInst::ICMP_EQ &&
3394 match(Rhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3395 Pred == ICmpInst::ICMP_EQ)
3396 BaseCost += 1;
3397 return {BaseCost, BrMergingLikelyBias.getValue(),
3398 BrMergingUnlikelyBias.getValue()};
3399}
3400
3402 return N->getOpcode() != ISD::FP_EXTEND;
3403}
3404
3406 const SDNode *N, CombineLevel Level) const {
3407 assert(((N->getOpcode() == ISD::SHL &&
3408 N->getOperand(0).getOpcode() == ISD::SRL) ||
3409 (N->getOpcode() == ISD::SRL &&
3410 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3411 "Expected shift-shift mask");
3412 // TODO: Should we always create i64 masks? Or only folded immediates?
3413 EVT VT = N->getValueType(0);
3414 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3415 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3416 // Only fold if the shift values are equal - so it folds to AND.
3417 // TODO - we should fold if either is a non-uniform vector but we don't do
3418 // the fold for non-splats yet.
3419 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3420 }
3422}
3423
3425 EVT VT = Y.getValueType();
3426
3427 // For vectors, we don't have a preference, but we probably want a mask.
3428 if (VT.isVector())
3429 return false;
3430
3431 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3432 if (VT == MVT::i64 && !Subtarget.is64Bit())
3433 return false;
3434
3435 return true;
3436}
3437
3440 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3442 !Subtarget.isOSWindows())
3445 ExpansionFactor);
3446}
3447
3449 // Any legal vector type can be splatted more efficiently than
3450 // loading/spilling from memory.
3451 return isTypeLegal(VT);
3452}
3453
3455 MVT VT = MVT::getIntegerVT(NumBits);
3456 if (isTypeLegal(VT))
3457 return VT;
3458
3459 // PMOVMSKB can handle this.
3460 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3461 return MVT::v16i8;
3462
3463 // VPMOVMSKB can handle this.
3464 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3465 return MVT::v32i8;
3466
3467 // TODO: Allow 64-bit type for 32-bit target.
3468 // TODO: 512-bit types should be allowed, but make sure that those
3469 // cases are handled in combineVectorSizedSetCCEquality().
3470
3472}
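// For example, a 16-byte memcmp equality test can be expanded as a v16i8
// compare followed by PMOVMSKB instead of a pair of scalar i64 compares,
// which is why v16i8 is returned for 128 bits above.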
3473
3474/// Val is the undef sentinel value or equal to the specified value.
3475static bool isUndefOrEqual(int Val, int CmpVal) {
3476 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3477}
3478
3479/// Return true if every element in Mask is the undef sentinel value or equal to
3480/// the specified value.
3481static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3482 return llvm::all_of(Mask, [CmpVal](int M) {
3483 return (M == SM_SentinelUndef) || (M == CmpVal);
3484 });
3485}
3486
3487/// Return true if every element in Mask, beginning from position Pos and ending
3488/// in Pos+Size is the undef sentinel value or equal to the specified value.
3489static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3490 unsigned Size) {
3491 return llvm::all_of(Mask.slice(Pos, Size),
3492 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3493}
3494
3495/// Val is either the undef or zero sentinel value.
3496static bool isUndefOrZero(int Val) {
3497 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3498}
3499
3500/// Return true if every element in Mask, beginning from position Pos and ending
3501/// in Pos+Size is the undef sentinel value.
3502static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3503 return llvm::all_of(Mask.slice(Pos, Size),
3504 [](int M) { return M == SM_SentinelUndef; });
3505}
3506
3507/// Return true if the mask creates a vector whose lower half is undefined.
3509 unsigned NumElts = Mask.size();
3510 return isUndefInRange(Mask, 0, NumElts / 2);
3511}
3512
3513/// Return true if the mask creates a vector whose upper half is undefined.
3515 unsigned NumElts = Mask.size();
3516 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3517}
3518
3519/// Return true if Val falls within the specified half-open range [Low, Hi).
3520static bool isInRange(int Val, int Low, int Hi) {
3521 return (Val >= Low && Val < Hi);
3522}
3523
3524/// Return true if the value of any element in Mask falls within the specified
3525/// half-open range [Low, Hi).
3526static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3527 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3528}
3529
3530/// Return true if the value of any element in Mask is the zero sentinel value.
3531static bool isAnyZero(ArrayRef<int> Mask) {
3532 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3533}
3534
3535/// Return true if Val is undef or if its value falls within the
3536/// specified half-open range [Low, Hi).
3537static bool isUndefOrInRange(int Val, int Low, int Hi) {
3538 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3539}
3540
3541/// Return true if every element in Mask is undef or if its value
3542/// falls within the specified half-open range [Low, Hi).
3543static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3544 return llvm::all_of(
3545 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3546}
3547
3548/// Return true if Val is undef, zero or if its value falls within the
3549/// specified half-open range [Low, Hi).
3550static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3551 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3552}
3553
3554/// Return true if every element in Mask is undef, zero or if its value
3555/// falls within the specified half-open range [Low, Hi).
3556static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3557 return llvm::all_of(
3558 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3559}
3560
3561/// Return true if every element in Mask, beginning
3562/// from position Pos and ending in Pos + Size, falls within the specified
3563/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3564static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3565 unsigned Size, int Low, int Step = 1) {
3566 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3567 if (!isUndefOrEqual(Mask[i], Low))
3568 return false;
3569 return true;
3570}
3571
3572/// Return true if every element in Mask, beginning
3573/// from position Pos and ending in Pos+Size, falls within the specified
3574/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or zero.
3576 unsigned Size, int Low,
3577 int Step = 1) {
3578 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3579 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3580 return false;
3581 return true;
3582}
3583
3584/// Return true if every element in Mask, beginning
3585/// from position Pos and ending in Pos+Size is undef or is zero.
3586static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3587 unsigned Size) {
3588 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3589}
3590
3591/// Return true if every element of a single input is referenced by the shuffle
3592/// mask. i.e. it just permutes them all.
3594 unsigned NumElts = Mask.size();
3595 APInt DemandedElts = APInt::getZero(NumElts);
3596 for (int M : Mask)
3597 if (isInRange(M, 0, NumElts))
3598 DemandedElts.setBit(M);
3599 return DemandedElts.isAllOnes();
3600}
3601
3602/// Helper function to test whether a shuffle mask could be
3603/// simplified by widening the elements being shuffled.
3604///
3605/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3606/// leaves it in an unspecified state.
3607///
3608/// NOTE: This must handle normal vector shuffle masks and *target* vector
3609/// shuffle masks. The latter have the special property of a '-2' representing
3610/// a zero-ed lane of a vector.
3612 SmallVectorImpl<int> &WidenedMask) {
3613 WidenedMask.assign(Mask.size() / 2, 0);
3614 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3615 int M0 = Mask[i];
3616 int M1 = Mask[i + 1];
3617
3618    // If both elements are undef, it's trivial.
3619 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3620 WidenedMask[i / 2] = SM_SentinelUndef;
3621 continue;
3622 }
3623
3624 // Check for an undef mask and a mask value properly aligned to fit with
3625 // a pair of values. If we find such a case, use the non-undef mask's value.
3626 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3627 WidenedMask[i / 2] = M1 / 2;
3628 continue;
3629 }
3630 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3631 WidenedMask[i / 2] = M0 / 2;
3632 continue;
3633 }
3634
3635 // When zeroing, we need to spread the zeroing across both lanes to widen.
3636 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3637 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3639 WidenedMask[i / 2] = SM_SentinelZero;
3640 continue;
3641 }
3642 return false;
3643 }
3644
3645 // Finally check if the two mask values are adjacent and aligned with
3646 // a pair.
3647 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3648 WidenedMask[i / 2] = M0 / 2;
3649 continue;
3650 }
3651
3652 // Otherwise we can't safely widen the elements used in this shuffle.
3653 return false;
3654 }
3655 assert(WidenedMask.size() == Mask.size() / 2 &&
3656 "Incorrect size of mask after widening the elements!");
3657
3658 return true;
3659}
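// Worked examples of the widening above: the v4 mask <0,1,6,7> widens to
// <0,3>, <-1,1,4,5> widens to <0,2> because each undef pairs with an aligned
// element, and <1,2,3,4> fails since its pairs are not 2-element aligned.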
3660
3662 const APInt &Zeroable,
3663 bool V2IsZero,
3664 SmallVectorImpl<int> &WidenedMask) {
3665 // Create an alternative mask with info about zeroable elements.
3666 // Here we do not set undef elements as zeroable.
3667 SmallVector<int, 64> ZeroableMask(Mask);
3668 if (V2IsZero) {
3669 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3670 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3671 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3672 ZeroableMask[i] = SM_SentinelZero;
3673 }
3674 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3675}
3676
3678 SmallVector<int, 32> WidenedMask;
3679 return canWidenShuffleElements(Mask, WidenedMask);
3680}
3681
3682// Attempt to narrow/widen shuffle mask until it matches the target number of
3683// elements.
3684static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3685 SmallVectorImpl<int> &ScaledMask) {
3686 unsigned NumSrcElts = Mask.size();
3687 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3688 "Illegal shuffle scale factor");
3689
3690 // Narrowing is guaranteed to work.
3691 if (NumDstElts >= NumSrcElts) {
3692 int Scale = NumDstElts / NumSrcElts;
3693 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3694 return true;
3695 }
3696
3697 // We have to repeat the widening until we reach the target size, but we can
3698 // split out the first widening as it sets up ScaledMask for us.
3699 if (canWidenShuffleElements(Mask, ScaledMask)) {
3700 while (ScaledMask.size() > NumDstElts) {
3701 SmallVector<int, 16> WidenedMask;
3702 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3703 return false;
3704 ScaledMask = std::move(WidenedMask);
3705 }
3706 return true;
3707 }
3708
3709 return false;
3710}
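// For example, scaling the v4 mask <0,1,2,3> to 8 elements narrows each lane
// into two, giving <0,1,2,3,4,5,6,7>, while scaling it to 2 elements widens
// pairs and gives <0,1>.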
3711
3712/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3714 return isNullConstant(Elt) || isNullFPConstant(Elt);
3715}
3716
3717// Build a vector of constants.
3718// Use an UNDEF node if MaskElt == -1.
3719// Split 64-bit constants in the 32-bit mode.
3721 const SDLoc &dl, bool IsMask = false) {
3722
3724 bool Split = false;
3725
3726 MVT ConstVecVT = VT;
3727 unsigned NumElts = VT.getVectorNumElements();
3728 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3729 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3730 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3731 Split = true;
3732 }
3733
3734 MVT EltVT = ConstVecVT.getVectorElementType();
3735 for (unsigned i = 0; i < NumElts; ++i) {
3736 bool IsUndef = Values[i] < 0 && IsMask;
3737 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
3738 DAG.getConstant(Values[i], dl, EltVT);
3739 Ops.push_back(OpNode);
3740 if (Split)
3741 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
3742 DAG.getConstant(0, dl, EltVT));
3743 }
3744 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3745 if (Split)
3746 ConstsNode = DAG.getBitcast(VT, ConstsNode);
3747 return ConstsNode;
3748}
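// For example, on a 32-bit target (no legal i64) a v2i64 constant of small
// non-negative values {3, 1} is built as the v4i32 vector {3, 0, 1, 0} and
// then bitcast back to v2i64.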
3749
3750static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
3751 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
3752 assert(Bits.size() == Undefs.getBitWidth() &&
3753 "Unequal constant and undef arrays");
3755 bool Split = false;
3756
3757 MVT ConstVecVT = VT;
3758 unsigned NumElts = VT.getVectorNumElements();
3759 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3760 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3761 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3762 Split = true;
3763 }
3764
3765 MVT EltVT = ConstVecVT.getVectorElementType();
3766 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
3767 if (Undefs[i]) {
3768 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
3769 continue;
3770 }
3771 const APInt &V = Bits[i];
3772 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
3773 if (Split) {
3774 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
3775 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
3776 } else if (EltVT == MVT::f32) {
3778 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3779 } else if (EltVT == MVT::f64) {
3781 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3782 } else {
3783 Ops.push_back(DAG.getConstant(V, dl, EltVT));
3784 }
3785 }
3786
3787 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3788 return DAG.getBitcast(VT, ConstsNode);
3789}
3790
3792 SelectionDAG &DAG, const SDLoc &dl) {
3793 APInt Undefs = APInt::getZero(Bits.size());
3794 return getConstVector(Bits, Undefs, VT, DAG, dl);
3795}
3796
3797/// Returns a vector of specified type with all zero elements.
3798static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
3799 SelectionDAG &DAG, const SDLoc &dl) {
3800 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
3801 VT.getVectorElementType() == MVT::i1) &&
3802 "Unexpected vector type");
3803
3804 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
3805 // type. This ensures they get CSE'd. But if the integer type is not
3806 // available, use a floating-point +0.0 instead.
3807 SDValue Vec;
3808 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3809 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
3810 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
3811 } else if (VT.isFloatingPoint() &&
3813 Vec = DAG.getConstantFP(+0.0, dl, VT);
3814 } else if (VT.getVectorElementType() == MVT::i1) {
3815 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
3816 "Unexpected vector type");
3817 Vec = DAG.getConstant(0, dl, VT);
3818 } else {
3819 unsigned Num32BitElts = VT.getSizeInBits() / 32;
3820 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
3821 }
3822 return DAG.getBitcast(VT, Vec);
3823}
3824
3825// Helper to determine whether the ops are all subvectors extracted from a
3826// single source. If we allow commute, they don't have to be in order (Lo/Hi).
3827static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
3828 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3829 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3830 LHS.getValueType() != RHS.getValueType() ||
3831 LHS.getOperand(0) != RHS.getOperand(0))
3832 return SDValue();
3833
3834 SDValue Src = LHS.getOperand(0);
3835 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
3836 return SDValue();
3837
3838 unsigned NumElts = LHS.getValueType().getVectorNumElements();
3839 if ((LHS.getConstantOperandAPInt(1) == 0 &&
3840 RHS.getConstantOperandAPInt(1) == NumElts) ||
3841 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
3842 LHS.getConstantOperandAPInt(1) == NumElts))
3843 return Src;
3844
3845 return SDValue();
3846}
3847
3848static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
3849 const SDLoc &dl, unsigned vectorWidth) {
3850 EVT VT = Vec.getValueType();
3851 EVT ElVT = VT.getVectorElementType();
3852 unsigned Factor = VT.getSizeInBits() / vectorWidth;
3853 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
3854 VT.getVectorNumElements() / Factor);
3855
3856 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
3857 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
3858 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3859
3860 // This is the index of the first element of the vectorWidth-bit chunk
3861  // we want. Since ElemsPerChunk is a power of 2 we just need to clear the low bits.
3862 IdxVal &= ~(ElemsPerChunk - 1);
3863
3864 // If the input is a buildvector just emit a smaller one.
3865 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
3866 return DAG.getBuildVector(ResultVT, dl,
3867 Vec->ops().slice(IdxVal, ElemsPerChunk));
3868
3869 // Check if we're extracting the upper undef of a widening pattern.
3870 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
3871 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
3872 isNullConstant(Vec.getOperand(2)))
3873 return DAG.getUNDEF(ResultVT);
3874
3875 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3876 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
3877}
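// For example, extracting at index 5 from a v8f32 with a 128-bit chunk width
// rounds the index down to 4, so the whole upper 128-bit half is returned as
// a v4f32.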
3878
3879/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
3880/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
3881/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
3882/// instructions or a simple subregister reference. Idx is an index in the
3883/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
3884/// lowering EXTRACT_VECTOR_ELT operations easier.
3885static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
3886 SelectionDAG &DAG, const SDLoc &dl) {
3888 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
3889 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
3890}
3891
3892/// Generate a DAG to grab 256-bits from a 512-bit vector.
3893static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
3894 SelectionDAG &DAG, const SDLoc &dl) {
3895 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
3896 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
3897}
3898
3899static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
3900 SelectionDAG &DAG, const SDLoc &dl,
3901 unsigned vectorWidth) {
3902 assert((vectorWidth == 128 || vectorWidth == 256) &&
3903 "Unsupported vector width");
3904  // Inserting an UNDEF subvector just returns Result unchanged.
3905 if (Vec.isUndef())
3906 return Result;
3907 EVT VT = Vec.getValueType();
3908 EVT ElVT = VT.getVectorElementType();
3909 EVT ResultVT = Result.getValueType();
3910
3911 // Insert the relevant vectorWidth bits.
3912 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
3913 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3914
3915 // This is the index of the first element of the vectorWidth-bit chunk
3916  // we want. Since ElemsPerChunk is a power of 2 we just need to clear the low bits.
3917 IdxVal &= ~(ElemsPerChunk - 1);
3918
3919 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3920 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
3921}
3922
3923/// Generate a DAG to put 128-bits into a vector > 128 bits. This
3924/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
3925/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
3926/// simple superregister reference. Idx is an index in the 128 bits
3927/// we want. It need not be aligned to a 128-bit boundary. That makes
3928/// lowering INSERT_VECTOR_ELT operations easier.
3929static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
3930 SelectionDAG &DAG, const SDLoc &dl) {
3931 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
3932 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
3933}
3934
3935/// Widen a vector to a larger size with the same scalar type, with the new
3936/// elements either zero or undef.
3937static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
3938 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3939 const SDLoc &dl) {
3941 Vec.getValueType().getScalarType() == VT.getScalarType() &&
3942 "Unsupported vector widening type");
3943 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
3944 : DAG.getUNDEF(VT);
3945 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
3946 DAG.getIntPtrConstant(0, dl));
3947}
3948
3949/// Widen a vector to a larger size with the same scalar type, with the new
3950/// elements either zero or undef.
3951static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
3952 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3953 const SDLoc &dl, unsigned WideSizeInBits) {
3954 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
3955 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
3956 "Unsupported vector widening type");
3957 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
3958 MVT SVT = Vec.getSimpleValueType().getScalarType();
3959 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
3960 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
3961}
3962
3963/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
3964/// and bitcast with integer types.
3965static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
3966 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
3967 unsigned NumElts = VT.getVectorNumElements();
3968 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
3969 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
3970 return VT;
3971}
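// For example, a v4i1 mask widens to v8i1 when AVX512DQ is available and to
// v16i1 otherwise, while v32i1 is already wide enough and is returned
// unchanged.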
3972
3973/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
3974/// bitcast with integer types.
3975static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
3976 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3977 const SDLoc &dl) {
3978 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
3979 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
3980}
3981
3982// Helper function to collect subvector ops that are concatenated together,
3983// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
3984// The subvectors in Ops are guaranteed to be the same type.
3986 SelectionDAG &DAG) {
3987 assert(Ops.empty() && "Expected an empty ops vector");
3988
3989 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
3990 Ops.append(N->op_begin(), N->op_end());
3991 return true;
3992 }
3993
3994 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
3995 SDValue Src = N->getOperand(0);
3996 SDValue Sub = N->getOperand(1);
3997 const APInt &Idx = N->getConstantOperandAPInt(2);
3998 EVT VT = Src.getValueType();
3999 EVT SubVT = Sub.getValueType();
4000
4001 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4002 // insert_subvector(undef, x, lo)
4003 if (Idx == 0 && Src.isUndef()) {
4004 Ops.push_back(Sub);
4005 Ops.push_back(DAG.getUNDEF(SubVT));
4006 return true;
4007 }
4008 if (Idx == (VT.getVectorNumElements() / 2)) {
4009 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4010 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4011 Src.getOperand(1).getValueType() == SubVT &&
4012 isNullConstant(Src.getOperand(2))) {
4013 // Attempt to recurse into inner (matching) concats.
4014 SDValue Lo = Src.getOperand(1);
4015 SDValue Hi = Sub;
4016 SmallVector<SDValue, 2> LoOps, HiOps;
4017 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4018 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4019 LoOps.size() == HiOps.size()) {
4020 Ops.append(LoOps);
4021 Ops.append(HiOps);
4022 return true;
4023 }
4024 Ops.push_back(Lo);
4025 Ops.push_back(Hi);
4026 return true;
4027 }
4028 // insert_subvector(x, extract_subvector(x, lo), hi)
4029 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4030 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4031 Ops.append(2, Sub);
4032 return true;
4033 }
4034 // insert_subvector(undef, x, hi)
4035 if (Src.isUndef()) {
4036 Ops.push_back(DAG.getUNDEF(SubVT));
4037 Ops.push_back(Sub);
4038 return true;
4039 }
4040 }
4041 }
4042 }
4043
4044 return false;
4045}
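// For example, for a v8i32 node built as
//   insert_subvector(insert_subvector(undef, x, 0), y, 4)
// this collects Ops = {x, y}, and a plain CONCAT_VECTORS simply contributes
// its operands.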
4046
4047// Helper to check if \p V can be split into subvectors and the upper subvectors
4048// are all undef, in which case return the lower subvector.
4050 SelectionDAG &DAG) {
4051 SmallVector<SDValue> SubOps;
4052 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4053 return SDValue();
4054
4055 unsigned NumSubOps = SubOps.size();
4056 unsigned HalfNumSubOps = NumSubOps / 2;
4057 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4058
4059 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4060 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4061 return SDValue();
4062
4063 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4064 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4065 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4066}
4067
4068// Helper to check if we can access all the constituent subvectors without any
4069// extract ops.
4072 return collectConcatOps(N, Ops, DAG);
4073}
4074
4075static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4076 const SDLoc &dl) {
4077 EVT VT = Op.getValueType();
4078 unsigned NumElems = VT.getVectorNumElements();
4079 unsigned SizeInBits = VT.getSizeInBits();
4080 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4081 "Can't split odd sized vector");
4082
4083 // If this is a splat value (with no-undefs) then use the lower subvector,
4084 // which should be a free extraction.
4085 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4086 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4087 return std::make_pair(Lo, Lo);
4088
4089 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4090 return std::make_pair(Lo, Hi);
4091}
4092
4093/// Break an operation into 2 half sized ops and then concatenate the results.
4095 unsigned NumOps = Op.getNumOperands();
4096 EVT VT = Op.getValueType();
4097
4098 // Extract the LHS Lo/Hi vectors
4099 SmallVector<SDValue> LoOps(NumOps, SDValue());
4100 SmallVector<SDValue> HiOps(NumOps, SDValue());
4101 for (unsigned I = 0; I != NumOps; ++I) {
4102 SDValue SrcOp = Op.getOperand(I);
4103 if (!SrcOp.getValueType().isVector()) {
4104 LoOps[I] = HiOps[I] = SrcOp;
4105 continue;
4106 }
4107 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4108 }
4109
4110 EVT LoVT, HiVT;
4111 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4112 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4113 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4114 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4115}
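// For instance, applied to a 256-bit v16i16 node this emits two v8i16 copies
// of the same opcode on the low and high halves and concatenates the results.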
4116
4117/// Break a unary integer operation into 2 half sized ops and then
4118/// concatenate the result back.
4120 const SDLoc &dl) {
4121 // Make sure we only try to split 256/512-bit types to avoid creating
4122 // narrow vectors.
4123 EVT VT = Op.getValueType();
4124 (void)VT;
4125 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4126 Op.getOperand(0).getValueType().is512BitVector()) &&
4127 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4128 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4129 VT.getVectorNumElements() &&
4130 "Unexpected VTs!");
4131 return splitVectorOp(Op, DAG, dl);
4132}
4133
4134/// Break a binary integer operation into 2 half sized ops and then
4135/// concatenate the result back.
4137 const SDLoc &dl) {
4138 // Assert that all the types match.
4139 EVT VT = Op.getValueType();
4140 (void)VT;
4141 assert(Op.getOperand(0).getValueType() == VT &&
4142 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4143 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4144 return splitVectorOp(Op, DAG, dl);
4145}
4146
4147// Helper for splitting operands of an operation to legal target size and
4148// applying a function on each part.
4149// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4150// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4151// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4152// The argument Builder is a function that will be applied on each split part:
4153// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4154template <typename F>
4156 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4157 F Builder, bool CheckBWI = true) {
4158 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4159 unsigned NumSubs = 1;
4160 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4161 (!CheckBWI && Subtarget.useAVX512Regs())) {
4162 if (VT.getSizeInBits() > 512) {
4163 NumSubs = VT.getSizeInBits() / 512;
4164 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4165 }
4166 } else if (Subtarget.hasAVX2()) {
4167 if (VT.getSizeInBits() > 256) {
4168 NumSubs = VT.getSizeInBits() / 256;
4169 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4170 }
4171 } else {
4172 if (VT.getSizeInBits() > 128) {
4173 NumSubs = VT.getSizeInBits() / 128;
4174 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4175 }
4176 }
4177
4178 if (NumSubs == 1)
4179 return Builder(DAG, DL, Ops);
4180
4182 for (unsigned i = 0; i != NumSubs; ++i) {
4184 for (SDValue Op : Ops) {
4185 EVT OpVT = Op.getValueType();
4186 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4187 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4188 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4189 }
4190 Subs.push_back(Builder(DAG, DL, SubOps));
4191 }
4192 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4193}
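// Illustrative use of the splitting helper above (the names here are only a
// sketch, not taken from this section): build a wide X86ISD::VPMADDWD and let
// the helper split the operands to the widest legal size and concatenate:
//   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                            ArrayRef<SDValue> Ops) {
//     MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
//     return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
//   };
//   return SplitOpsAndApply(DAG, Subtarget, DL, ResVT, {LHS, RHS},
//                           PMADDWDBuilder);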
4194
4195// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4196// targets.
4197static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4198 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4199 const X86Subtarget &Subtarget) {
4200 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4201 MVT SVT = VT.getScalarType();
4202
4203 // If we have a 32/64 splatted constant, splat it to DstTy to
4204 // encourage a foldable broadcast'd operand.
4205 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4206 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4207 // AVX512 broadcasts 32/64-bit operands.
4208 // TODO: Support float once getAVX512Node is used by fp-ops.
4209 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4210 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
4211 return SDValue();
4212 // If we're not widening, only bother if the operand is a bitcast.
4213 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4214 return SDValue();
4215 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4216 APInt SplatValue, SplatUndef;
4217 unsigned SplatBitSize;
4218 bool HasAnyUndefs;
4219 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4220 HasAnyUndefs, OpEltSizeInBits) &&
4221 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4222 return DAG.getConstant(SplatValue, DL, DstVT);
4223 }
4224 return SDValue();
4225 };
4226
4227 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4228
4229 MVT DstVT = VT;
4230 if (Widen)
4231 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4232
4233 // Canonicalize src operands.
4234 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
4235 for (SDValue &Op : SrcOps) {
4236 MVT OpVT = Op.getSimpleValueType();
4237 // Just pass through scalar operands.
4238 if (!OpVT.isVector())
4239 continue;
4240 assert(OpVT == VT && "Vector type mismatch");
4241
4242 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4243 Op = BroadcastOp;
4244 continue;
4245 }
4246
4247 // Just widen the subvector by inserting into an undef wide vector.
4248 if (Widen)
4249 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4250 }
4251
4252 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4253
4254 // Perform the 512-bit op then extract the bottom subvector.
4255 if (Widen)
4256 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4257 return Res;
4258}
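// e.g. on an AVX512 target without VLX, a v4i32 node built through
// getAVX512Node is widened to v16i32, the 512-bit node is created, and the
// low 128 bits are extracted back out; splatted 32/64-bit constant operands
// are rebuilt at the wide type so they can fold as broadcasts.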
4259
4260/// Insert i1-subvector to i1-vector.
4261 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4262 const X86Subtarget &Subtarget) {
4263
4264 SDLoc dl(Op);
4265 SDValue Vec = Op.getOperand(0);
4266 SDValue SubVec = Op.getOperand(1);
4267 SDValue Idx = Op.getOperand(2);
4268 unsigned IdxVal = Op.getConstantOperandVal(2);
4269
4270 // Inserting undef is a nop. We can just return the original vector.
4271 if (SubVec.isUndef())
4272 return Vec;
4273
4274 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4275 return Op;
4276
4277 MVT OpVT = Op.getSimpleValueType();
4278 unsigned NumElems = OpVT.getVectorNumElements();
4279 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
4280
4281 // Extend to natively supported kshift.
4282 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4283
4284 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4285 // if necessary.
4286 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4287 // May need to promote to a legal type.
4288 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4289 DAG.getConstant(0, dl, WideOpVT),
4290 SubVec, Idx);
4291 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4292 }
4293
4294 MVT SubVecVT = SubVec.getSimpleValueType();
4295 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4296 assert(IdxVal + SubVecNumElems <= NumElems &&
4297 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4298 "Unexpected index value in INSERT_SUBVECTOR");
4299
4300 SDValue Undef = DAG.getUNDEF(WideOpVT);
4301
4302 if (IdxVal == 0) {
4303 // Zero lower bits of the Vec
4304 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4305 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4306 ZeroIdx);
4307 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4308 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4309 // Merge them together, SubVec should be zero extended.
4310 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4311 DAG.getConstant(0, dl, WideOpVT),
4312 SubVec, ZeroIdx);
4313 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4314 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4315 }
4316
4317 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4318 Undef, SubVec, ZeroIdx);
4319
4320 if (Vec.isUndef()) {
4321 assert(IdxVal != 0 && "Unexpected index");
4322 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4323 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4324 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4325 }
4326
4327 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4328 assert(IdxVal != 0 && "Unexpected index");
4329 // If upper elements of Vec are known undef, then just shift into place.
4330 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4331 [](SDValue V) { return V.isUndef(); })) {
4332 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4333 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4334 } else {
4335 NumElems = WideOpVT.getVectorNumElements();
4336 unsigned ShiftLeft = NumElems - SubVecNumElems;
4337 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4338 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4339 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4340 if (ShiftRight != 0)
4341 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4342 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4343 }
4344 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4345 }
4346
4347 // Simple case when we put subvector in the upper part
4348 if (IdxVal + SubVecNumElems == NumElems) {
4349 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4350 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4351 if (SubVecNumElems * 2 == NumElems) {
4352 // Special case, use legal zero extending insert_subvector. This allows
4353 // isel to optimize when bits are known zero.
4354 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4355 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4356 DAG.getConstant(0, dl, WideOpVT),
4357 Vec, ZeroIdx);
4358 } else {
4359 // Otherwise use explicit shifts to zero the bits.
4360 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4361 Undef, Vec, ZeroIdx);
4362 NumElems = WideOpVT.getVectorNumElements();
4363 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4364 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4365 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4366 }
4367 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4368 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4369 }
4370
4371 // Inserting into the middle is more complicated.
4372
4373 NumElems = WideOpVT.getVectorNumElements();
4374
4375 // Widen the vector if needed.
4376 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4377
4378 unsigned ShiftLeft = NumElems - SubVecNumElems;
4379 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4380
4381 // Do an optimization for the most frequently used types.
4382 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4383 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4384 Mask0.flipAllBits();
4385 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4386 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4387 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4388 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4389 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4390 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4391 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4392 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4393
4394 // Reduce to original width if needed.
4395 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4396 }
4397
4398 // Clear the upper bits of the subvector and move it to its insert position.
4399 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4400 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4401 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4402 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4403
4404 // Isolate the bits below the insertion point.
4405 unsigned LowShift = NumElems - IdxVal;
4406 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4407 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4408 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4409 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4410
4411 // Isolate the bits after the last inserted bit.
4412 unsigned HighShift = IdxVal + SubVecNumElems;
4413 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4414 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4415 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4416 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4417
4418 // Now OR all 3 pieces together.
4419 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4420 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4421
4422 // Reduce to original width if needed.
4423 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4424}
4425
4426 static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4427 const SDLoc &dl) {
4428 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4429 EVT SubVT = V1.getValueType();
4430 EVT SubSVT = SubVT.getScalarType();
4431 unsigned SubNumElts = SubVT.getVectorNumElements();
4432 unsigned SubVectorWidth = SubVT.getSizeInBits();
4433 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4434 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4435 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4436}
4437
4438/// Returns a vector of specified type with all bits set.
4439/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4440/// Then bitcast to their original type, ensuring they get CSE'd.
4441static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4442 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4443 "Expected a 128/256/512-bit vector type");
4444
4445 APInt Ones = APInt::getAllOnes(32);
4446 unsigned NumElts = VT.getSizeInBits() / 32;
4447 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
4448 return DAG.getBitcast(VT, Vec);
4449}
4450
4451static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4452 SDValue In, SelectionDAG &DAG) {
4453 EVT InVT = In.getValueType();
4454 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4455 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
4456 ISD::ZERO_EXTEND == Opcode) &&
4457 "Unknown extension opcode");
4458
4459 // For 256-bit vectors, we only need the lower (128-bit) input half.
4460 // For 512-bit vectors, we only need the lower input half or quarter.
4461 if (InVT.getSizeInBits() > 128) {
4462 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4463 "Expected VTs to be the same size!");
4464 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4465 In = extractSubVector(In, 0, DAG, DL,
4466 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4467 InVT = In.getValueType();
4468 }
4469
4470 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4471 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4472
4473 return DAG.getNode(Opcode, DL, VT, In);
4474}
4475
4476// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4477static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4478 SDValue Mask, SelectionDAG &DAG) {
4479 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4480 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4481 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4482}
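// i.e. getBitSelect returns (LHS & Mask) | (RHS & ~Mask); the X86ISD::ANDNP
// node computes ~Mask & RHS in a single instruction.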
4483
4484 static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4485 bool Lo, bool Unary) {
4486 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4487 "Illegal vector type to unpack");
4488 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4489 int NumElts = VT.getVectorNumElements();
4490 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4491 for (int i = 0; i < NumElts; ++i) {
4492 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4493 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4494 Pos += (Unary ? 0 : NumElts * (i % 2));
4495 Pos += (Lo ? 0 : NumEltsInLane / 2);
4496 Mask.push_back(Pos);
4497 }
4498}
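// e.g. for v8i16 this produces the PUNPCKLWD mask <0,8,1,9,2,10,3,11> with
// Lo=true/Unary=false, and the PUNPCKHWD mask <4,12,5,13,6,14,7,15> with
// Lo=false/Unary=false.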
4499
4500/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4501/// imposed by AVX and specific to the unary pattern. Example:
4502/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4503/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4504 static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4505 bool Lo) {
4506 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4507 int NumElts = VT.getVectorNumElements();
4508 for (int i = 0; i < NumElts; ++i) {
4509 int Pos = i / 2;
4510 Pos += (Lo ? 0 : NumElts / 2);
4511 Mask.push_back(Pos);
4512 }
4513}
4514
4515// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4516static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4517 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4518 if (ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) &&
4519 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4520 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4521 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4522 int M = Mask[I];
4523 if (M < 0)
4524 continue;
4525 SDValue V = (M < NumElts) ? V1 : V2;
4526 if (V.isUndef())
4527 continue;
4528 Ops[I] = V.getOperand(M % NumElts);
4529 }
4530 return DAG.getBuildVector(VT, dl, Ops);
4531 }
4532
4533 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4534}
4535
4536/// Returns a vector_shuffle node for an unpackl operation.
4537static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4538 SDValue V1, SDValue V2) {
4539 SmallVector<int, 8> Mask;
4540 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4541 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4542}
4543
4544/// Returns a vector_shuffle node for an unpackh operation.
4545static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4546 SDValue V1, SDValue V2) {
4547 SmallVector<int, 8> Mask;
4548 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4549 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4550}
4551
4552/// Returns a node that packs the LHS + RHS nodes together at half width.
4553/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4554/// TODO: Add subvector splitting if/when we have a need for it.
4555static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4556 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4557 bool PackHiHalf = false) {
4558 MVT OpVT = LHS.getSimpleValueType();
4559 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4560 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4561 assert(OpVT == RHS.getSimpleValueType() &&
4562 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4563 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4564 "Unexpected PACK operand types");
4565 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4566 "Unexpected PACK result type");
4567
4568 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4569 if (EltSizeInBits == 32) {
4570 SmallVector<int> PackMask;
4571 int Offset = PackHiHalf ? 1 : 0;
4572 int NumElts = VT.getVectorNumElements();
4573 for (int I = 0; I != NumElts; I += 4) {
4574 PackMask.push_back(I + Offset);
4575 PackMask.push_back(I + Offset + 2);
4576 PackMask.push_back(I + Offset + NumElts);
4577 PackMask.push_back(I + Offset + NumElts + 2);
4578 }
4579 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4580 DAG.getBitcast(VT, RHS), PackMask);
4581 }
4582
4583 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4584 if (!PackHiHalf) {
4585 if (UsePackUS &&
4586 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4587 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4588 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4589
4590 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4591 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4592 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4593 }
4594
4595 // Fallback to sign/zero extending the requested half and pack.
4596 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4597 if (UsePackUS) {
4598 if (PackHiHalf) {
4599 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4600 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4601 } else {
4602 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4603 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4604 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4605 }
4606 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4607 }
4608
4609 if (!PackHiHalf) {
4610 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4611 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4612 }
4613 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4614 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4615 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4616}
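// e.g. packing two v2i64 operands into a v4i32 result uses the shuffle mask
// <0,2,4,6> (low halves) or <1,3,5,7> (high halves) over the operands bitcast
// to v4i32, matching the EltSizeInBits == 32 path above.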
4617
4618 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
4619 /// This produces a shuffle where the low element of V2 is swizzled into the
4620 /// zero/undef vector, landing at element Idx.
4621 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4622 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4623 bool IsZero,
4624 const X86Subtarget &Subtarget,
4625 SelectionDAG &DAG) {
4626 MVT VT = V2.getSimpleValueType();
4627 SDValue V1 = IsZero
4628 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4629 int NumElems = VT.getVectorNumElements();
4630 SmallVector<int, 16> MaskVec(NumElems);
4631 for (int i = 0; i != NumElems; ++i)
4632 // If this is the insertion idx, put the low elt of V2 here.
4633 MaskVec[i] = (i == Idx) ? NumElems : i;
4634 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4635}
4636
4637 static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4638 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4639 Ptr.getOpcode() == X86ISD::WrapperRIP)
4640 Ptr = Ptr.getOperand(0);
4641 return dyn_cast<ConstantPoolSDNode>(Ptr);
4642}
4643
4644 static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4645 ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4646 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4647 return nullptr;
4648 return CNode->getConstVal();
4649}
4650
4651 static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4652 if (!Load || !ISD::isNormalLoad(Load))
4653 return nullptr;
4654 return getTargetConstantFromBasePtr(Load->getBasePtr());
4655}
4656
4657 static const Constant *getTargetConstantFromNode(SDValue Op) {
4658 Op = peekThroughBitcasts(Op);
4659 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4660}
4661
4662const Constant *
4663 X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4664 assert(LD && "Unexpected null LoadSDNode");
4665 return getTargetConstantFromNode(LD);
4666}
4667
4668// Extract raw constant bits from constant pools.
4669static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4670 APInt &UndefElts,
4671 SmallVectorImpl<APInt> &EltBits,
4672 bool AllowWholeUndefs = true,
4673 bool AllowPartialUndefs = false) {
4674 assert(EltBits.empty() && "Expected an empty EltBits vector");
4675
4677
4678 EVT VT = Op.getValueType();
4679 unsigned SizeInBits = VT.getSizeInBits();
4680 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
4681 unsigned NumElts = SizeInBits / EltSizeInBits;
4682
4683 // Bitcast a source array of element bits to the target size.
4684 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
4685 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
4686 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
4687 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
4688 "Constant bit sizes don't match");
4689
4690 // Don't split if we don't allow undef bits.
4691 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
4692 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
4693 return false;
4694
4695 // If we're already the right size, don't bother bitcasting.
4696 if (NumSrcElts == NumElts) {
4697 UndefElts = UndefSrcElts;
4698 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
4699 return true;
4700 }
4701
4702 // Extract all the undef/constant element data and pack into single bitsets.
4703 APInt UndefBits(SizeInBits, 0);
4704 APInt MaskBits(SizeInBits, 0);
4705
4706 for (unsigned i = 0; i != NumSrcElts; ++i) {
4707 unsigned BitOffset = i * SrcEltSizeInBits;
4708 if (UndefSrcElts[i])
4709 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
4710 MaskBits.insertBits(SrcEltBits[i], BitOffset);
4711 }
4712
4713 // Split the undef/constant single bitset data into the target elements.
4714 UndefElts = APInt(NumElts, 0);
4715 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
4716
4717 for (unsigned i = 0; i != NumElts; ++i) {
4718 unsigned BitOffset = i * EltSizeInBits;
4719 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
4720
4721 // Only treat an element as UNDEF if all bits are UNDEF.
4722 if (UndefEltBits.isAllOnes()) {
4723 if (!AllowWholeUndefs)
4724 return false;
4725 UndefElts.setBit(i);
4726 continue;
4727 }
4728
4729 // If only some bits are UNDEF then treat them as zero (or bail if not
4730 // supported).
4731 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
4732 return false;
4733
4734 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
4735 }
4736 return true;
4737 };
4738
4739 // Collect constant bits and insert into mask/undef bit masks.
4740 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
4741 unsigned UndefBitIndex) {
4742 if (!Cst)
4743 return false;
4744 if (isa<UndefValue>(Cst)) {
4745 Undefs.setBit(UndefBitIndex);
4746 return true;
4747 }
4748 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
4749 Mask = CInt->getValue();
4750 return true;
4751 }
4752 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
4753 Mask = CFP->getValueAPF().bitcastToAPInt();
4754 return true;
4755 }
4756 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
4757 Type *Ty = CDS->getType();
4758 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
4759 Type *EltTy = CDS->getElementType();
4760 bool IsInteger = EltTy->isIntegerTy();
4761 bool IsFP =
4762 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4763 if (!IsInteger && !IsFP)
4764 return false;
4765 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4766 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4767 if (IsInteger)
4768 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4769 else
4770 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
4771 I * EltBits);
4772 return true;
4773 }
4774 return false;
4775 };
4776
4777 // Handle UNDEFs.
4778 if (Op.isUndef()) {
4779 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
4780 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
4781 return CastBitData(UndefSrcElts, SrcEltBits);
4782 }
4783
4784 // Extract scalar constant bits.
4785 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
4786 APInt UndefSrcElts = APInt::getZero(1);
4787 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
4788 return CastBitData(UndefSrcElts, SrcEltBits);
4789 }
4790 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
4791 APInt UndefSrcElts = APInt::getZero(1);
4792 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
4793 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
4794 return CastBitData(UndefSrcElts, SrcEltBits);
4795 }
4796
4797 // Extract constant bits from build vector.
4798 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
4799 BitVector Undefs;
4800 SmallVector<APInt> SrcEltBits;
4801 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4802 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
4803 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
4804 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
4805 if (Undefs[I])
4806 UndefSrcElts.setBit(I);
4807 return CastBitData(UndefSrcElts, SrcEltBits);
4808 }
4809 }
4810
4811 // Extract constant bits from constant pool vector.
4812 if (auto *Cst = getTargetConstantFromNode(Op)) {
4813 Type *CstTy = Cst->getType();
4814 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4815 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
4816 return false;
4817
4818 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
4819 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4820 if ((SizeInBits % SrcEltSizeInBits) != 0)
4821 return false;
4822
4823 APInt UndefSrcElts(NumSrcElts, 0);
4824 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
4825 for (unsigned i = 0; i != NumSrcElts; ++i)
4826 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
4827 UndefSrcElts, i))
4828 return false;
4829
4830 return CastBitData(UndefSrcElts, SrcEltBits);
4831 }
4832
4833 // Extract constant bits from a broadcasted constant pool scalar.
4834 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
4835 EltSizeInBits <= VT.getScalarSizeInBits()) {
4836 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4837 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
4838 return false;
4839
4840 SDValue Ptr = MemIntr->getBasePtr();
4841 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
4842 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4843 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4844
4845 APInt UndefSrcElts(NumSrcElts, 0);
4846 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
4847 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
4848 if (UndefSrcElts[0])
4849 UndefSrcElts.setBits(0, NumSrcElts);
4850 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
4851 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
4852 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
4853 return CastBitData(UndefSrcElts, SrcEltBits);
4854 }
4855 }
4856 }
4857
4858 // Extract constant bits from a subvector broadcast.
4859 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
4860 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4861 SDValue Ptr = MemIntr->getBasePtr();
4862 // The source constant may be larger than the subvector broadcast,
4863 // ensure we extract the correct subvector constants.
4864 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
4865 Type *CstTy = Cst->getType();
4866 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4867 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
4868 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
4869 (SizeInBits % SubVecSizeInBits) != 0)
4870 return false;
4871 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
4872 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
4873 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
4874 APInt UndefSubElts(NumSubElts, 0);
4875 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
4876 APInt(CstEltSizeInBits, 0));
4877 for (unsigned i = 0; i != NumSubElts; ++i) {
4878 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
4879 UndefSubElts, i))
4880 return false;
4881 for (unsigned j = 1; j != NumSubVecs; ++j)
4882 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
4883 }
4884 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
4885 UndefSubElts);
4886 return CastBitData(UndefSubElts, SubEltBits);
4887 }
4888 }
4889
4890 // Extract a rematerialized scalar constant insertion.
4891 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
4892 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
4893 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
4894 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4895 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4896
4897 APInt UndefSrcElts(NumSrcElts, 0);
4898 SmallVector<APInt, 64> SrcEltBits;
4899 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
4900 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
4901 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
4902 return CastBitData(UndefSrcElts, SrcEltBits);
4903 }
4904
4905 // Insert constant bits from base and sub-vector sources.
4906 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
4907 // If bitcasting to larger elements we might lose track of undefs - to be
4908 // safe, don't allow any.
4909 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4910 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
4911
4912 APInt UndefSrcElts, UndefSubElts;
4913 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
4914 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
4915 UndefSubElts, EltSubBits,
4916 AllowWholeUndefs && AllowUndefs,
4917 AllowPartialUndefs && AllowUndefs) &&
4918 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
4919 UndefSrcElts, EltSrcBits,
4920 AllowWholeUndefs && AllowUndefs,
4921 AllowPartialUndefs && AllowUndefs)) {
4922 unsigned BaseIdx = Op.getConstantOperandVal(2);
4923 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
4924 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
4925 EltSrcBits[BaseIdx + i] = EltSubBits[i];
4926 return CastBitData(UndefSrcElts, EltSrcBits);
4927 }
4928 }
4929
4930 // Extract constant bits from a subvector's source.
4931 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4932 // TODO - support extract_subvector through bitcasts.
4933 if (EltSizeInBits != VT.getScalarSizeInBits())
4934 return false;
4935
4936 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
4937 UndefElts, EltBits, AllowWholeUndefs,
4938 AllowPartialUndefs)) {
4939 EVT SrcVT = Op.getOperand(0).getValueType();
4940 unsigned NumSrcElts = SrcVT.getVectorNumElements();
4941 unsigned NumSubElts = VT.getVectorNumElements();
4942 unsigned BaseIdx = Op.getConstantOperandVal(1);
4943 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
4944 if ((BaseIdx + NumSubElts) != NumSrcElts)
4945 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
4946 if (BaseIdx != 0)
4947 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
4948 return true;
4949 }
4950 }
4951
4952 // Extract constant bits from shuffle node sources.
4953 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
4954 // TODO - support shuffle through bitcasts.
4955 if (EltSizeInBits != VT.getScalarSizeInBits())
4956 return false;
4957
4958 ArrayRef<int> Mask = SVN->getMask();
4959 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
4960 llvm::any_of(Mask, [](int M) { return M < 0; }))
4961 return false;
4962
4963 APInt UndefElts0, UndefElts1;
4964 SmallVector<APInt, 32> EltBits0, EltBits1;
4965 if (isAnyInRange(Mask, 0, NumElts) &&
4966 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
4967 UndefElts0, EltBits0, AllowWholeUndefs,
4968 AllowPartialUndefs))
4969 return false;
4970 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
4971 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
4972 UndefElts1, EltBits1, AllowWholeUndefs,
4973 AllowPartialUndefs))
4974 return false;
4975
4976 UndefElts = APInt::getZero(NumElts);
4977 for (int i = 0; i != (int)NumElts; ++i) {
4978 int M = Mask[i];
4979 if (M < 0) {
4980 UndefElts.setBit(i);
4981 EltBits.push_back(APInt::getZero(EltSizeInBits));
4982 } else if (M < (int)NumElts) {
4983 if (UndefElts0[M])
4984 UndefElts.setBit(i);
4985 EltBits.push_back(EltBits0[M]);
4986 } else {
4987 if (UndefElts1[M - NumElts])
4988 UndefElts.setBit(i);
4989 EltBits.push_back(EltBits1[M - NumElts]);
4990 }
4991 }
4992 return true;
4993 }
4994
4995 return false;
4996}
4997
4998namespace llvm {
4999namespace X86 {
5000bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5001 APInt UndefElts;
5002 SmallVector<APInt, 16> EltBits;
5003 if (getTargetConstantBitsFromNode(
5004 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5005 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5006 int SplatIndex = -1;
5007 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5008 if (UndefElts[i])
5009 continue;
5010 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5011 SplatIndex = -1;
5012 break;
5013 }
5014 SplatIndex = i;
5015 }
5016 if (0 <= SplatIndex) {
5017 SplatVal = EltBits[SplatIndex];
5018 return true;
5019 }
5020 }
5021
5022 return false;
5023}
5024} // namespace X86
5025} // namespace llvm
5026
5027 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5028 unsigned MaskEltSizeInBits,
5029 SmallVectorImpl<uint64_t> &RawMask,
5030 APInt &UndefElts) {
5031 // Extract the raw target constant bits.
5032 SmallVector<APInt, 64> EltBits;
5033 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5034 EltBits, /* AllowWholeUndefs */ true,
5035 /* AllowPartialUndefs */ false))
5036 return false;
5037
5038 // Insert the extracted elements into the mask.
5039 for (const APInt &Elt : EltBits)
5040 RawMask.push_back(Elt.getZExtValue());
5041
5042 return true;
5043}
5044
5045// Match not(xor X, -1) -> X.
5046// Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5047// Match not(extract_subvector(xor X, -1)) -> extract_subvector(X).
5048// Match not(concat_vectors(xor X, -1, xor Y, -1)) -> concat_vectors(X, Y).
5049 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5050 V = peekThroughBitcasts(V);
5051 if (V.getOpcode() == ISD::XOR &&
5052 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5053 isAllOnesConstant(V.getOperand(1))))
5054 return V.getOperand(0);
5055 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5056 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5057 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5058 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5059 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
5060 Not, V.getOperand(1));
5061 }
5062 }
5063 if (V.getOpcode() == X86ISD::PCMPGT &&
5064 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5065 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5066 V.getOperand(0).hasOneUse()) {
5067 APInt UndefElts;
5068 SmallVector<APInt> EltBits;
5069 if (getTargetConstantBitsFromNode(V.getOperand(0),
5070 V.getScalarValueSizeInBits(), UndefElts,
5071 EltBits)) {
5072 // Don't fold min_signed_value -> (min_signed_value - 1)
5073 bool MinSigned = false;
5074 for (APInt &Elt : EltBits) {
5075 MinSigned |= Elt.isMinSignedValue();
5076 Elt -= 1;
5077 }
5078 if (!MinSigned) {
5079 SDLoc DL(V);
5080 MVT VT = V.getSimpleValueType();
5081 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5082 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5083 }
5084 }
5085 }
5086 SmallVector<SDValue, 2> CatOps;
5087 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5088 for (SDValue &CatOp : CatOps) {
5089 SDValue NotCat = IsNOT(CatOp, DAG);
5090 if (!NotCat) return SDValue();
5091 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5092 }
5093 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
5094 }
5095 return SDValue();
5096}
5097
5098/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5099/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5100/// Note: This ignores saturation, so inputs must be checked first.
5101 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5102 bool Unary, unsigned NumStages = 1) {
5103 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5104 unsigned NumElts = VT.getVectorNumElements();
5105 unsigned NumLanes = VT.getSizeInBits() / 128;
5106 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5107 unsigned Offset = Unary ? 0 : NumElts;
5108 unsigned Repetitions = 1u << (NumStages - 1);
5109 unsigned Increment = 1u << NumStages;
5110 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5111
5112 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5113 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5114 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5115 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5116 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5117 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5118 }
5119 }
5120}
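// e.g. a single-stage binary pack to v16i8 (two v8i16 inputs) produces
// <0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30>, i.e. the truncating element
// selection performed by PACKSSWB/PACKUSWB (ignoring saturation).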
5121
5122// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5123static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5124 APInt &DemandedLHS, APInt &DemandedRHS) {
5125 int NumLanes = VT.getSizeInBits() / 128;
5126 int NumElts = DemandedElts.getBitWidth();
5127 int NumInnerElts = NumElts / 2;
5128 int NumEltsPerLane = NumElts / NumLanes;
5129 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5130
5131 DemandedLHS = APInt::getZero(NumInnerElts);
5132 DemandedRHS = APInt::getZero(NumInnerElts);
5133
5134 // Map DemandedElts to the packed operands.
5135 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5136 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5137 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5138 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5139 if (DemandedElts[OuterIdx])
5140 DemandedLHS.setBit(InnerIdx);
5141 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5142 DemandedRHS.setBit(InnerIdx);
5143 }
5144 }
5145}
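// e.g. for a 128-bit v16i8 pack, demanding result element 3 demands LHS
// element 3, while demanding result element 11 demands RHS element 3.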
5146
5147// Split the demanded elts of a HADD/HSUB node between its operands.
5148static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5149 APInt &DemandedLHS, APInt &DemandedRHS) {
5150 int NumLanes = VT.getSizeInBits() / 128;
5151 int NumElts = DemandedElts.getBitWidth();
5152 int NumEltsPerLane = NumElts / NumLanes;
5153 int HalfEltsPerLane = NumEltsPerLane / 2;
5154
5155 DemandedLHS = APInt::getZero(NumElts);
5156 DemandedRHS = APInt::getZero(NumElts);
5157
5158 // Map DemandedElts to the horizontal operands.
5159 for (int Idx = 0; Idx != NumElts; ++Idx) {
5160 if (!DemandedElts[Idx])
5161 continue;
5162 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
5163 int LocalIdx = Idx % NumEltsPerLane;
5164 if (LocalIdx < HalfEltsPerLane) {
5165 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
5166 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
5167 } else {
5168 LocalIdx -= HalfEltsPerLane;
5169 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
5170 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
5171 }
5172 }
5173}
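// e.g. for a v8i32 HADD, demanding result element 0 demands LHS elements 0
// and 1, while demanding result element 2 demands RHS elements 0 and 1 (both
// within lane 0).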
5174
5175/// Calculates the shuffle mask corresponding to the target-specific opcode.
5176/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5177/// operands in \p Ops, and returns true.
5178/// Sets \p IsUnary to true if only one source is used. Note that this will set
5179/// IsUnary for shuffles which use a single input multiple times, and in those
5180/// cases it will adjust the mask to only have indices within that single input.
5181/// It is an error to call this with non-empty Mask/Ops vectors.
5182static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5183 SmallVectorImpl<SDValue> &Ops,
5184 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5185 if (!isTargetShuffle(N.getOpcode()))
5186 return false;
5187
5188 MVT VT = N.getSimpleValueType();
5189 unsigned NumElems = VT.getVectorNumElements();
5190 unsigned MaskEltSize = VT.getScalarSizeInBits();
5191 SmallVector<uint64_t, 32> RawMask;
5192 APInt RawUndefs;
5193 uint64_t ImmN;
5194
5195 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5196 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5197
5198 IsUnary = false;
5199 bool IsFakeUnary = false;
5200 switch (N.getOpcode()) {
5201 case X86ISD::BLENDI:
5202 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5203 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5204 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5205 DecodeBLENDMask(NumElems, ImmN, Mask);
5206 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5207 break;
5208 case X86ISD::SHUFP:
5209 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5210 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5211 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5212 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5213 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5214 break;
5215 case X86ISD::INSERTPS:
5216 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5217 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5218 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5219 DecodeINSERTPSMask(ImmN, Mask);
5220 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5221 break;
5222 case X86ISD::EXTRQI:
5223 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5224 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5225 isa<ConstantSDNode>(N.getOperand(2))) {
5226 int BitLen = N.getConstantOperandVal(1);
5227 int BitIdx = N.getConstantOperandVal(2);
5228 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5229 IsUnary = true;
5230 }
5231 break;
5232 case X86ISD::INSERTQI:
5233 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5234 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5235 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5236 isa<ConstantSDNode>(N.getOperand(3))) {
5237 int BitLen = N.getConstantOperandVal(2);
5238 int BitIdx = N.getConstantOperandVal(3);
5239 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5240 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5241 }
5242 break;
5243 case X86ISD::UNPCKH:
5244 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5245 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5246 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5247 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5248 break;
5249 case X86ISD::UNPCKL:
5250 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5251 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5252 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5253 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5254 break;
5255 case X86ISD::MOVHLPS:
5256 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5257 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5258 DecodeMOVHLPSMask(NumElems, Mask);
5259 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5260 break;
5261 case X86ISD::MOVLHPS:
5262 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5263 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5264 DecodeMOVLHPSMask(NumElems, Mask);
5265 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5266 break;
5267 case X86ISD::VALIGN:
5268 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5269 "Only 32-bit and 64-bit elements are supported!");
5270 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5271 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5272 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5273 DecodeVALIGNMask(NumElems, ImmN, Mask);
5274 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5275 Ops.push_back(N.getOperand(1));
5276 Ops.push_back(N.getOperand(0));
5277 break;
5278 case X86ISD::PALIGNR:
5279 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5280 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5281 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5282 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5283 DecodePALIGNRMask(NumElems, ImmN, Mask);
5284 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5285 Ops.push_back(N.getOperand(1));
5286 Ops.push_back(N.getOperand(0));
5287 break;
5288 case X86ISD::VSHLDQ:
5289 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5290 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5291 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5292 DecodePSLLDQMask(NumElems, ImmN, Mask);
5293 IsUnary = true;
5294 break;
5295 case X86ISD::VSRLDQ:
5296 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5297 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5298 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5299 DecodePSRLDQMask(NumElems, ImmN, Mask);
5300 IsUnary = true;
5301 break;
5302 case X86ISD::PSHUFD:
5303 case X86ISD::VPERMILPI:
5304 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5305 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5306 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5307 IsUnary = true;
5308 break;
5309 case X86ISD::PSHUFHW:
5310 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5311 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5312 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5313 IsUnary = true;
5314 break;
5315 case X86ISD::PSHUFLW:
5316 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5317 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5318 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5319 IsUnary = true;
5320 break;
5321 case X86ISD::VZEXT_MOVL:
5322 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5323 DecodeZeroMoveLowMask(NumElems, Mask);
5324 IsUnary = true;
5325 break;
5326 case X86ISD::VBROADCAST:
5327 // We only decode broadcasts of same-sized vectors; peeking through to
5328 // extracted subvectors is likely to cause hasOneUse issues with
5329 // SimplifyDemandedBits etc.
5330 if (N.getOperand(0).getValueType() == VT) {
5331 DecodeVectorBroadcast(NumElems, Mask);
5332 IsUnary = true;
5333 break;
5334 }
5335 return false;
5336 case X86ISD::VPERMILPV: {
5337 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5338 IsUnary = true;
5339 SDValue MaskNode = N.getOperand(1);
5340 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5341 RawUndefs)) {
5342 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5343 break;
5344 }
5345 return false;
5346 }
5347 case X86ISD::PSHUFB: {
5348 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5349 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5350 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5351 IsUnary = true;
5352 SDValue MaskNode = N.getOperand(1);
5353 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5354 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5355 break;
5356 }
5357 return false;
5358 }
5359 case X86ISD::VPERMI:
5360 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5361 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5362 DecodeVPERMMask(NumElems, ImmN, Mask);
5363 IsUnary = true;
5364 break;
5365 case X86ISD::MOVSS:
5366 case X86ISD::MOVSD:
5367 case X86ISD::MOVSH:
5368 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5369 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5370 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5371 break;
5372 case X86ISD::VPERM2X128:
5373 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5374 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5375 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5376 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5377 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5378 break;
5379 case X86ISD::SHUF128:
5380 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5381 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5382 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5383 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5384 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5385 break;
5386 case X86ISD::MOVSLDUP:
5387 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5388 DecodeMOVSLDUPMask(NumElems, Mask);
5389 IsUnary = true;
5390 break;
5391 case X86ISD::MOVSHDUP:
5392 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5393 DecodeMOVSHDUPMask(NumElems, Mask);
5394 IsUnary = true;
5395 break;
5396 case X86ISD::MOVDDUP:
5397 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5398 DecodeMOVDDUPMask(NumElems, Mask);
5399 IsUnary = true;
5400 break;
5401 case X86ISD::VPERMIL2: {
5402 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5403 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5404 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5405 SDValue MaskNode = N.getOperand(2);
5406 SDValue CtrlNode = N.getOperand(3);
5407 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5408 unsigned CtrlImm = CtrlOp->getZExtValue();
5409 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5410 RawUndefs)) {
5411 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5412 Mask);
5413 break;
5414 }
5415 }
5416 return false;
5417 }
5418 case X86ISD::VPPERM: {
5419 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5420 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5421 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5422 SDValue MaskNode = N.getOperand(2);
5423 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5424 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5425 break;
5426 }
5427 return false;
5428 }
5429 case X86ISD::VPERMV: {
5430 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5431 IsUnary = true;
5432 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5433 Ops.push_back(N.getOperand(1));
5434 SDValue MaskNode = N.getOperand(0);
5435 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5436 RawUndefs)) {
5437 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5438 break;
5439 }
5440 return false;
5441 }
5442 case X86ISD::VPERMV3: {
5443 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5444 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5445 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5446 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5447 Ops.push_back(N.getOperand(0));
5448 Ops.push_back(N.getOperand(2));
5449 SDValue MaskNode = N.getOperand(1);
5450 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5451 RawUndefs)) {
5452 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5453 break;
5454 }
5455 return false;
5456 }
5457 default:
5458 llvm_unreachable("unknown target shuffle node");
5459 }
5460
5461 // Empty mask indicates the decode failed.
5462 if (Mask.empty())
5463 return false;
5464
5465 // Check if we're getting a shuffle mask with zero'd elements.
5466 if (!AllowSentinelZero && isAnyZero(Mask))
5467 return false;
5468
5469 // If we have a fake unary shuffle, the shuffle mask is spread across two
5470 // inputs that are actually the same node. Re-map the mask to always point
5471 // into the first input.
5472 if (IsFakeUnary)
5473 for (int &M : Mask)
5474 if (M >= (int)Mask.size())
5475 M -= Mask.size();
5476
5477 // If we didn't already add operands in the opcode-specific code, default to
5478 // adding 1 or 2 operands starting at 0.
5479 if (Ops.empty()) {
5480 Ops.push_back(N.getOperand(0));
5481 if (!IsUnary || IsFakeUnary)
5482 Ops.push_back(N.getOperand(1));
5483 }
5484
5485 return true;
5486}
5487
5488 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
5489 static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5490 SmallVectorImpl<SDValue> &Ops,
5491 SmallVectorImpl<int> &Mask) {
5492 bool IsUnary;
5493 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5494}
5495
5496/// Compute whether each element of a shuffle is zeroable.
5497///
5498/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5499/// Either it is an undef element in the shuffle mask, the element of the input
5500/// referenced is undef, or the element of the input referenced is known to be
5501/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5502/// as many lanes with this technique as possible to simplify the remaining
5503/// shuffle.
5504 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5505 SDValue V1, SDValue V2,
5506 APInt &KnownUndef, APInt &KnownZero) {
5507 int Size = Mask.size();
5508 KnownUndef = KnownZero = APInt::getZero(Size);
5509
5510 V1 = peekThroughBitcasts(V1);
5511 V2 = peekThroughBitcasts(V2);
5512
5513 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5514 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5515
5516 int VectorSizeInBits = V1.getValueSizeInBits();
5517 int ScalarSizeInBits = VectorSizeInBits / Size;
5518 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5519
5520 for (int i = 0; i < Size; ++i) {
5521 int M = Mask[i];
5522 // Handle the easy cases.
5523 if (M < 0) {
5524 KnownUndef.setBit(i);
5525 continue;
5526 }
5527 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5528 KnownZero.setBit(i);
5529 continue;
5530 }
5531
5532 // Determine shuffle input and normalize the mask.
5533 SDValue V = M < Size ? V1 : V2;
5534 M %= Size;
5535
5536 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5537 if (V.getOpcode() != ISD::BUILD_VECTOR)
5538 continue;
5539
5540 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5541 // the (larger) source element must be UNDEF/ZERO.
5542 if ((Size % V.getNumOperands()) == 0) {
5543 int Scale = Size / V->getNumOperands();
5544 SDValue Op = V.getOperand(M / Scale);
5545 if (Op.isUndef())
5546 KnownUndef.setBit(i);
5547 if (X86::isZeroNode(Op))
5548 KnownZero.setBit(i);
5549 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5550 APInt Val = Cst->getAPIntValue();
5551 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5552 if (Val == 0)
5553 KnownZero.setBit(i);
5554 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5555 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5556 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5557 if (Val == 0)
5558 KnownZero.setBit(i);
5559 }
5560 continue;
5561 }
5562
5563 // If the BUILD_VECTOR has more elements, then all the (smaller) source
5564 // elements must be UNDEF or ZERO.
5565 if ((V.getNumOperands() % Size) == 0) {
5566 int Scale = V->getNumOperands() / Size;
5567 bool AllUndef = true;
5568 bool AllZero = true;
5569 for (int j = 0; j < Scale; ++j) {
5570 SDValue Op = V.getOperand((M * Scale) + j);
5571 AllUndef &= Op.isUndef();
5572 AllZero &= X86::isZeroNode(Op);
5573 }
5574 if (AllUndef)
5575 KnownUndef.setBit(i);
5576 if (AllZero)
5577 KnownZero.setBit(i);
5578 continue;
5579 }
5580 }
5581}
5582
5583/// Decode a target shuffle mask and inputs and see if any values are
5584/// known to be undef or zero from their inputs.
5585/// Returns true if the target shuffle mask was decoded.
5586/// FIXME: Merge this with computeZeroableShuffleElements?
5587 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5588 SmallVectorImpl<SDValue> &Ops,
5589 APInt &KnownUndef, APInt &KnownZero) {
5590 bool IsUnary;
5591 if (!isTargetShuffle(N.getOpcode()))
5592 return false;
5593
5594 MVT VT = N.getSimpleValueType();
5595 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5596 return false;
5597
5598 int Size = Mask.size();
5599 SDValue V1 = Ops[0];
5600 SDValue V2 = IsUnary ? V1 : Ops[1];
5601 KnownUndef = KnownZero = APInt::getZero(Size);
5602
5603 V1 = peekThroughBitcasts(V1);
5604 V2 = peekThroughBitcasts(V2);
5605
5606 assert((VT.getSizeInBits() % Size) == 0 &&
5607 "Illegal split of shuffle value type");
5608 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5609
5610 // Extract known constant input data.
5611 APInt UndefSrcElts[2];
5612 SmallVector<APInt, 32> SrcEltBits[2];
5613 bool IsSrcConstant[2] = {
5614 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5615 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5616 /*AllowPartialUndefs*/ false),
5617 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5618 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5619 /*AllowPartialUndefs*/ false)};
5620
5621 for (int i = 0; i < Size; ++i) {
5622 int M = Mask[i];
5623
5624 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5625 if (M < 0) {
5626 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5627 if (SM_SentinelUndef == M)
5628 KnownUndef.setBit(i);
5629 if (SM_SentinelZero == M)
5630 KnownZero.setBit(i);
5631 continue;
5632 }
5633
5634 // Determine shuffle input and normalize the mask.
5635 unsigned SrcIdx = M / Size;
5636 SDValue V = M < Size ? V1 : V2;
5637 M %= Size;
5638
5639 // We are referencing an UNDEF input.
5640 if (V.isUndef()) {
5641 KnownUndef.setBit(i);
5642 continue;
5643 }
5644
5645 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5646 // TODO: We currently only set UNDEF for integer types - floats use the same
5647 // registers as vectors and many of the scalar folded loads rely on the
5648 // SCALAR_TO_VECTOR pattern.
5649 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5650 (Size % V.getValueType().getVectorNumElements()) == 0) {
5651 int Scale = Size / V.getValueType().getVectorNumElements();
5652 int Idx = M / Scale;
5653 if (Idx != 0 && !VT.isFloatingPoint())
5654 KnownUndef.setBit(i);
5655 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5656 KnownZero.setBit(i);
5657 continue;
5658 }
5659
5660 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
5661 // base vectors.
5662 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
5663 SDValue Vec = V.getOperand(0);
5664 int NumVecElts = Vec.getValueType().getVectorNumElements();
5665 if (Vec.isUndef() && Size == NumVecElts) {
5666 int Idx = V.getConstantOperandVal(2);
5667 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
5668 if (M < Idx || (Idx + NumSubElts) <= M)
5669 KnownUndef.setBit(i);
5670 }
5671 continue;
5672 }
5673
5674 // Attempt to extract from the source's constant bits.
5675 if (IsSrcConstant[SrcIdx]) {
5676 if (UndefSrcElts[SrcIdx][M])
5677 KnownUndef.setBit(i);
5678 else if (SrcEltBits[SrcIdx][M] == 0)
5679 KnownZero.setBit(i);
5680 }
5681 }
5682
5683 assert(VT.getVectorNumElements() == (unsigned)Size &&
5684 "Different mask size from vector size!");
5685 return true;
5686}
5687
5688// Replace target shuffle mask elements with known undef/zero sentinels.
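// Illustrative example (assumed values): Mask = {0, 1, 2, 3} with KnownUndef bit 1
// and KnownZero bit 3 set becomes {0, SM_SentinelUndef, 2, SM_SentinelZero}
// (or {0, SM_SentinelUndef, 2, 3} if ResolveKnownZeros is false).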
5689static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
5690 const APInt &KnownUndef,
5691 const APInt &KnownZero,
5692 bool ResolveKnownZeros = true) {
5693 unsigned NumElts = Mask.size();
5694 assert(KnownUndef.getBitWidth() == NumElts &&
5695 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
5696
5697 for (unsigned i = 0; i != NumElts; ++i) {
5698 if (KnownUndef[i])
5699 Mask[i] = SM_SentinelUndef;
5700 else if (ResolveKnownZeros && KnownZero[i])
5701 Mask[i] = SM_SentinelZero;
5702 }
5703}
5704
5705// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
5706static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
5707 APInt &KnownUndef,
5708 APInt &KnownZero) {
5709 unsigned NumElts = Mask.size();
5710 KnownUndef = KnownZero = APInt::getZero(NumElts);
5711
5712 for (unsigned i = 0; i != NumElts; ++i) {
5713 int M = Mask[i];
5714 if (SM_SentinelUndef == M)
5715 KnownUndef.setBit(i);
5716 if (SM_SentinelZero == M)
5717 KnownZero.setBit(i);
5718 }
5719}
5720
5721// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
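// Illustrative example (assumed constant): a v4i32 VSELECT with condition
// <-1, 0, -1, 0> yields Mask = {0, 5, 2, 7}; elements whose condition is all-zero
// are taken from the second operand (index + NumElts).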
5722static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
5723 SDValue Cond, bool IsBLENDV = false) {
5724 EVT CondVT = Cond.getValueType();
5725 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
5726 unsigned NumElts = CondVT.getVectorNumElements();
5727
5728 APInt UndefElts;
5729 SmallVector<APInt, 32> EltBits;
5730 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
5731 /*AllowWholeUndefs*/ true,
5732 /*AllowPartialUndefs*/ false))
5733 return false;
5734
5735 Mask.resize(NumElts, SM_SentinelUndef);
5736
5737 for (int i = 0; i != (int)NumElts; ++i) {
5738 Mask[i] = i;
5739 // Arbitrarily choose from the 2nd operand if the select condition element
5740 // is undef.
5741 // TODO: Can we do better by matching patterns such as even/odd?
5742 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
5743 (IsBLENDV && EltBits[i].isNonNegative()))
5744 Mask[i] += NumElts;
5745 }
5746
5747 return true;
5748}
5749
5750// Forward declaration (for getFauxShuffleMask recursive check).
5751static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
5752 SmallVectorImpl<SDValue> &Inputs,
5753 SmallVectorImpl<int> &Mask,
5754 const SelectionDAG &DAG, unsigned Depth,
5755 bool ResolveKnownElts);
5756
5757// Attempt to decode ops that could be represented as a shuffle mask.
5758// The decoded shuffle mask may contain a different number of elements to the
5759// destination value type.
5760// TODO: Merge into getTargetShuffleInputs()
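// Illustrative example (assumed constants): AND(v16i8 X, <0x00,0xFF,0xFF,0x00,...>)
// decodes to Ops = {X} with the per-byte mask
// {SM_SentinelZero, 1, 2, SM_SentinelZero, ...}.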
5761static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
5762 SmallVectorImpl<int> &Mask,
5763 SmallVectorImpl<SDValue> &Ops,
5764 const SelectionDAG &DAG, unsigned Depth,
5765 bool ResolveKnownElts) {
5766 Mask.clear();
5767 Ops.clear();
5768
5769 MVT VT = N.getSimpleValueType();
5770 unsigned NumElts = VT.getVectorNumElements();
5771 unsigned NumSizeInBits = VT.getSizeInBits();
5772 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5773 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
5774 return false;
5775 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
5776 unsigned NumSizeInBytes = NumSizeInBits / 8;
5777 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5778
5779 unsigned Opcode = N.getOpcode();
5780 switch (Opcode) {
5781 case ISD::VECTOR_SHUFFLE: {
5782 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
5783 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
5784 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
5785 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
5786 Ops.push_back(N.getOperand(0));
5787 Ops.push_back(N.getOperand(1));
5788 return true;
5789 }
5790 return false;
5791 }
5792 case ISD::AND:
5793 case X86ISD::ANDNP: {
5794 // Attempt to decode as a per-byte mask.
5795 APInt UndefElts;
5796 SmallVector<APInt, 32> EltBits;
5797 SDValue N0 = N.getOperand(0);
5798 SDValue N1 = N.getOperand(1);
5799 bool IsAndN = (X86ISD::ANDNP == Opcode);
5800 uint64_t ZeroMask = IsAndN ? 255 : 0;
5801 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
5802 /*AllowWholeUndefs*/ false,
5803 /*AllowPartialUndefs*/ false))
5804 return false;
5805 // We can't assume an undef src element gives an undef dst - the other src
5806 // might be zero.
5807 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
5808 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5809 const APInt &ByteBits = EltBits[i];
5810 if (ByteBits != 0 && ByteBits != 255)
5811 return false;
5812 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5813 }
5814 Ops.push_back(IsAndN ? N1 : N0);
5815 return true;
5816 }
5817 case ISD::OR: {
5818 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
5819 // is a valid shuffle index.
5820 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
5821 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
5822 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
5823 return false;
5824
5825 SmallVector<int, 64> SrcMask0, SrcMask1;
5826 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
5827 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
5828 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
5829 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
5830 Depth + 1, true) ||
5831 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
5832 Depth + 1, true))
5833 return false;
5834
5835 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
5836 SmallVector<int, 64> Mask0, Mask1;
5837 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
5838 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
5839 for (int i = 0; i != (int)MaskSize; ++i) {
5840 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
5841 // loops converting between OR and BLEND shuffles due to
5842 // canWidenShuffleElements merging away undef elements, meaning we
5843 // fail to recognise the OR as the undef element isn't known zero.
5844 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
5845 Mask.push_back(SM_SentinelZero);
5846 else if (Mask1[i] == SM_SentinelZero)
5847 Mask.push_back(i);
5848 else if (Mask0[i] == SM_SentinelZero)
5849 Mask.push_back(i + MaskSize);
5850 else
5851 return false;
5852 }
5853 Ops.push_back(N0);
5854 Ops.push_back(N1);
5855 return true;
5856 }
5857 case ISD::INSERT_SUBVECTOR: {
5858 SDValue Src = N.getOperand(0);
5859 SDValue Sub = N.getOperand(1);
5860 EVT SubVT = Sub.getValueType();
5861 unsigned NumSubElts = SubVT.getVectorNumElements();
5862 if (!N->isOnlyUserOf(Sub.getNode()))
5863 return false;
5864 SDValue SubBC = peekThroughBitcasts(Sub);
5865 uint64_t InsertIdx = N.getConstantOperandVal(2);
5866 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
5867 if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5868 SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
5869 uint64_t ExtractIdx = SubBC.getConstantOperandVal(1);
5870 SDValue SubBCSrc = SubBC.getOperand(0);
5871 unsigned NumSubSrcBCElts = SubBCSrc.getValueType().getVectorNumElements();
5872 unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts);
5873 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 &&
5874 "Subvector valuetype mismatch");
5875 InsertIdx *= (MaxElts / NumElts);
5876 ExtractIdx *= (MaxElts / NumSubSrcBCElts);
5877 NumSubElts *= (MaxElts / NumElts);
5878 bool SrcIsUndef = Src.isUndef();
5879 for (int i = 0; i != (int)MaxElts; ++i)
5880 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
5881 for (int i = 0; i != (int)NumSubElts; ++i)
5882 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
5883 if (!SrcIsUndef)
5884 Ops.push_back(Src);
5885 Ops.push_back(SubBCSrc);
5886 return true;
5887 }
5888 // Handle CONCAT(SUB0, SUB1).
5889 // Limit this to vXi64 512-bit vector cases to make the most of AVX512
5890 // cross lane shuffles.
5891 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
5892 NumBitsPerElt == 64 && NumSizeInBits == 512 &&
5893 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
5894 Src.getOperand(0).isUndef() &&
5895 Src.getOperand(1).getValueType() == SubVT &&
5896 Src.getConstantOperandVal(2) == 0) {
5897 for (int i = 0; i != (int)NumSubElts; ++i)
5898 Mask.push_back(i);
5899 for (int i = 0; i != (int)NumSubElts; ++i)
5900 Mask.push_back(i + NumElts);
5901 Ops.push_back(Src.getOperand(1));
5902 Ops.push_back(Sub);
5903 return true;
5904 }
5905 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
5906 SmallVector<int, 64> SubMask;
5907 SmallVector<SDValue, 2> SubInputs;
5908 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
5909 EVT SubSrcVT = SubSrc.getValueType();
5910 if (!SubSrcVT.isVector())
5911 return false;
5912
5913 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
5914 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
5915 Depth + 1, ResolveKnownElts))
5916 return false;
5917
5918 // Subvector shuffle inputs must not be larger than the subvector.
5919 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
5920 return SubVT.getFixedSizeInBits() <
5921 SubInput.getValueSizeInBits().getFixedValue();
5922 }))
5923 return false;
5924
5925 if (SubMask.size() != NumSubElts) {
5926 assert(((SubMask.size() % NumSubElts) == 0 ||
5927 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
5928 if ((NumSubElts % SubMask.size()) == 0) {
5929 int Scale = NumSubElts / SubMask.size();
5930 SmallVector<int,64> ScaledSubMask;
5931 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
5932 SubMask = ScaledSubMask;
5933 } else {
5934 int Scale = SubMask.size() / NumSubElts;
5935 NumSubElts = SubMask.size();
5936 NumElts *= Scale;
5937 InsertIdx *= Scale;
5938 }
5939 }
5940 Ops.push_back(Src);
5941 Ops.append(SubInputs.begin(), SubInputs.end());
5942 if (ISD::isBuildVectorAllZeros(Src.getNode()))
5943 Mask.append(NumElts, SM_SentinelZero);
5944 else
5945 for (int i = 0; i != (int)NumElts; ++i)
5946 Mask.push_back(i);
5947 for (int i = 0; i != (int)NumSubElts; ++i) {
5948 int M = SubMask[i];
5949 if (0 <= M) {
5950 int InputIdx = M / NumSubElts;
5951 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
5952 }
5953 Mask[i + InsertIdx] = M;
5954 }
5955 return true;
5956 }
5957 case X86ISD::PINSRB:
5958 case X86ISD::PINSRW:
5959 case ISD::SCALAR_TO_VECTOR:
5960 case ISD::INSERT_VECTOR_ELT: {
5961 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
5962 // vector, for matching src/dst vector types.
5963 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
5964
5965 unsigned DstIdx = 0;
5966 if (Opcode != ISD::SCALAR_TO_VECTOR) {
5967 // Check we have an in-range constant insertion index.
5968 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
5969 N.getConstantOperandAPInt(2).uge(NumElts))
5970 return false;
5971 DstIdx = N.getConstantOperandVal(2);
5972
5973 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
5974 if (X86::isZeroNode(Scl)) {
5975 Ops.push_back(N.getOperand(0));
5976 for (unsigned i = 0; i != NumElts; ++i)
5977 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
5978 return true;
5979 }
5980 }
5981
5982 // Peek through trunc/aext/zext/bitcast.
5983 // TODO: aext shouldn't require SM_SentinelZero padding.
5984 // TODO: handle shift of scalars.
5985 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
5986 while (Scl.getOpcode() == ISD::TRUNCATE ||
5987 Scl.getOpcode() == ISD::ANY_EXTEND ||
5988 Scl.getOpcode() == ISD::ZERO_EXTEND ||
5989 (Scl.getOpcode() == ISD::BITCAST &&
5992 Scl = Scl.getOperand(0);
5993 MinBitsPerElt =
5994 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
5995 }
5996 if ((MinBitsPerElt % 8) != 0)
5997 return false;
5998
5999 // Attempt to find the source vector the scalar was extracted from.
6000 SDValue SrcExtract;
6001 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6002 Scl.getOpcode() == X86ISD::PEXTRW ||
6003 Scl.getOpcode() == X86ISD::PEXTRB) &&
6004 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6005 SrcExtract = Scl;
6006 }
6007 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6008 return false;
6009
6010 SDValue SrcVec = SrcExtract.getOperand(0);
6011 EVT SrcVT = SrcVec.getValueType();
6012 if (!SrcVT.getScalarType().isByteSized())
6013 return false;
6014 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6015 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6016 unsigned DstByte = DstIdx * NumBytesPerElt;
6017 MinBitsPerElt =
6018 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6019
6020 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6021 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6022 Ops.push_back(SrcVec);
6023 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6024 } else {
6025 Ops.push_back(SrcVec);
6026 Ops.push_back(N.getOperand(0));
6027 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6028 Mask.push_back(NumSizeInBytes + i);
6029 }
6030
6031 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6032 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6033 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6034 Mask[DstByte + i] = SrcByte + i;
6035 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6036 Mask[DstByte + i] = SM_SentinelZero;
6037 return true;
6038 }
6039 case X86ISD::PACKSS:
6040 case X86ISD::PACKUS: {
6041 SDValue N0 = N.getOperand(0);
6042 SDValue N1 = N.getOperand(1);
6043 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6044 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6045 "Unexpected input value type");
6046
6047 APInt EltsLHS, EltsRHS;
6048 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6049
6050 // If we know input saturation won't happen (or we don't care for particular
6051 // lanes), we can treat this as a truncation shuffle.
6052 bool Offset0 = false, Offset1 = false;
6053 if (Opcode == X86ISD::PACKSS) {
6054 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6055 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6056 (!(N1.isUndef() || EltsRHS.isZero()) &&
6057 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6058 return false;
6059 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6060 // PACKSS then it was likely being used for sign-extension for a
6061 // truncation, so just peek through and adjust the mask accordingly.
6062 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6063 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6064 Offset0 = true;
6065 N0 = N0.getOperand(0);
6066 }
6067 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6068 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6069 Offset1 = true;
6070 N1 = N1.getOperand(0);
6071 }
6072 } else {
6073 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6074 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6075 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6076 (!(N1.isUndef() || EltsRHS.isZero()) &&
6077 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6078 return false;
6079 }
6080
6081 bool IsUnary = (N0 == N1);
6082
6083 Ops.push_back(N0);
6084 if (!IsUnary)
6085 Ops.push_back(N1);
6086
6087 createPackShuffleMask(VT, Mask, IsUnary);
6088
6089 if (Offset0 || Offset1) {
6090 for (int &M : Mask)
6091 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6092 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6093 ++M;
6094 }
6095 return true;
6096 }
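  // Illustrative example for the PACKSS/PACKUS decode above (assumed types): a
  // non-saturating v8i16 PACKSSDW of (A, B) is treated as the truncation shuffle
  // {0, 2, 4, 6, 8, 10, 12, 14} over the v8i16 views of A and B.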
6097 case ISD::VSELECT:
6098 case X86ISD::BLENDV: {
6099 SDValue Cond = N.getOperand(0);
6100 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6101 Ops.push_back(N.getOperand(1));
6102 Ops.push_back(N.getOperand(2));
6103 return true;
6104 }
6105 return false;
6106 }
6107 case X86ISD::VTRUNC: {
6108 SDValue Src = N.getOperand(0);
6109 EVT SrcVT = Src.getValueType();
6110 // Truncated source must be a simple vector.
6111 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6112 (SrcVT.getScalarSizeInBits() % 8) != 0)
6113 return false;
6114 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6115 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6116 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6117 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6118 for (unsigned i = 0; i != NumSrcElts; ++i)
6119 Mask.push_back(i * Scale);
6120 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6121 Ops.push_back(Src);
6122 return true;
6123 }
6124 case X86ISD::VSHLI:
6125 case X86ISD::VSRLI: {
6126 uint64_t ShiftVal = N.getConstantOperandVal(1);
6127 // Out of range bit shifts are guaranteed to be zero.
6128 if (NumBitsPerElt <= ShiftVal) {
6129 Mask.append(NumElts, SM_SentinelZero);
6130 return true;
6131 }
6132
6133 // We can only decode 'whole byte' bit shifts as shuffles.
6134 if ((ShiftVal % 8) != 0)
6135 break;
6136
6137 uint64_t ByteShift = ShiftVal / 8;
6138 Ops.push_back(N.getOperand(0));
6139
6140 // Clear mask to all zeros and insert the shifted byte indices.
6141 Mask.append(NumSizeInBytes, SM_SentinelZero);
6142
6143 if (X86ISD::VSHLI == Opcode) {
6144 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6145 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6146 Mask[i + j] = i + j - ByteShift;
6147 } else {
6148 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6149 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6150 Mask[i + j - ByteShift] = i + j;
6151 }
6152 return true;
6153 }
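  // Illustrative example of the byte-shift decode above (assumed operands):
  // VSHLI v2i64 X, 8 becomes the byte shuffle {Z, 0, 1, 2, 3, 4, 5, 6,
  // Z, 8, 9, 10, 11, 12, 13, 14} where Z is SM_SentinelZero.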
6154 case X86ISD::VROTLI:
6155 case X86ISD::VROTRI: {
6156 // We can only decode 'whole byte' bit rotates as shuffles.
6157 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6158 if ((RotateVal % 8) != 0)
6159 return false;
6160 Ops.push_back(N.getOperand(0));
6161 int Offset = RotateVal / 8;
6162 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6163 for (int i = 0; i != (int)NumElts; ++i) {
6164 int BaseIdx = i * NumBytesPerElt;
6165 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6166 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6167 }
6168 }
6169 return true;
6170 }
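  // Illustrative example of the byte-rotate decode above (assumed operands):
  // VROTLI v4i32 X, 8 produces the per-element byte mask {3, 0, 1, 2}, repeated
  // for each 32-bit element.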
6171 case X86ISD::VBROADCAST: {
6172 SDValue Src = N.getOperand(0);
6173 if (!Src.getSimpleValueType().isVector()) {
6174 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6175 !isNullConstant(Src.getOperand(1)) ||
6176 Src.getOperand(0).getValueType().getScalarType() !=
6177 VT.getScalarType())
6178 return false;
6179 Src = Src.getOperand(0);
6180 }
6181 Ops.push_back(Src);
6182 Mask.append(NumElts, 0);
6183 return true;
6184 }
6185 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6186 SDValue Src = N.getOperand(0);
6187 EVT SrcVT = Src.getValueType();
6188 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6189
6190 // Extended source must be a simple vector.
6191 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6192 (NumBitsPerSrcElt % 8) != 0)
6193 return false;
6194
6195 // We can only handle all-signbits extensions.
6196 APInt DemandedSrcElts =
6197 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6198 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6199 return false;
6200
6201 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6202 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6203 for (unsigned I = 0; I != NumElts; ++I)
6204 Mask.append(Scale, I);
6205 Ops.push_back(Src);
6206 return true;
6207 }
6208 case ISD::ZERO_EXTEND:
6209 case ISD::ANY_EXTEND:
6210 case ISD::ZERO_EXTEND_VECTOR_INREG:
6211 case ISD::ANY_EXTEND_VECTOR_INREG: {
6212 SDValue Src = N.getOperand(0);
6213 EVT SrcVT = Src.getValueType();
6214
6215 // Extended source must be a simple vector.
6216 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6217 (SrcVT.getScalarSizeInBits() % 8) != 0)
6218 return false;
6219
6220 bool IsAnyExtend =
6221 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6222 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6223 IsAnyExtend, Mask);
6224 Ops.push_back(Src);
6225 return true;
6226 }
6227 }
6228
6229 return false;
6230}
6231
6232/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
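/// Illustrative example (assumed inputs): with MaskWidth = 4, Inputs = {A, A} and
/// Mask = {4, 5, 0, 1} resolve to Inputs = {A}, Mask = {0, 1, 0, 1}; an input not
/// referenced by any mask element is dropped and higher indices shift down.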
6233static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6234 SmallVectorImpl<int> &Mask) {
6235 int MaskWidth = Mask.size();
6236 SmallVector<SDValue, 16> UsedInputs;
6237 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6238 int lo = UsedInputs.size() * MaskWidth;
6239 int hi = lo + MaskWidth;
6240
6241 // Strip UNDEF input usage.
6242 if (Inputs[i].isUndef())
6243 for (int &M : Mask)
6244 if ((lo <= M) && (M < hi))
6245 M = SM_SentinelUndef;
6246
6247 // Check for unused inputs.
6248 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6249 for (int &M : Mask)
6250 if (lo <= M)
6251 M -= MaskWidth;
6252 continue;
6253 }
6254
6255 // Check for repeated inputs.
6256 bool IsRepeat = false;
6257 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6258 if (UsedInputs[j] != Inputs[i])
6259 continue;
6260 for (int &M : Mask)
6261 if (lo <= M)
6262 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6263 IsRepeat = true;
6264 break;
6265 }
6266 if (IsRepeat)
6267 continue;
6268
6269 UsedInputs.push_back(Inputs[i]);
6270 }
6271 Inputs = UsedInputs;
6272}
6273
6274/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6275/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6276/// Returns true if the target shuffle mask was decoded.
6277static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6278 SmallVectorImpl<SDValue> &Inputs,
6279 SmallVectorImpl<int> &Mask,
6280 APInt &KnownUndef, APInt &KnownZero,
6281 const SelectionDAG &DAG, unsigned Depth,
6282 bool ResolveKnownElts) {
6283 if (Depth >= SelectionDAG::MaxRecursionDepth)
6284 return false; // Limit search depth.
6285
6286 EVT VT = Op.getValueType();
6287 if (!VT.isSimple() || !VT.isVector())
6288 return false;
6289
6290 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6291 if (ResolveKnownElts)
6292 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6293 return true;
6294 }
6295 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6296 ResolveKnownElts)) {
6297 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6298 return true;
6299 }
6300 return false;
6301}
6302
6303static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6304 SmallVectorImpl<SDValue> &Inputs,
6305 SmallVectorImpl<int> &Mask,
6306 const SelectionDAG &DAG, unsigned Depth,
6307 bool ResolveKnownElts) {
6308 APInt KnownUndef, KnownZero;
6309 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6310 KnownZero, DAG, Depth, ResolveKnownElts);
6311}
6312
6313static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6314 SmallVectorImpl<int> &Mask,
6315 const SelectionDAG &DAG, unsigned Depth = 0,
6316 bool ResolveKnownElts = true) {
6317 EVT VT = Op.getValueType();
6318 if (!VT.isSimple() || !VT.isVector())
6319 return false;
6320
6321 unsigned NumElts = Op.getValueType().getVectorNumElements();
6322 APInt DemandedElts = APInt::getAllOnes(NumElts);
6323 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6324 ResolveKnownElts);
6325}
6326
6327// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6328static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6329 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6330 SelectionDAG &DAG) {
6331 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6332 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6333 "Unknown broadcast load type");
6334
6335 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6336 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6337 return SDValue();
6338
6339 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6340 TypeSize::getFixed(Offset), DL);
6341 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6342 SDValue Ops[] = {Mem->getChain(), Ptr};
6343 SDValue BcstLd = DAG.getMemIntrinsicNode(
6344 Opcode, DL, Tys, Ops, MemVT,
6345 DAG.getMachineFunction().getMachineMemOperand(
6346 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6347 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6348 return BcstLd;
6349}
6350
6351/// Returns the scalar element that will make up the i'th
6352/// element of the result of the vector shuffle.
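/// Illustrative example (assumed node): for shufflevector <4 x i32> A, B,
/// <2, 6, 3, 7> and Index = 1, the mask element is 6, so the search recurses into
/// element 2 of B.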
6353static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6354 SelectionDAG &DAG, unsigned Depth) {
6355 if (Depth >= SelectionDAG::MaxRecursionDepth)
6356 return SDValue(); // Limit search depth.
6357
6358 EVT VT = Op.getValueType();
6359 unsigned Opcode = Op.getOpcode();
6360 unsigned NumElems = VT.getVectorNumElements();
6361
6362 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6363 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6364 int Elt = SV->getMaskElt(Index);
6365
6366 if (Elt < 0)
6367 return DAG.getUNDEF(VT.getVectorElementType());
6368
6369 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6370 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6371 }
6372
6373 // Recurse into target specific vector shuffles to find scalars.
6374 if (isTargetShuffle(Opcode)) {
6375 MVT ShufVT = VT.getSimpleVT();
6376 MVT ShufSVT = ShufVT.getVectorElementType();
6377 int NumElems = (int)ShufVT.getVectorNumElements();
6378 SmallVector<int, 16> ShuffleMask;
6379 SmallVector<SDValue, 16> ShuffleOps;
6380 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6381 return SDValue();
6382
6383 int Elt = ShuffleMask[Index];
6384 if (Elt == SM_SentinelZero)
6385 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6386 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6387 if (Elt == SM_SentinelUndef)
6388 return DAG.getUNDEF(ShufSVT);
6389
6390 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6391 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6392 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6393 }
6394
6395 // Recurse into insert_subvector base/sub vector to find scalars.
6396 if (Opcode == ISD::INSERT_SUBVECTOR) {
6397 SDValue Vec = Op.getOperand(0);
6398 SDValue Sub = Op.getOperand(1);
6399 uint64_t SubIdx = Op.getConstantOperandVal(2);
6400 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6401
6402 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6403 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6404 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6405 }
6406
6407 // Recurse into concat_vectors sub vector to find scalars.
6408 if (Opcode == ISD::CONCAT_VECTORS) {
6409 EVT SubVT = Op.getOperand(0).getValueType();
6410 unsigned NumSubElts = SubVT.getVectorNumElements();
6411 uint64_t SubIdx = Index / NumSubElts;
6412 uint64_t SubElt = Index % NumSubElts;
6413 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6414 }
6415
6416 // Recurse into extract_subvector src vector to find scalars.
6417 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6418 SDValue Src = Op.getOperand(0);
6419 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6420 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6421 }
6422
6423 // We only peek through bitcasts of the same vector width.
6424 if (Opcode == ISD::BITCAST) {
6425 SDValue Src = Op.getOperand(0);
6426 EVT SrcVT = Src.getValueType();
6427 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6428 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6429 return SDValue();
6430 }
6431
6432 // Actual nodes that may contain scalar elements
6433
6434 // For insert_vector_elt - either return the index matching scalar or recurse
6435 // into the base vector.
6436 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6437 isa<ConstantSDNode>(Op.getOperand(2))) {
6438 if (Op.getConstantOperandAPInt(2) == Index)
6439 return Op.getOperand(1);
6440 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6441 }
6442
6443 if (Opcode == ISD::SCALAR_TO_VECTOR)
6444 return (Index == 0) ? Op.getOperand(0)
6445 : DAG.getUNDEF(VT.getVectorElementType());
6446
6447 if (Opcode == ISD::BUILD_VECTOR)
6448 return Op.getOperand(Index);
6449
6450 return SDValue();
6451}
6452
6453// Use PINSRB/PINSRW/PINSRD to create a build vector.
6454static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6455 const APInt &NonZeroMask,
6456 unsigned NumNonZero, unsigned NumZero,
6457 SelectionDAG &DAG,
6458 const X86Subtarget &Subtarget) {
6459 MVT VT = Op.getSimpleValueType();
6460 unsigned NumElts = VT.getVectorNumElements();
6461 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6462 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6463 "Illegal vector insertion");
6464
6465 SDValue V;
6466 bool First = true;
6467
6468 for (unsigned i = 0; i < NumElts; ++i) {
6469 bool IsNonZero = NonZeroMask[i];
6470 if (!IsNonZero)
6471 continue;
6472
6473 // If the build vector contains zeros or our first insertion is not the
6474 // first index, then insert into a zero vector to break any register
6475 // dependency; otherwise use SCALAR_TO_VECTOR.
6476 if (First) {
6477 First = false;
6478 if (NumZero || 0 != i)
6479 V = getZeroVector(VT, Subtarget, DAG, DL);
6480 else {
6481 assert(0 == i && "Expected insertion into zero-index");
6482 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6483 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6484 V = DAG.getBitcast(VT, V);
6485 continue;
6486 }
6487 }
6488 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6489 DAG.getIntPtrConstant(i, DL));
6490 }
6491
6492 return V;
6493}
6494
6495/// Custom lower build_vector of v16i8.
6496static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6497 const APInt &NonZeroMask,
6498 unsigned NumNonZero, unsigned NumZero,
6499 SelectionDAG &DAG,
6500 const X86Subtarget &Subtarget) {
6501 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6502 return SDValue();
6503
6504 // SSE4.1 - use PINSRB to insert each byte directly.
6505 if (Subtarget.hasSSE41())
6506 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6507 DAG, Subtarget);
6508
6509 SDValue V;
6510
6511 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6512 // If both the lowest 16-bits are non-zero, then convert to MOVD.
6513 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6514 !NonZeroMask.extractBits(2, 2).isZero()) {
6515 for (unsigned I = 0; I != 4; ++I) {
6516 if (!NonZeroMask[I])
6517 continue;
6518 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6519 if (I != 0)
6520 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6521 DAG.getConstant(I * 8, DL, MVT::i8));
6522 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6523 }
6524 assert(V && "Failed to fold v16i8 vector to zero");
6525 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6526 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6527 V = DAG.getBitcast(MVT::v8i16, V);
6528 }
6529 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6530 bool ThisIsNonZero = NonZeroMask[i];
6531 bool NextIsNonZero = NonZeroMask[i + 1];
6532 if (!ThisIsNonZero && !NextIsNonZero)
6533 continue;
6534
6535 SDValue Elt;
6536 if (ThisIsNonZero) {
6537 if (NumZero || NextIsNonZero)
6538 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6539 else
6540 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6541 }
6542
6543 if (NextIsNonZero) {
6544 SDValue NextElt = Op.getOperand(i + 1);
6545 if (i == 0 && NumZero)
6546 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6547 else
6548 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6549 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6550 DAG.getConstant(8, DL, MVT::i8));
6551 if (ThisIsNonZero)
6552 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6553 else
6554 Elt = NextElt;
6555 }
6556
6557 // If our first insertion is not the first index or zeros are needed, then
6558 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6559 // elements undefined).
6560 if (!V) {
6561 if (i != 0 || NumZero)
6562 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6563 else {
6564 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6565 V = DAG.getBitcast(MVT::v8i16, V);
6566 continue;
6567 }
6568 }
6569 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6570 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6571 DAG.getIntPtrConstant(i / 2, DL));
6572 }
6573
6574 return DAG.getBitcast(MVT::v16i8, V);
6575}
6576
6577/// Custom lower build_vector of v8i16.
6578static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6579 const APInt &NonZeroMask,
6580 unsigned NumNonZero, unsigned NumZero,
6581 SelectionDAG &DAG,
6582 const X86Subtarget &Subtarget) {
6583 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6584 return SDValue();
6585
6586 // Use PINSRW to insert each byte directly.
6587 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6588 Subtarget);
6589}
6590
6591/// Custom lower build_vector of v4i32 or v4f32.
6592static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
6593 SelectionDAG &DAG,
6594 const X86Subtarget &Subtarget) {
6595 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6596 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6597 // Because we're creating a less complicated build vector here, we may enable
6598 // further folding of the MOVDDUP via shuffle transforms.
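  // Illustrative example (assumed operands): build_vector <a, b, a, b> becomes
  // bitcast(MOVDDUP(bitcast<v2f64>(build_vector <a, b, undef, undef>))).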
6599 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6600 Op.getOperand(0) == Op.getOperand(2) &&
6601 Op.getOperand(1) == Op.getOperand(3) &&
6602 Op.getOperand(0) != Op.getOperand(1)) {
6603 MVT VT = Op.getSimpleValueType();
6604 MVT EltVT = VT.getVectorElementType();
6605 // Create a new build vector with the first 2 elements followed by undef
6606 // padding, bitcast to v2f64, duplicate, and bitcast back.
6607 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
6608 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
6609 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
6610 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
6611 return DAG.getBitcast(VT, Dup);
6612 }
6613
6614 // Find all zeroable elements.
6615 std::bitset<4> Zeroable, Undefs;
6616 for (int i = 0; i < 4; ++i) {
6617 SDValue Elt = Op.getOperand(i);
6618 Undefs[i] = Elt.isUndef();
6619 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6620 }
6621 assert(Zeroable.size() - Zeroable.count() > 1 &&
6622 "We expect at least two non-zero elements!");
6623
6624 // We only know how to deal with build_vector nodes where elements are either
6625 // zeroable or extract_vector_elt with constant index.
6626 SDValue FirstNonZero;
6627 unsigned FirstNonZeroIdx;
6628 for (unsigned i = 0; i < 4; ++i) {
6629 if (Zeroable[i])
6630 continue;
6631 SDValue Elt = Op.getOperand(i);
6632 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6633 !isa<ConstantSDNode>(Elt.getOperand(1)))
6634 return SDValue();
6635 // Make sure that this node is extracting from a 128-bit vector.
6636 MVT VT = Elt.getOperand(0).getSimpleValueType();
6637 if (!VT.is128BitVector())
6638 return SDValue();
6639 if (!FirstNonZero.getNode()) {
6640 FirstNonZero = Elt;
6641 FirstNonZeroIdx = i;
6642 }
6643 }
6644
6645 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6646 SDValue V1 = FirstNonZero.getOperand(0);
6647 MVT VT = V1.getSimpleValueType();
6648
6649 // See if this build_vector can be lowered as a blend with zero.
6650 SDValue Elt;
6651 unsigned EltMaskIdx, EltIdx;
6652 int Mask[4];
6653 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6654 if (Zeroable[EltIdx]) {
6655 // The zero vector will be on the right hand side.
6656 Mask[EltIdx] = EltIdx+4;
6657 continue;
6658 }
6659
6660 Elt = Op->getOperand(EltIdx);
6661 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6662 EltMaskIdx = Elt.getConstantOperandVal(1);
6663 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6664 break;
6665 Mask[EltIdx] = EltIdx;
6666 }
6667
6668 if (EltIdx == 4) {
6669 // Let the shuffle legalizer deal with blend operations.
6670 SDValue VZeroOrUndef = (Zeroable == Undefs)
6671 ? DAG.getUNDEF(VT)
6672 : getZeroVector(VT, Subtarget, DAG, DL);
6673 if (V1.getSimpleValueType() != VT)
6674 V1 = DAG.getBitcast(VT, V1);
6675 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
6676 }
6677
6678 // See if we can lower this build_vector to a INSERTPS.
6679 if (!Subtarget.hasSSE41())
6680 return SDValue();
6681
6682 SDValue V2 = Elt.getOperand(0);
6683 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6684 V1 = SDValue();
6685
6686 bool CanFold = true;
6687 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6688 if (Zeroable[i])
6689 continue;
6690
6691 SDValue Current = Op->getOperand(i);
6692 SDValue SrcVector = Current->getOperand(0);
6693 if (!V1.getNode())
6694 V1 = SrcVector;
6695 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
6696 }
6697
6698 if (!CanFold)
6699 return SDValue();
6700
6701 assert(V1.getNode() && "Expected at least two non-zero elements!");
6702 if (V1.getSimpleValueType() != MVT::v4f32)
6703 V1 = DAG.getBitcast(MVT::v4f32, V1);
6704 if (V2.getSimpleValueType() != MVT::v4f32)
6705 V2 = DAG.getBitcast(MVT::v4f32, V2);
6706
6707 // Ok, we can emit an INSERTPS instruction.
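  // INSERTPS immediate layout (for reference): bits [7:6] select the element of
  // the second source, bits [5:4] select the destination slot, and bits [3:0]
  // form the zero mask.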
6708 unsigned ZMask = Zeroable.to_ulong();
6709
6710 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6711 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6712 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6713 DAG.getIntPtrConstant(InsertPSMask, DL, true));
6714 return DAG.getBitcast(VT, Result);
6715}
6716
6717/// Return a vector logical shift node.
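/// e.g. (illustrative) getVShift(/*isLeft=*/true, MVT::v2i64, X, 64, ...) emits a
/// VSHLDQ (PSLLDQ) of the v16i8 bitcast of X by 64 / 8 = 8 bytes.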
6718static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6719 SelectionDAG &DAG, const TargetLowering &TLI,
6720 const SDLoc &dl) {
6721 assert(VT.is128BitVector() && "Unknown type for VShift");
6722 MVT ShVT = MVT::v16i8;
6723 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6724 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6725 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6726 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
6727 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6728}
6729
6730static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6731 SelectionDAG &DAG) {
6732
6733 // Check if the scalar load can be widened into a vector load. And if
6734 // the address is "base + cst" see if the cst can be "absorbed" into
6735 // the shuffle mask.
6736 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6737 SDValue Ptr = LD->getBasePtr();
6738 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
6739 return SDValue();
6740 EVT PVT = LD->getValueType(0);
6741 if (PVT != MVT::i32 && PVT != MVT::f32)
6742 return SDValue();
6743
6744 int FI = -1;
6745 int64_t Offset = 0;
6746 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6747 FI = FINode->getIndex();
6748 Offset = 0;
6749 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6750 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6751 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6752 Offset = Ptr.getConstantOperandVal(1);
6753 Ptr = Ptr.getOperand(0);
6754 } else {
6755 return SDValue();
6756 }
6757
6758 // FIXME: 256-bit vector instructions don't require a strict alignment,
6759 // improve this code to support it better.
6760 Align RequiredAlign(VT.getSizeInBits() / 8);
6761 SDValue Chain = LD->getChain();
6762 // Make sure the stack object alignment is at least 16 or 32.
6763 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6764 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
6765 if (!InferredAlign || *InferredAlign < RequiredAlign) {
6766 if (MFI.isFixedObjectIndex(FI)) {
6767 // Can't change the alignment. FIXME: It's possible to compute
6768 // the exact stack offset and reference FI + adjust offset instead.
6769 // If someone *really* cares about this. That's the way to implement it.
6770 return SDValue();
6771 } else {
6772 MFI.setObjectAlignment(FI, RequiredAlign);
6773 }
6774 }
6775
6776 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6777 // Ptr + (Offset & ~15).
6778 if (Offset < 0)
6779 return SDValue();
6780 if ((Offset % RequiredAlign.value()) & 3)
6781 return SDValue();
6782 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
6783 if (StartOffset) {
6784 SDLoc DL(Ptr);
6785 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6786 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6787 }
6788
6789 int EltNo = (Offset - StartOffset) >> 2;
6790 unsigned NumElems = VT.getVectorNumElements();
6791
6792 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6793 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6794 LD->getPointerInfo().getWithOffset(StartOffset));
6795
6796 SmallVector<int, 8> Mask(NumElems, EltNo);
6797
6798 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6799 }
6800
6801 return SDValue();
6802}
6803
6804// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
6805static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
6806 if (ISD::isNON_EXTLoad(Elt.getNode())) {
6807 auto *BaseLd = cast<LoadSDNode>(Elt);
6808 if (!BaseLd->isSimple())
6809 return false;
6810 Ld = BaseLd;
6811 ByteOffset = 0;
6812 return true;
6813 }
6814
6815 switch (Elt.getOpcode()) {
6816 case ISD::BITCAST:
6817 case ISD::TRUNCATE:
6818 case ISD::SCALAR_TO_VECTOR:
6819 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
6820 case ISD::SRL:
6821 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6822 uint64_t Amt = AmtC->getZExtValue();
6823 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
6824 ByteOffset += Amt / 8;
6825 return true;
6826 }
6827 }
6828 break;
6829 case ISD::EXTRACT_VECTOR_ELT:
6830 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6831 SDValue Src = Elt.getOperand(0);
6832 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
6833 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
6834 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
6835 findEltLoadSrc(Src, Ld, ByteOffset)) {
6836 uint64_t Idx = IdxC->getZExtValue();
6837 ByteOffset += Idx * (SrcSizeInBits / 8);
6838 return true;
6839 }
6840 }
6841 break;
6842 }
6843
6844 return false;
6845}
6846
6847/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6848/// elements can be replaced by a single large load which has the same value as
6849/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6850///
6851/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6852static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6853 const SDLoc &DL, SelectionDAG &DAG,
6854 const X86Subtarget &Subtarget,
6855 bool IsAfterLegalize) {
6856 if ((VT.getScalarSizeInBits() % 8) != 0)
6857 return SDValue();
6858
6859 unsigned NumElems = Elts.size();
6860
6861 int LastLoadedElt = -1;
6862 APInt LoadMask = APInt::getZero(NumElems);
6863 APInt ZeroMask = APInt::getZero(NumElems);
6864 APInt UndefMask = APInt::getZero(NumElems);
6865
6866 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
6867 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
6868
6869 // For each element in the initializer, see if we've found a load, zero or an
6870 // undef.
6871 for (unsigned i = 0; i < NumElems; ++i) {
6872 SDValue Elt = peekThroughBitcasts(Elts[i]);
6873 if (!Elt.getNode())
6874 return SDValue();
6875 if (Elt.isUndef()) {
6876 UndefMask.setBit(i);
6877 continue;
6878 }
6879 if (X86::isZeroNode(Elt)) {
6880 ZeroMask.setBit(i);
6881 continue;
6882 }
6883
6884 // Each loaded element must be the correct fractional portion of the
6885 // requested vector load.
6886 unsigned EltSizeInBits = Elt.getValueSizeInBits();
6887 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
6888 return SDValue();
6889
6890 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
6891 return SDValue();
6892 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
6893 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
6894 return SDValue();
6895
6896 LoadMask.setBit(i);
6897 LastLoadedElt = i;
6898 }
6899 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
6900 NumElems &&
6901 "Incomplete element masks");
6902
6903 // Handle Special Cases - all undef or undef/zero.
6904 if (UndefMask.popcount() == NumElems)
6905 return DAG.getUNDEF(VT);
6906 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
6907 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6908 : DAG.getConstantFP(0.0, DL, VT);
6909
6910 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6911 int FirstLoadedElt = LoadMask.countr_zero();
6912 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6913 EVT EltBaseVT = EltBase.getValueType();
6914 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
6915 "Register/Memory size mismatch");
6916 LoadSDNode *LDBase = Loads[FirstLoadedElt];
6917 assert(LDBase && "Did not find base load for merging consecutive loads");
6918 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
6919 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
6920 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
6921 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
6922 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
6923
6924 // TODO: Support offsetting the base load.
6925 if (ByteOffsets[FirstLoadedElt] != 0)
6926 return SDValue();
6927
6928 // Check to see if the element's load is consecutive to the base load
6929 // or offset from a previous (already checked) load.
6930 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
6931 LoadSDNode *Ld = Loads[EltIdx];
6932 int64_t ByteOffset = ByteOffsets[EltIdx];
6933 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
6934 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
6935 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
6936 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
6937 }
6938 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
6939 EltIdx - FirstLoadedElt);
6940 };
6941
6942 // Consecutive loads can contain UNDEFs but not ZERO elements.
6943 // Consecutive loads with UNDEFs and ZERO elements require an
6944 // additional shuffle stage to clear the ZERO elements.
6945 bool IsConsecutiveLoad = true;
6946 bool IsConsecutiveLoadWithZeros = true;
6947 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6948 if (LoadMask[i]) {
6949 if (!CheckConsecutiveLoad(LDBase, i)) {
6950 IsConsecutiveLoad = false;
6951 IsConsecutiveLoadWithZeros = false;
6952 break;
6953 }
6954 } else if (ZeroMask[i]) {
6955 IsConsecutiveLoad = false;
6956 }
6957 }
6958
6959 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6960 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6961 assert(LDBase->isSimple() &&
6962 "Cannot merge volatile or atomic loads.");
6963 SDValue NewLd =
6964 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6965 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
6966 MMOFlags);
6967 for (auto *LD : Loads)
6968 if (LD)
6969 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6970 return NewLd;
6971 };
6972
6973 // Check if the base load is entirely dereferenceable.
6974 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
6975 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
6976
6977 // LOAD - all consecutive load/undefs (must start/end with a load or be
6978 // entirely dereferenceable). If we have found an entire vector of loads and
6979 // undefs, then return a large load of the entire vector width starting at the
6980 // base pointer. If the vector contains zeros, then attempt to shuffle those
6981 // elements.
6982 if (FirstLoadedElt == 0 &&
6983 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
6984 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6985 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6986 return SDValue();
6987
6988 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6989 // will lower to regular temporal loads and use the cache.
6990 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
6991 VT.is256BitVector() && !Subtarget.hasInt256())
6992 return SDValue();
6993
6994 if (NumElems == 1)
6995 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
6996
6997 if (!ZeroMask)
6998 return CreateLoad(VT, LDBase);
6999
7000 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7001 // vector and a zero vector to clear out the zero elements.
7002 if (!IsAfterLegalize && VT.isVector()) {
7003 unsigned NumMaskElts = VT.getVectorNumElements();
7004 if ((NumMaskElts % NumElems) == 0) {
7005 unsigned Scale = NumMaskElts / NumElems;
7006 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7007 for (unsigned i = 0; i < NumElems; ++i) {
7008 if (UndefMask[i])
7009 continue;
7010 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7011 for (unsigned j = 0; j != Scale; ++j)
7012 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7013 }
7014 SDValue V = CreateLoad(VT, LDBase);
7015 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7016 : DAG.getConstantFP(0.0, DL, VT);
7017 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7018 }
7019 }
7020 }
7021
7022 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7023 if (VT.is256BitVector() || VT.is512BitVector()) {
7024 unsigned HalfNumElems = NumElems / 2;
7025 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7026 EVT HalfVT =
7027 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7028 SDValue HalfLD =
7029 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7030 DAG, Subtarget, IsAfterLegalize);
7031 if (HalfLD)
7032 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7033 HalfLD, DAG.getIntPtrConstant(0, DL));
7034 }
7035 }
7036
7037 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7038 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7039 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7040 LoadSizeInBits == 64) &&
7041 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7042 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7043 : MVT::getIntegerVT(LoadSizeInBits);
7044 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7045 // Allow v4f32 on SSE1 only targets.
7046 // FIXME: Add more isel patterns so we can just use VT directly.
7047 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7048 VecVT = MVT::v4f32;
7049 if (TLI.isTypeLegal(VecVT)) {
7050 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7051 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7052 SDValue ResNode = DAG.getMemIntrinsicNode(
7053 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7054 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
7055 for (auto *LD : Loads)
7056 if (LD)
7057 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7058 return DAG.getBitcast(VT, ResNode);
7059 }
7060 }
7061
7062 // BROADCAST - match the smallest possible repetition pattern, load that
7063 // scalar/subvector element and then broadcast to the entire vector.
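  // Illustrative example (assumed pattern): <load a[0], load a[1], load a[0],
  // load a[1]> for v4i32 matches a 64-bit repetition, so the two loads are merged
  // and then broadcast (or concatenated for repeats wider than 64 bits).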
7064 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7065 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7066 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7067 unsigned RepeatSize = SubElems * BaseSizeInBits;
7068 unsigned ScalarSize = std::min(RepeatSize, 64u);
7069 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7070 continue;
7071
7072 // Don't attempt a 1:N subvector broadcast - it should be caught by
7073 // combineConcatVectorOps, otherwise it will cause infinite loops.
7074 if (RepeatSize > ScalarSize && SubElems == 1)
7075 continue;
7076
7077 bool Match = true;
7078 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7079 for (unsigned i = 0; i != NumElems && Match; ++i) {
7080 if (!LoadMask[i])
7081 continue;
7082 SDValue Elt = peekThroughBitcasts(Elts[i]);
7083 if (RepeatedLoads[i % SubElems].isUndef())
7084 RepeatedLoads[i % SubElems] = Elt;
7085 else
7086 Match &= (RepeatedLoads[i % SubElems] == Elt);
7087 }
7088
7089 // We must have loads at both ends of the repetition.
7090 Match &= !RepeatedLoads.front().isUndef();
7091 Match &= !RepeatedLoads.back().isUndef();
7092 if (!Match)
7093 continue;
7094
7095 EVT RepeatVT =
7096 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7097 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7098 : EVT::getFloatingPointVT(ScalarSize);
7099 if (RepeatSize > ScalarSize)
7100 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7101 RepeatSize / ScalarSize);
7102 EVT BroadcastVT =
7103 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7104 VT.getSizeInBits() / ScalarSize);
7105 if (TLI.isTypeLegal(BroadcastVT)) {
7106 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7107 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7108 SDValue Broadcast = RepeatLoad;
7109 if (RepeatSize > ScalarSize) {
7110 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7111 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7112 } else {
7113 if (!Subtarget.hasAVX2() &&
7114 X86::mayFoldLoadIntoBroadcastFromMem(
7115 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7116 Subtarget,
7117 /*AssumeSingleUse=*/true))
7118 return SDValue();
7119 Broadcast =
7120 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7121 }
7122 return DAG.getBitcast(VT, Broadcast);
7123 }
7124 }
7125 }
7126 }
7127
7128 return SDValue();
7129}
7130
7131// Combine a vector op (shuffles etc.) that is equal to build_vector load1,
7132// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7133// are consecutive, non-overlapping, and in the right order.
7134static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7135 SelectionDAG &DAG,
7136 const X86Subtarget &Subtarget,
7137 bool IsAfterLegalize) {
7138 SmallVector<SDValue, 64> Elts;
7139 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7140 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7141 Elts.push_back(Elt);
7142 continue;
7143 }
7144 return SDValue();
7145 }
7146 assert(Elts.size() == VT.getVectorNumElements());
7147 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7148 IsAfterLegalize);
7149}
7150
7151static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7152 const APInt &Undefs, LLVMContext &C) {
7153 unsigned ScalarSize = VT.getScalarSizeInBits();
7154 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7155
7156 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7157 if (VT.isFloatingPoint()) {
7158 if (ScalarSize == 16)
7159 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7160 if (ScalarSize == 32)
7161 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7162 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7163 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7164 }
7165 return Constant::getIntegerValue(Ty, Val);
7166 };
7167
7168 SmallVector<Constant *, 32> ConstantVec;
7169 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7170 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7171 : getConstantScalar(Bits[I]));
7172
7173 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7174}
7175
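// Illustrative example (assumed types): for VT = v8i32 and a 64-bit SplatValue,
// the overload below returns a ConstantVector of the two i32 pieces extracted
// from the splat value (NumElm = SplatBitSize / ScalarSize = 2).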
7176static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7177 unsigned SplatBitSize, LLVMContext &C) {
7178 unsigned ScalarSize = VT.getScalarSizeInBits();
7179
7180 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7181 if (VT.isFloatingPoint()) {
7182 if (ScalarSize == 16)
7183 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7184 if (ScalarSize == 32)
7185 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7186 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7187 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7188 }
7189 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7190 };
7191
7192 if (ScalarSize == SplatBitSize)
7193 return getConstantScalar(SplatValue);
7194
7195 unsigned NumElm = SplatBitSize / ScalarSize;
7196 SmallVector<Constant *, 32> ConstantVec;
7197 for (unsigned I = 0; I != NumElm; ++I) {
7198 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7199 ConstantVec.push_back(getConstantScalar(Val));
7200 }
7201 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7202}
7203
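// Descriptive summary (not from the source): returns true if the shuffle-constant
// node N has a use that can fold it - a target shuffle use, or (for a single use)
// an instruction known to be able to fold the operand, e.g. VPDPBUSD.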
7204static bool isFoldableUseOfShuffle(SDNode *N) {
7205 for (auto *U : N->uses()) {
7206 unsigned Opc = U->getOpcode();
7207 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7208 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7209 return false;
7210 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7211 return false;
7212 if (isTargetShuffle(Opc))
7213 return true;
7214 if (Opc == ISD::BITCAST) // Ignore bitcasts
7215 return isFoldableUseOfShuffle(U);
7216 if (N->hasOneUse()) {
7217 // TODO, there may be some general way to know if a SDNode can
7218 // be folded. We now only know whether an MI is foldable.
7219 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7220 return false;
7221 return true;
7222 }
7223 }
7224 return false;
7225}
7226
7227/// Attempt to use the vbroadcast instruction to generate a splat value
7228/// from a splat BUILD_VECTOR which uses:
7229/// a. A single scalar load, or a constant.
7230/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7231///
7232/// The VBROADCAST node is returned when a pattern is found,
7233/// or SDValue() otherwise.
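/// Illustrative example (not from the original source; types are hypothetical):
///   (v8f32 build_vector (load p), (load p), ..., (load p))
/// may become (v8f32 X86ISD::VBROADCAST_LOAD p), and a repeated constant pattern
/// such as <42, 42, ..., 42> may be broadcast from a single constant-pool entry.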
7234static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7235 const SDLoc &dl,
7236 const X86Subtarget &Subtarget,
7237 SelectionDAG &DAG) {
7238 // VBROADCAST requires AVX.
7239 // TODO: Splats could be generated for non-AVX CPUs using SSE
7240 // instructions, but there's less potential gain for only 128-bit vectors.
7241 if (!Subtarget.hasAVX())
7242 return SDValue();
7243
7244 MVT VT = BVOp->getSimpleValueType(0);
7245 unsigned NumElts = VT.getVectorNumElements();
7246 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7247 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7248 "Unsupported vector type for broadcast.");
7249
7250 // See if the build vector is a repeating sequence of scalars (inc. splat).
7251 SDValue Ld;
7252 BitVector UndefElements;
7253 SmallVector<SDValue, 16> Sequence;
7254 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7255 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7256 if (Sequence.size() == 1)
7257 Ld = Sequence[0];
7258 }
7259
7260 // Attempt to use VBROADCASTM
7261 // From this pattern:
7262 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7263 // b. t1 = (build_vector t0 t0)
7264 //
7265 // Create (VBROADCASTM v2i1 X)
7266 if (!Sequence.empty() && Subtarget.hasCDI()) {
7267 // If not a splat, are the upper sequence values zeroable?
7268 unsigned SeqLen = Sequence.size();
7269 bool UpperZeroOrUndef =
7270 SeqLen == 1 ||
7271 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
7272 return !V || V.isUndef() || isNullConstant(V);
7273 });
7274 SDValue Op0 = Sequence[0];
7275 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7276 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7277 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7278 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7279 ? Op0.getOperand(0)
7280 : Op0.getOperand(0).getOperand(0);
7281 MVT MaskVT = BOperand.getSimpleValueType();
7282 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7283 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7284 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7285 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7286 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7287 unsigned Scale = 512 / VT.getSizeInBits();
7288 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7289 }
7290 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7291 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7292 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7293 return DAG.getBitcast(VT, Bcst);
7294 }
7295 }
7296 }
7297
7298 unsigned NumUndefElts = UndefElements.count();
7299 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7300 APInt SplatValue, Undef;
7301 unsigned SplatBitSize;
7302 bool HasUndef;
7303 // Check if this is a repeated constant pattern suitable for broadcasting.
7304 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7305 SplatBitSize > VT.getScalarSizeInBits() &&
7306 SplatBitSize < VT.getSizeInBits()) {
7307 // Avoid replacing with a broadcast when the build_vector is used by a
7308 // shuffle instruction, to preserve the existing custom lowering of shuffles.
7309 if (isFoldableUseOfShuffle(BVOp))
7310 return SDValue();
7311 // replace BUILD_VECTOR with broadcast of the repeated constants.
7312 LLVMContext *Ctx = DAG.getContext();
7313 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7314 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7315 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7316 // Load the constant scalar/subvector and broadcast it.
7317 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7318 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7319 SDValue CP = DAG.getConstantPool(C, PVT);
7320 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7321
7322 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7323 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7324 SDValue Ops[] = {DAG.getEntryNode(), CP};
7325 MachinePointerInfo MPI =
7326 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7327 SDValue Brdcst =
7328 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7329 MPI, Alignment, MachineMemOperand::MOLoad);
7330 return DAG.getBitcast(VT, Brdcst);
7331 }
7332 if (SplatBitSize > 64) {
7333 // Load the vector of constants and broadcast it.
7334 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7335 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7336 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7337 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7338 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7339 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7340 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7341 MachinePointerInfo MPI =
7342 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7343 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7344 Ops, VVT, MPI, Alignment,
7345 MachineMemOperand::MOLoad);
7346 }
7347 }
7348
7349 // If we are moving a scalar into a vector (Ld must be set and all elements
7350 // but 1 are undef) and that operation is not obviously supported by
7351 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7352 // That's better than general shuffling and may eliminate a load to GPR and
7353 // move from scalar to vector register.
7354 if (!Ld || NumElts - NumUndefElts != 1)
7355 return SDValue();
7356 unsigned ScalarSize = Ld.getValueSizeInBits();
7357 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7358 return SDValue();
7359 }
7360
7361 bool ConstSplatVal =
7362 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7363 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7364
7365 // TODO: Handle broadcasts of non-constant sequences.
7366
7367 // Make sure that all of the users of a non-constant load are from the
7368 // BUILD_VECTOR node.
7369 // FIXME: Is the use count needed for non-constant, non-load case?
7370 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7371 return SDValue();
7372
7373 unsigned ScalarSize = Ld.getValueSizeInBits();
7374 bool IsGE256 = (VT.getSizeInBits() >= 256);
7375
7376 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7377 // instruction to save 8 or more bytes of constant pool data.
7378 // TODO: If multiple splats are generated to load the same constant,
7379 // it may be detrimental to overall size. There needs to be a way to detect
7380 // that condition to know if this is truly a size win.
7381 bool OptForSize = DAG.shouldOptForSize();
7382
7383 // Handle broadcasting a single constant scalar from the constant pool
7384 // into a vector.
7385 // On Sandybridge (no AVX2), it is still better to load a constant vector
7386 // from the constant pool and not to broadcast it from a scalar.
7387 // But override that restriction when optimizing for size.
7388 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7389 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7390 EVT CVT = Ld.getValueType();
7391 assert(!CVT.isVector() && "Must not broadcast a vector type");
7392
7393 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7394 // For size optimization, also splat v2f64 and v2i64, and for size opt
7395 // with AVX2, also splat i8 and i16.
7396 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7397 if (ScalarSize == 32 ||
7398 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7399 CVT == MVT::f16 ||
7400 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7401 const Constant *C = nullptr;
7402 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7403 C = CI->getConstantIntValue();
7404 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7405 C = CF->getConstantFPValue();
7406
7407 assert(C && "Invalid constant type");
7408
7409 SDValue CP =
7410 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7411 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7412
7413 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7414 SDValue Ops[] = {DAG.getEntryNode(), CP};
7415 MachinePointerInfo MPI =
7416 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7417 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7418 MPI, Alignment, MachineMemOperand::MOLoad);
7419 }
7420 }
7421
7422 // Handle AVX2 in-register broadcasts.
7423 if (!IsLoad && Subtarget.hasInt256() &&
7424 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7425 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7426
7427 // The scalar source must be a normal load.
7428 if (!IsLoad)
7429 return SDValue();
7430
7431 // Make sure the non-chain result is only used by this build vector.
7432 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7433 return SDValue();
7434
7435 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7436 (Subtarget.hasVLX() && ScalarSize == 64)) {
7437 auto *LN = cast<LoadSDNode>(Ld);
7438 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7439 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7440 SDValue BCast =
7441 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7442 LN->getMemoryVT(), LN->getMemOperand());
7443 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7444 return BCast;
7445 }
7446
7447 // The integer check is needed for the 64-bit element in a 128-bit vector case
7448 // so this doesn't match f64, since there is no "vbroadcastsd xmm" instruction.
7449 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7450 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7451 auto *LN = cast<LoadSDNode>(Ld);
7452 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7453 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7454 SDValue BCast =
7455 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7456 LN->getMemoryVT(), LN->getMemOperand());
7457 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7458 return BCast;
7459 }
7460
7461 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7462 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7463
7464 // Unsupported broadcast.
7465 return SDValue();
7466}
7467
7468/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7469/// underlying vector and index.
7470///
7471/// Modifies \p ExtractedFromVec to the real vector and returns the real
7472/// index.
7473static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7474 SDValue ExtIdx) {
7475 int Idx = ExtIdx->getAsZExtVal();
7476 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7477 return Idx;
7478
7479 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7480 // lowered this:
7481 // (extract_vector_elt (v8f32 %1), Constant<6>)
7482 // to:
7483 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7484 // (extract_subvector (v8f32 %0), Constant<4>),
7485 // undef)
7486 // Constant<0>)
7487 // In this case the vector is the extract_subvector expression and the index
7488 // is 2, as specified by the shuffle.
7489 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7490 SDValue ShuffleVec = SVOp->getOperand(0);
7491 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7492 assert(ShuffleVecVT.getVectorElementType() ==
7493 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7494
7495 int ShuffleIdx = SVOp->getMaskElt(Idx);
7496 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7497 ExtractedFromVec = ShuffleVec;
7498 return ShuffleIdx;
7499 }
7500 return Idx;
7501}
7502
7503static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7504 SelectionDAG &DAG) {
7505 MVT VT = Op.getSimpleValueType();
7506
7507 // Skip if insert_vec_elt is not supported.
7508 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7509 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7510 return SDValue();
7511
7512 unsigned NumElems = Op.getNumOperands();
7513 SDValue VecIn1;
7514 SDValue VecIn2;
7515 SmallVector<unsigned, 4> InsertIndices;
7516 SmallVector<int, 8> Mask(NumElems, -1);
7517
7518 for (unsigned i = 0; i != NumElems; ++i) {
7519 unsigned Opc = Op.getOperand(i).getOpcode();
7520
7521 if (Opc == ISD::UNDEF)
7522 continue;
7523
7524 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7525 // Quit if more than 1 element needs inserting.
7526 if (InsertIndices.size() > 1)
7527 return SDValue();
7528
7529 InsertIndices.push_back(i);
7530 continue;
7531 }
7532
7533 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7534 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7535
7536 // Quit if non-constant index.
7537 if (!isa<ConstantSDNode>(ExtIdx))
7538 return SDValue();
7539 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7540
7541 // Quit if extracted from vector of different type.
7542 if (ExtractedFromVec.getValueType() != VT)
7543 return SDValue();
7544
7545 if (!VecIn1.getNode())
7546 VecIn1 = ExtractedFromVec;
7547 else if (VecIn1 != ExtractedFromVec) {
7548 if (!VecIn2.getNode())
7549 VecIn2 = ExtractedFromVec;
7550 else if (VecIn2 != ExtractedFromVec)
7551 // Quit if more than 2 vectors to shuffle
7552 return SDValue();
7553 }
7554
7555 if (ExtractedFromVec == VecIn1)
7556 Mask[i] = Idx;
7557 else if (ExtractedFromVec == VecIn2)
7558 Mask[i] = Idx + NumElems;
7559 }
7560
7561 if (!VecIn1.getNode())
7562 return SDValue();
7563
7564 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7565 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7566
7567 for (unsigned Idx : InsertIndices)
7568 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7569 DAG.getIntPtrConstant(Idx, DL));
7570
7571 return NV;
7572}
7573
7574// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
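// Sketch of the approach (added for clarity): the bf16 elements are bitcast to
// i16 (or to f16 when the FP16 feature is available), a BUILD_VECTOR of that
// element type is built, and the result is bitcast back, e.g. v8bf16 -> v8i16.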
7575static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7576 const X86Subtarget &Subtarget) {
7577 MVT VT = Op.getSimpleValueType();
7578 MVT IVT =
7579 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7581 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7582 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7583 Op.getOperand(I)));
7584 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7585 return DAG.getBitcast(VT, Res);
7586}
7587
7588// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
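// Illustrative example (not from the original source): the constant part of
// (v8i1 build_vector 1,0,1,1,0,0,0,0) is packed into the i8 immediate 0x0D and
// bitcast to v8i1; any non-constant elements are then inserted one by one.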
7589static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
7590 SelectionDAG &DAG,
7591 const X86Subtarget &Subtarget) {
7592
7593 MVT VT = Op.getSimpleValueType();
7594 assert((VT.getVectorElementType() == MVT::i1) &&
7595 "Unexpected type in LowerBUILD_VECTORvXi1!");
7596 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
7597 ISD::isBuildVectorAllOnes(Op.getNode()))
7598 return Op;
7599
7600 uint64_t Immediate = 0;
7601 SmallVector<unsigned, 16> NonConstIdx;
7602 bool IsSplat = true;
7603 bool HasConstElts = false;
7604 int SplatIdx = -1;
7605 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7606 SDValue In = Op.getOperand(idx);
7607 if (In.isUndef())
7608 continue;
7609 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
7610 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7611 HasConstElts = true;
7612 } else {
7613 NonConstIdx.push_back(idx);
7614 }
7615 if (SplatIdx < 0)
7616 SplatIdx = idx;
7617 else if (In != Op.getOperand(SplatIdx))
7618 IsSplat = false;
7619 }
7620
7621 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7622 if (IsSplat) {
7623 // The build_vector allows the scalar element to be larger than the vector
7624 // element type. We need to mask it to use as a condition unless we know
7625 // the upper bits are zero.
7626 // FIXME: Use computeKnownBits instead of checking specific opcode?
7627 SDValue Cond = Op.getOperand(SplatIdx);
7628 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
7629 if (Cond.getOpcode() != ISD::SETCC)
7630 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
7631 DAG.getConstant(1, dl, MVT::i8));
7632
7633 // Perform the select in the scalar domain so we can use cmov.
7634 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7635 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
7636 DAG.getAllOnesConstant(dl, MVT::i32),
7637 DAG.getConstant(0, dl, MVT::i32));
7638 Select = DAG.getBitcast(MVT::v32i1, Select);
7639 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
7640 } else {
7641 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7642 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
7643 DAG.getAllOnesConstant(dl, ImmVT),
7644 DAG.getConstant(0, dl, ImmVT));
7645 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7646 Select = DAG.getBitcast(VecVT, Select);
7647 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
7648 DAG.getIntPtrConstant(0, dl));
7649 }
7650 }
7651
7652 // insert elements one by one
7653 SDValue DstVec;
7654 if (HasConstElts) {
7655 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7656 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
7657 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
7658 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
7659 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
7660 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
7661 } else {
7662 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7663 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
7664 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7665 DstVec = DAG.getBitcast(VecVT, Imm);
7666 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
7667 DAG.getIntPtrConstant(0, dl));
7668 }
7669 } else
7670 DstVec = DAG.getUNDEF(VT);
7671
7672 for (unsigned InsertIdx : NonConstIdx) {
7673 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7674 Op.getOperand(InsertIdx),
7675 DAG.getIntPtrConstant(InsertIdx, dl));
7676 }
7677 return DstVec;
7678}
7679
7680LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
7681 switch (Opcode) {
7682 case X86ISD::PACKSS:
7683 case X86ISD::PACKUS:
7684 case X86ISD::FHADD:
7685 case X86ISD::FHSUB:
7686 case X86ISD::HADD:
7687 case X86ISD::HSUB:
7688 return true;
7689 }
7690 return false;
7691}
7692
7693/// This is a helper function of LowerToHorizontalOp().
7694/// This function checks whether the input build_vector \p N implements a
7695/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
7696/// may not match the layout of an x86 256-bit horizontal instruction.
7697/// In other words, if this returns true, then some extraction/insertion will
7698/// be required to produce a valid horizontal instruction.
7699///
7700/// Parameter \p Opcode defines the kind of horizontal operation to match.
7701/// For example, if \p Opcode is equal to ISD::ADD, then this function
7702/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7703/// is equal to ISD::SUB, then this function checks if this is a horizontal
7704/// arithmetic sub.
7705///
7706/// This function only analyzes elements of \p N whose indices are
7707/// in range [BaseIdx, LastIdx).
7708///
7709/// TODO: This function was originally used to match both real and fake partial
7710/// horizontal operations, but the index-matching logic is incorrect for that.
7711/// See the corrected implementation in isHopBuildVector(). Can we reduce this
7712/// code because it is only used for partial h-op matching now?
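/// Illustrative example (not from the original source): with \p Opcode equal to
/// ISD::ADD, each analyzed element is expected to look like
///   (add (extract_vector_elt X, I), (extract_vector_elt X, I+1))
/// where the extract indices advance by 2 per element (or appear swapped, since
/// add is commutative).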
7713static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
7714 const SDLoc &DL, SelectionDAG &DAG,
7715 unsigned BaseIdx, unsigned LastIdx,
7716 SDValue &V0, SDValue &V1) {
7717 EVT VT = N->getValueType(0);
7718 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7719 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7720 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7721 "Invalid Vector in input!");
7722
7723 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7724 bool CanFold = true;
7725 unsigned ExpectedVExtractIdx = BaseIdx;
7726 unsigned NumElts = LastIdx - BaseIdx;
7727 V0 = DAG.getUNDEF(VT);
7728 V1 = DAG.getUNDEF(VT);
7729
7730 // Check if N implements a horizontal binop.
7731 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7732 SDValue Op = N->getOperand(i + BaseIdx);
7733
7734 // Skip UNDEFs.
7735 if (Op->isUndef()) {
7736 // Update the expected vector extract index.
7737 if (i * 2 == NumElts)
7738 ExpectedVExtractIdx = BaseIdx;
7739 ExpectedVExtractIdx += 2;
7740 continue;
7741 }
7742
7743 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7744
7745 if (!CanFold)
7746 break;
7747
7748 SDValue Op0 = Op.getOperand(0);
7749 SDValue Op1 = Op.getOperand(1);
7750
7751 // Try to match the following pattern:
7752 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7753 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7754 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7755 Op0.getOperand(0) == Op1.getOperand(0) &&
7756 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7757 isa<ConstantSDNode>(Op1.getOperand(1)));
7758 if (!CanFold)
7759 break;
7760
7761 unsigned I0 = Op0.getConstantOperandVal(1);
7762 unsigned I1 = Op1.getConstantOperandVal(1);
7763
7764 if (i * 2 < NumElts) {
7765 if (V0.isUndef()) {
7766 V0 = Op0.getOperand(0);
7767 if (V0.getValueType() != VT)
7768 return false;
7769 }
7770 } else {
7771 if (V1.isUndef()) {
7772 V1 = Op0.getOperand(0);
7773 if (V1.getValueType() != VT)
7774 return false;
7775 }
7776 if (i * 2 == NumElts)
7777 ExpectedVExtractIdx = BaseIdx;
7778 }
7779
7780 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7781 if (I0 == ExpectedVExtractIdx)
7782 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7783 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7784 // Try to match the following dag sequence:
7785 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7786 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7787 } else
7788 CanFold = false;
7789
7790 ExpectedVExtractIdx += 2;
7791 }
7792
7793 return CanFold;
7794}
7795
7796/// Emit a sequence of two 128-bit horizontal add/sub followed by
7797/// a concat_vector.
7798///
7799/// This is a helper function of LowerToHorizontalOp().
7800/// This function expects two 256-bit vectors called V0 and V1.
7801/// At first, each vector is split into two separate 128-bit vectors.
7802/// Then, the resulting 128-bit vectors are used to implement two
7803/// horizontal binary operations.
7804///
7805/// The kind of horizontal binary operation is defined by \p X86Opcode.
7806///
7807/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
7808/// the two new horizontal binops.
7809/// When Mode is set, the first horizontal binop dag node would take as input
7810/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7811/// horizontal binop dag node would take as input the lower 128-bit of V1
7812/// and the upper 128-bit of V1.
7813/// Example:
7814/// HADD V0_LO, V0_HI
7815/// HADD V1_LO, V1_HI
7816///
7817/// Otherwise, the first horizontal binop dag node takes as input the lower
7818/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7819/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7820/// Example:
7821/// HADD V0_LO, V1_LO
7822/// HADD V0_HI, V1_HI
7823///
7824/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7825/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7826/// the upper 128-bits of the result.
7827static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7828 const SDLoc &DL, SelectionDAG &DAG,
7829 unsigned X86Opcode, bool Mode,
7830 bool isUndefLO, bool isUndefHI) {
7831 MVT VT = V0.getSimpleValueType();
7832 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7833 "Invalid nodes in input!");
7834
7835 unsigned NumElts = VT.getVectorNumElements();
7836 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7837 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7838 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7839 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7840 MVT NewVT = V0_LO.getSimpleValueType();
7841
7842 SDValue LO = DAG.getUNDEF(NewVT);
7843 SDValue HI = DAG.getUNDEF(NewVT);
7844
7845 if (Mode) {
7846 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7847 if (!isUndefLO && !V0->isUndef())
7848 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7849 if (!isUndefHI && !V1->isUndef())
7850 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7851 } else {
7852 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7853 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7854 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7855
7856 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7857 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7858 }
7859
7860 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7861}
7862
7863/// Returns true iff \p BV builds a vector with the result equivalent to
7864/// the result of ADDSUB/SUBADD operation.
7865/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
7866/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
7867/// \p Opnd0 and \p Opnd1.
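/// Illustrative example (not from the original source), for a v4f32 build_vector:
///   element 0: (fsub (extractelt A, 0), (extractelt B, 0))
///   element 1: (fadd (extractelt A, 1), (extractelt B, 1))
///   element 2: (fsub (extractelt A, 2), (extractelt B, 2))
///   element 3: (fadd (extractelt A, 3), (extractelt B, 3))
/// matches ADDSUB with Opnd0 = A, Opnd1 = B and IsSubAdd = false.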
7868static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
7869 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7870 SDValue &Opnd0, SDValue &Opnd1,
7871 unsigned &NumExtracts,
7872 bool &IsSubAdd) {
7873
7874 MVT VT = BV->getSimpleValueType(0);
7875 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
7876 return false;
7877
7878 unsigned NumElts = VT.getVectorNumElements();
7879 SDValue InVec0 = DAG.getUNDEF(VT);
7880 SDValue InVec1 = DAG.getUNDEF(VT);
7881
7882 NumExtracts = 0;
7883
7884 // Odd-numbered elements in the input build vector are obtained from
7885 // adding/subtracting two integer/float elements.
7886 // Even-numbered elements in the input build vector are obtained from
7887 // subtracting/adding two integer/float elements.
7888 unsigned Opc[2] = {0, 0};
7889 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7890 SDValue Op = BV->getOperand(i);
7891
7892 // Skip 'undef' values.
7893 unsigned Opcode = Op.getOpcode();
7894 if (Opcode == ISD::UNDEF)
7895 continue;
7896
7897 // Early exit if we found an unexpected opcode.
7898 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
7899 return false;
7900
7901 SDValue Op0 = Op.getOperand(0);
7902 SDValue Op1 = Op.getOperand(1);
7903
7904 // Try to match the following pattern:
7905 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7906 // Early exit if we cannot match that sequence.
7907 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7908 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7909 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7910 Op0.getOperand(1) != Op1.getOperand(1))
7911 return false;
7912
7913 unsigned I0 = Op0.getConstantOperandVal(1);
7914 if (I0 != i)
7915 return false;
7916
7917 // We found a valid add/sub node; make sure it's the same opcode as previous
7918 // elements for this parity.
7919 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
7920 return false;
7921 Opc[i % 2] = Opcode;
7922
7923 // Update InVec0 and InVec1.
7924 if (InVec0.isUndef()) {
7925 InVec0 = Op0.getOperand(0);
7926 if (InVec0.getSimpleValueType() != VT)
7927 return false;
7928 }
7929 if (InVec1.isUndef()) {
7930 InVec1 = Op1.getOperand(0);
7931 if (InVec1.getSimpleValueType() != VT)
7932 return false;
7933 }
7934
7935 // Make sure that the input operands of each add/sub node always
7936 // come from the same pair of vectors.
7937 if (InVec0 != Op0.getOperand(0)) {
7938 if (Opcode == ISD::FSUB)
7939 return false;
7940
7941 // FADD is commutable. Try to commute the operands
7942 // and then test again.
7943 std::swap(Op0, Op1);
7944 if (InVec0 != Op0.getOperand(0))
7945 return false;
7946 }
7947
7948 if (InVec1 != Op1.getOperand(0))
7949 return false;
7950
7951 // Increment the number of extractions done.
7952 ++NumExtracts;
7953 }
7954
7955 // Ensure we have found an opcode for both parities and that they are
7956 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
7957 // inputs are undef.
7958 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
7959 InVec0.isUndef() || InVec1.isUndef())
7960 return false;
7961
7962 IsSubAdd = Opc[0] == ISD::FADD;
7963
7964 Opnd0 = InVec0;
7965 Opnd1 = InVec1;
7966 return true;
7967}
7968
7969/// Returns true if it is possible to fold MUL and an idiom that has already been
7970/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
7971/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
7972/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
7973///
7974/// Prior to calling this function it should be known that there is some
7975/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7976/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7977/// before replacement of such SDNode with ADDSUB operation. Thus the number
7978/// of \p Opnd0 uses is expected to be equal to 2.
7979/// For example, this function may be called for the following IR:
7980/// %AB = fmul fast <2 x double> %A, %B
7981/// %Sub = fsub fast <2 x double> %AB, %C
7982/// %Add = fadd fast <2 x double> %AB, %C
7983/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7984/// <2 x i32> <i32 0, i32 3>
7985/// There is a def for %Addsub here, which potentially can be replaced by
7986/// X86ISD::ADDSUB operation:
7987/// %Addsub = X86ISD::ADDSUB %AB, %C
7988/// and such ADDSUB can further be replaced with FMADDSUB:
7989/// %Addsub = FMADDSUB %A, %B, %C.
7990///
7991/// The main reason why this method is called before the replacement of the
7992/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7993/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7994/// FMADDSUB is.
7995static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
7996 SelectionDAG &DAG,
7997 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
7998 unsigned ExpectedUses) {
7999 if (Opnd0.getOpcode() != ISD::FMUL ||
8000 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8001 return false;
8002
8003 // FIXME: These checks must match the similar ones in
8004 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8005 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8006 // or MUL + ADDSUB to FMADDSUB.
8007 const TargetOptions &Options = DAG.getTarget().Options;
8008 bool AllowFusion =
8009 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
8010 if (!AllowFusion)
8011 return false;
8012
8013 Opnd2 = Opnd1;
8014 Opnd1 = Opnd0.getOperand(1);
8015 Opnd0 = Opnd0.getOperand(0);
8016
8017 return true;
8018}
8019
8020/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
8021/// 'fmsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
8022/// X86ISD::FMSUBADD node accordingly.
8023static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8024 const SDLoc &DL,
8025 const X86Subtarget &Subtarget,
8026 SelectionDAG &DAG) {
8027 SDValue Opnd0, Opnd1;
8028 unsigned NumExtracts;
8029 bool IsSubAdd;
8030 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
8031 IsSubAdd))
8032 return SDValue();
8033
8034 MVT VT = BV->getSimpleValueType(0);
8035
8036 // Try to generate X86ISD::FMADDSUB node here.
8037 SDValue Opnd2;
8038 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
8039 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8040 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8041 }
8042
8043 // We only support ADDSUB.
8044 if (IsSubAdd)
8045 return SDValue();
8046
8047 // There are no known X86 targets with 512-bit ADDSUB instructions!
8048 // Convert to blend(fsub,fadd).
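// For example, for v16f32 the shuffle mask built below is <0,17,2,19,...,14,31>:
// even result lanes take the even lanes of Sub and odd result lanes take the odd
// lanes of Add, which is exactly the ADDSUB lane pattern.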
8049 if (VT.is512BitVector()) {
8050 SmallVector<int> Mask;
8051 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8052 Mask.push_back(I);
8053 Mask.push_back(I + E + 1);
8054 }
8055 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8056 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8057 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8058 }
8059
8060 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8061}
8062
8063static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8064 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8065 // Initialize outputs to known values.
8066 MVT VT = BV->getSimpleValueType(0);
8067 HOpcode = ISD::DELETED_NODE;
8068 V0 = DAG.getUNDEF(VT);
8069 V1 = DAG.getUNDEF(VT);
8070
8071 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8072 // half of the result is calculated independently from the 128-bit halves of
8073 // the inputs, so that makes the index-checking logic below more complicated.
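// For example, for a v8i32 HADD of V0 and V1 the result layout is roughly:
//   result[0..1] = sums of adjacent pairs from V0[0..3]
//   result[2..3] = sums of adjacent pairs from V1[0..3]
//   result[4..5] = sums of adjacent pairs from V0[4..7]
//   result[6..7] = sums of adjacent pairs from V1[4..7]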
8074 unsigned NumElts = VT.getVectorNumElements();
8075 unsigned GenericOpcode = ISD::DELETED_NODE;
8076 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8077 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8078 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8079 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8080 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8081 // Ignore undef elements.
8082 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8083 if (Op.isUndef())
8084 continue;
8085
8086 // If there's an opcode mismatch, we're done.
8087 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8088 return false;
8089
8090 // Initialize horizontal opcode.
8091 if (HOpcode == ISD::DELETED_NODE) {
8092 GenericOpcode = Op.getOpcode();
8093 switch (GenericOpcode) {
8094 // clang-format off
8095 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8096 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8097 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8098 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8099 default: return false;
8100 // clang-format on
8101 }
8102 }
8103
8104 SDValue Op0 = Op.getOperand(0);
8105 SDValue Op1 = Op.getOperand(1);
8106 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8107 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8108 Op0.getOperand(0) != Op1.getOperand(0) ||
8109 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8110 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8111 return false;
8112
8113 // The source vector is chosen based on which 64-bit half of the
8114 // destination vector is being calculated.
8115 if (j < NumEltsIn64Bits) {
8116 if (V0.isUndef())
8117 V0 = Op0.getOperand(0);
8118 } else {
8119 if (V1.isUndef())
8120 V1 = Op0.getOperand(0);
8121 }
8122
8123 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8124 if (SourceVec != Op0.getOperand(0))
8125 return false;
8126
8127 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8128 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8129 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8130 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8131 (j % NumEltsIn64Bits) * 2;
8132 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8133 continue;
8134
8135 // If this is not a commutative op, this does not match.
8136 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8137 return false;
8138
8139 // Addition is commutative, so try swapping the extract indexes.
8140 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8141 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8142 continue;
8143
8144 // Extract indexes do not match horizontal requirement.
8145 return false;
8146 }
8147 }
8148 // We matched. Opcode and operands are returned by reference as arguments.
8149 return true;
8150}
8151
8152static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8153 const SDLoc &DL, SelectionDAG &DAG,
8154 unsigned HOpcode, SDValue V0, SDValue V1) {
8155 // If either input vector is not the same size as the build vector,
8156 // extract/insert the low bits to the correct size.
8157 // This is free (examples: zmm --> xmm, xmm --> ymm).
8158 MVT VT = BV->getSimpleValueType(0);
8159 unsigned Width = VT.getSizeInBits();
8160 if (V0.getValueSizeInBits() > Width)
8161 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8162 else if (V0.getValueSizeInBits() < Width)
8163 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8164
8165 if (V1.getValueSizeInBits() > Width)
8166 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8167 else if (V1.getValueSizeInBits() < Width)
8168 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8169
8170 unsigned NumElts = VT.getVectorNumElements();
8171 APInt DemandedElts = APInt::getAllOnes(NumElts);
8172 for (unsigned i = 0; i != NumElts; ++i)
8173 if (BV->getOperand(i).isUndef())
8174 DemandedElts.clearBit(i);
8175
8176 // If we don't need the upper xmm, then perform as a xmm hop.
8177 unsigned HalfNumElts = NumElts / 2;
8178 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8179 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8180 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8181 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8182 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8183 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8184 }
8185
8186 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8187}
8188
8189/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
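/// Illustrative example (not from the original source), for v4f32 with SSE3:
///   (build_vector (fadd a0, a1), (fadd a2, a3), (fadd b0, b1), (fadd b2, b3))
/// where aN/bN are extracts of adjacent lanes of vectors A and B becomes
///   (X86ISD::FHADD A, B), i.e. a single HADDPS.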
8190static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8191 const X86Subtarget &Subtarget,
8192 SelectionDAG &DAG) {
8193 // We need at least 2 non-undef elements to make this worthwhile by default.
8194 unsigned NumNonUndefs =
8195 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8196 if (NumNonUndefs < 2)
8197 return SDValue();
8198
8199 // There are 4 sets of horizontal math operations distinguished by type:
8200 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8201 // subtarget feature. Try to match those "native" patterns first.
8202 MVT VT = BV->getSimpleValueType(0);
8203 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8204 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8205 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8206 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8207 unsigned HOpcode;
8208 SDValue V0, V1;
8209 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8210 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8211 }
8212
8213 // Try harder to match 256-bit ops by using extract/concat.
8214 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8215 return SDValue();
8216
8217 // Count the number of UNDEF operands in the input build_vector.
8218 unsigned NumElts = VT.getVectorNumElements();
8219 unsigned Half = NumElts / 2;
8220 unsigned NumUndefsLO = 0;
8221 unsigned NumUndefsHI = 0;
8222 for (unsigned i = 0, e = Half; i != e; ++i)
8223 if (BV->getOperand(i)->isUndef())
8224 NumUndefsLO++;
8225
8226 for (unsigned i = Half, e = NumElts; i != e; ++i)
8227 if (BV->getOperand(i)->isUndef())
8228 NumUndefsHI++;
8229
8230 SDValue InVec0, InVec1;
8231 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8232 SDValue InVec2, InVec3;
8233 unsigned X86Opcode;
8234 bool CanFold = true;
8235
8236 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8237 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8238 InVec3) &&
8239 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8240 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8241 X86Opcode = X86ISD::HADD;
8242 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8243 InVec1) &&
8244 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8245 InVec3) &&
8246 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8247 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8248 X86Opcode = X86ISD::HSUB;
8249 else
8250 CanFold = false;
8251
8252 if (CanFold) {
8253 // Do not try to expand this build_vector into a pair of horizontal
8254 // add/sub if we can emit a pair of scalar add/sub.
8255 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8256 return SDValue();
8257
8258 // Convert this build_vector into a pair of horizontal binops followed by
8259 // a concat vector. We must adjust the outputs from the partial horizontal
8260 // matching calls above to account for undefined vector halves.
8261 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8262 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8263 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8264 bool isUndefLO = NumUndefsLO == Half;
8265 bool isUndefHI = NumUndefsHI == Half;
8266 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8267 isUndefHI);
8268 }
8269 }
8270
8271 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8272 VT == MVT::v16i16) {
8273 unsigned X86Opcode;
8274 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8275 InVec1))
8276 X86Opcode = X86ISD::HADD;
8277 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8278 InVec1))
8279 X86Opcode = X86ISD::HSUB;
8280 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8281 InVec1))
8282 X86Opcode = X86ISD::FHADD;
8283 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8284 InVec1))
8285 X86Opcode = X86ISD::FHSUB;
8286 else
8287 return SDValue();
8288
8289 // Don't try to expand this build_vector into a pair of horizontal add/sub
8290 // if we can simply emit a pair of scalar add/sub.
8291 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8292 return SDValue();
8293
8294 // Convert this build_vector into two horizontal add/sub followed by
8295 // a concat vector.
8296 bool isUndefLO = NumUndefsLO == Half;
8297 bool isUndefHI = NumUndefsHI == Half;
8298 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8299 isUndefLO, isUndefHI);
8300 }
8301
8302 return SDValue();
8303}
8304
8305static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8306 SelectionDAG &DAG);
8307
8308/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8309/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
8310/// just apply the bit operation to the vectors.
8311/// NOTE: It's not in our interest to start making a general-purpose vectorizer
8312/// from this, but enough scalar bit operations are created by the later
8313/// legalization + scalarization stages to need basic support.
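/// Illustrative example (not from the original source):
///   (build_vector (and a, 1), (and b, 2), (and c, 4), (and d, 8))
/// becomes (and (build_vector a, b, c, d), (build_vector 1, 2, 4, 8));
/// shifts are handled similarly but only for uniform shift amounts.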
8314static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8315 const X86Subtarget &Subtarget,
8316 SelectionDAG &DAG) {
8317 MVT VT = Op->getSimpleValueType(0);
8318 unsigned NumElems = VT.getVectorNumElements();
8319 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8320
8321 // Check that all elements have the same opcode.
8322 // TODO: Should we allow UNDEFS and if so how many?
8323 unsigned Opcode = Op->getOperand(0).getOpcode();
8324 for (unsigned i = 1; i < NumElems; ++i)
8325 if (Opcode != Op->getOperand(i).getOpcode())
8326 return SDValue();
8327
8328 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8329 bool IsShift = false;
8330 switch (Opcode) {
8331 default:
8332 return SDValue();
8333 case ISD::SHL:
8334 case ISD::SRL:
8335 case ISD::SRA:
8336 IsShift = true;
8337 break;
8338 case ISD::AND:
8339 case ISD::XOR:
8340 case ISD::OR:
8341 // Don't do this if the buildvector is a splat - we'd replace one
8342 // constant with an entire vector.
8343 if (Op->getSplatValue())
8344 return SDValue();
8345 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8346 return SDValue();
8347 break;
8348 }
8349
8350 SmallVector<SDValue, 4> LHSElts, RHSElts;
8351 for (SDValue Elt : Op->ops()) {
8352 SDValue LHS = Elt.getOperand(0);
8353 SDValue RHS = Elt.getOperand(1);
8354
8355 // We expect the canonicalized RHS operand to be the constant.
8356 if (!isa<ConstantSDNode>(RHS))
8357 return SDValue();
8358
8359 // Extend shift amounts.
8360 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8361 if (!IsShift)
8362 return SDValue();
8363 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8364 }
8365
8366 LHSElts.push_back(LHS);
8367 RHSElts.push_back(RHS);
8368 }
8369
8370 // Limit to shifts by uniform immediates.
8371 // TODO: Only accept vXi8/vXi64 special cases?
8372 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8373 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8374 return SDValue();
8375
8376 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8377 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8378 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8379
8380 if (!IsShift)
8381 return Res;
8382
8383 // Immediately lower the shift to ensure the constant build vector doesn't
8384 // get converted to a constant pool before the shift is lowered.
8385 return LowerShift(Res, Subtarget, DAG);
8386}
8387
8388/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8389/// functionality to do this, so it's all zeros, all ones, or some derivation
8390/// that is cheap to calculate.
8391static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8392 SelectionDAG &DAG,
8393 const X86Subtarget &Subtarget) {
8394 MVT VT = Op.getSimpleValueType();
8395
8396 // Vectors containing all zeros can be matched by pxor and xorps.
8397 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8398 return Op;
8399
8400 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8401 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8402 // vpcmpeqd on 256-bit vectors.
8403 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8404 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8405 return Op;
8406
8407 return getOnesVector(VT, DAG, DL);
8408 }
8409
8410 return SDValue();
8411}
8412
8413/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8414/// from a vector of source values and a vector of extraction indices.
8415/// The vectors might be manipulated to match the type of the permute op.
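/// Illustrative example (not from the original source): for v16i8 on SSSE3 this
/// emits (X86ISD::PSHUFB SrcVec, IndicesVec); for wider element types the source
/// may be bitcast (e.g. v4i32 -> v16i8) and the indices scaled to byte indices
/// first.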
8416static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8417 const SDLoc &DL, SelectionDAG &DAG,
8418 const X86Subtarget &Subtarget) {
8419 MVT ShuffleVT = VT;
8420 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8421 unsigned NumElts = VT.getVectorNumElements();
8422 unsigned SizeInBits = VT.getSizeInBits();
8423
8424 // Adjust IndicesVec to match VT size.
8425 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8426 "Illegal variable permute mask size");
8427 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8428 // Narrow/widen the indices vector to the correct size.
8429 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8430 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8431 NumElts * VT.getScalarSizeInBits());
8432 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8433 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8434 SDLoc(IndicesVec), SizeInBits);
8435 // Zero-extend the index elements within the vector.
8436 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8437 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8438 IndicesVT, IndicesVec);
8439 }
8440 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8441
8442 // Handle a SrcVec whose size doesn't match the VT size.
8443 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8444 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8445 // Handle larger SrcVec by treating it as a larger permute.
8446 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8447 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8448 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8449 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8450 Subtarget, DAG, SDLoc(IndicesVec));
8451 SDValue NewSrcVec =
8452 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8453 if (NewSrcVec)
8454 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8455 return SDValue();
8456 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8457 // Widen smaller SrcVec to match VT.
8458 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8459 } else
8460 return SDValue();
8461 }
8462
8463 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8464 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8465 EVT SrcVT = Idx.getValueType();
8466 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8467 uint64_t IndexScale = 0;
8468 uint64_t IndexOffset = 0;
8469
8470 // If we're scaling a smaller permute op, then we need to repeat the
8471 // indices, scaling and offsetting them as well.
8472 // e.g. v4i32 -> v16i8 (Scale = 4)
8473 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8474 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8475 for (uint64_t i = 0; i != Scale; ++i) {
8476 IndexScale |= Scale << (i * NumDstBits);
8477 IndexOffset |= i << (i * NumDstBits);
8478 }
8479
8480 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8481 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8482 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8483 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8484 return Idx;
8485 };
8486
8487 unsigned Opcode = 0;
8488 switch (VT.SimpleTy) {
8489 default:
8490 break;
8491 case MVT::v16i8:
8492 if (Subtarget.hasSSSE3())
8493 Opcode = X86ISD::PSHUFB;
8494 break;
8495 case MVT::v8i16:
8496 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8497 Opcode = X86ISD::VPERMV;
8498 else if (Subtarget.hasSSSE3()) {
8499 Opcode = X86ISD::PSHUFB;
8500 ShuffleVT = MVT::v16i8;
8501 }
8502 break;
8503 case MVT::v4f32:
8504 case MVT::v4i32:
8505 if (Subtarget.hasAVX()) {
8506 Opcode = X86ISD::VPERMILPV;
8507 ShuffleVT = MVT::v4f32;
8508 } else if (Subtarget.hasSSSE3()) {
8509 Opcode = X86ISD::PSHUFB;
8510 ShuffleVT = MVT::v16i8;
8511 }
8512 break;
8513 case MVT::v2f64:
8514 case MVT::v2i64:
8515 if (Subtarget.hasAVX()) {
8516 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8517 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8518 Opcode = X86ISD::VPERMILPV;
8519 ShuffleVT = MVT::v2f64;
8520 } else if (Subtarget.hasSSE41()) {
8521 // SSE41 can compare v2i64 - select between indices 0 and 1.
8522 return DAG.getSelectCC(
8523 DL, IndicesVec,
8524 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8525 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8526 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8527 ISD::CondCode::SETEQ);
8528 }
8529 break;
8530 case MVT::v32i8:
8531 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8532 Opcode = X86ISD::VPERMV;
8533 else if (Subtarget.hasXOP()) {
8534 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8535 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8536 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8537 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8538 return DAG.getNode(
8539 ISD::CONCAT_VECTORS, DL, MVT::v32i8,
8540 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8541 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8542 } else if (Subtarget.hasAVX()) {
8543 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8544 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8545 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8546 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8547 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8548 ArrayRef<SDValue> Ops) {
8549 // Permute Lo and Hi and then select based on index range.
8550 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
8551 // care about bit[7] as it's just an index vector.
8552 SDValue Idx = Ops[2];
8553 EVT VT = Idx.getValueType();
8554 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8555 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8556 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8557 ISD::CondCode::SETGT);
8558 };
8559 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8560 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8561 PSHUFBBuilder);
8562 }
8563 break;
8564 case MVT::v16i16:
8565 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8566 Opcode = X86ISD::VPERMV;
8567 else if (Subtarget.hasAVX()) {
8568 // Scale to v32i8 and perform as v32i8.
8569 IndicesVec = ScaleIndices(IndicesVec, 2);
8570 return DAG.getBitcast(
8571 VT, createVariablePermute(
8572 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8573 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8574 }
8575 break;
8576 case MVT::v8f32:
8577 case MVT::v8i32:
8578 if (Subtarget.hasAVX2())
8579 Opcode = X86ISD::VPERMV;
8580 else if (Subtarget.hasAVX()) {
8581 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8582 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8583 {0, 1, 2, 3, 0, 1, 2, 3});
8584 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8585 {4, 5, 6, 7, 4, 5, 6, 7});
8586 if (Subtarget.hasXOP())
8587 return DAG.getBitcast(
8588 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
8589 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8590 // Permute Lo and Hi and then select based on index range.
8591 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8592 SDValue Res = DAG.getSelectCC(
8593 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8594 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8595 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8596 ISD::CondCode::SETGT);
8597 return DAG.getBitcast(VT, Res);
8598 }
8599 break;
8600 case MVT::v4i64:
8601 case MVT::v4f64:
8602 if (Subtarget.hasAVX512()) {
8603 if (!Subtarget.hasVLX()) {
8604 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8605 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8606 SDLoc(SrcVec));
8607 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8608 DAG, SDLoc(IndicesVec));
8609 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8610 DAG, Subtarget);
8611 return extract256BitVector(Res, 0, DAG, DL);
8612 }
8613 Opcode = X86ISD::VPERMV;
8614 } else if (Subtarget.hasAVX()) {
8615 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8616 SDValue LoLo =
8617 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8618 SDValue HiHi =
8619 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8620 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8621 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8622 if (Subtarget.hasXOP())
8623 return DAG.getBitcast(
8624 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
8625 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8626 // Permute Lo and Hi and then select based on index range.
8627 // This works as VPERMILPD only uses index bit[1] to permute elements.
8628 SDValue Res = DAG.getSelectCC(
8629 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8630 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8631 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8632 ISD::CondCode::SETGT);
8633 return DAG.getBitcast(VT, Res);
8634 }
8635 break;
8636 case MVT::v64i8:
8637 if (Subtarget.hasVBMI())
8638 Opcode = X86ISD::VPERMV;
8639 break;
8640 case MVT::v32i16:
8641 if (Subtarget.hasBWI())
8642 Opcode = X86ISD::VPERMV;
8643 break;
8644 case MVT::v16f32:
8645 case MVT::v16i32:
8646 case MVT::v8f64:
8647 case MVT::v8i64:
8648 if (Subtarget.hasAVX512())
8649 Opcode = X86ISD::VPERMV;
8650 break;
8651 }
8652 if (!Opcode)
8653 return SDValue();
8654
8655 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8656 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8657 "Illegal variable permute shuffle type");
8658
8659 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8660 if (Scale > 1)
8661 IndicesVec = ScaleIndices(IndicesVec, Scale);
8662
8663 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8664 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8665
8666 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8667 SDValue Res = Opcode == X86ISD::VPERMV
8668 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8669 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8670 return DAG.getBitcast(VT, Res);
8671}
8672
8673// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8674// reasoned to be a permutation of a vector by indices in a non-constant vector.
8675// (build_vector (extract_elt V, (extract_elt I, 0)),
8676// (extract_elt V, (extract_elt I, 1)),
8677// ...
8678// ->
8679// (vpermv I, V)
8680//
8681// TODO: Handle undefs
8682// TODO: Utilize pshufb and zero mask blending to support more efficient
8683// construction of vectors with constant-0 elements.
8684static SDValue
8685LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
8686 SelectionDAG &DAG,
8687 const X86Subtarget &Subtarget) {
8688 SDValue SrcVec, IndicesVec;
8689 // Check for a match of the permute source vector and permute index elements.
8690 // This is done by checking that the i-th build_vector operand is of the form:
8691 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8692 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8693 SDValue Op = V.getOperand(Idx);
8694 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8695 return SDValue();
8696
8697 // If this is the first extract encountered in V, set the source vector,
8698 // otherwise verify the extract is from the previously defined source
8699 // vector.
8700 if (!SrcVec)
8701 SrcVec = Op.getOperand(0);
8702 else if (SrcVec != Op.getOperand(0))
8703 return SDValue();
8704 SDValue ExtractedIndex = Op->getOperand(1);
8705 // Peek through extends.
8706 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8707 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8708 ExtractedIndex = ExtractedIndex.getOperand(0);
8709 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8710 return SDValue();
8711
8712 // If this is the first extract from the index vector candidate, set the
8713 // indices vector, otherwise verify the extract is from the previously
8714 // defined indices vector.
8715 if (!IndicesVec)
8716 IndicesVec = ExtractedIndex.getOperand(0);
8717 else if (IndicesVec != ExtractedIndex.getOperand(0))
8718 return SDValue();
8719
8720 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8721 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
8722 return SDValue();
8723 }
8724
8725 MVT VT = V.getSimpleValueType();
8726 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8727}
8728
8729SDValue
8730X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8731 SDLoc dl(Op);
8732
8733 MVT VT = Op.getSimpleValueType();
8734 MVT EltVT = VT.getVectorElementType();
8735 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
8736 unsigned NumElems = Op.getNumOperands();
8737
8738 // Generate vectors for predicate vectors.
8739 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
8740 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
8741
8742 if (VT.getVectorElementType() == MVT::bf16 &&
8743 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
8744 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
8745
8746 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
8747 return VectorCst;
8748
8749 unsigned EVTBits = EltVT.getSizeInBits();
8750 APInt UndefMask = APInt::getZero(NumElems);
8751 APInt FrozenUndefMask = APInt::getZero(NumElems);
8752 APInt ZeroMask = APInt::getZero(NumElems);
8753 APInt NonZeroMask = APInt::getZero(NumElems);
8754 bool IsAllConstants = true;
8755 bool OneUseFrozenUndefs = true;
8756 SmallSet<SDValue, 8> Values;
8757 unsigned NumConstants = NumElems;
8758 for (unsigned i = 0; i < NumElems; ++i) {
8759 SDValue Elt = Op.getOperand(i);
8760 if (Elt.isUndef()) {
8761 UndefMask.setBit(i);
8762 continue;
8763 }
8764 if (ISD::isFreezeUndef(Elt.getNode())) {
8765 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
8766 FrozenUndefMask.setBit(i);
8767 continue;
8768 }
8769 Values.insert(Elt);
8770 if (!isIntOrFPConstant(Elt)) {
8771 IsAllConstants = false;
8772 NumConstants--;
8773 }
8774 if (X86::isZeroNode(Elt)) {
8775 ZeroMask.setBit(i);
8776 } else {
8777 NonZeroMask.setBit(i);
8778 }
8779 }
8780
8781 // All undef vector. Return an UNDEF.
8782 if (UndefMask.isAllOnes())
8783 return DAG.getUNDEF(VT);
8784
8785 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
8786 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
8787 return DAG.getFreeze(DAG.getUNDEF(VT));
8788
8789 // All undef/freeze(undef)/zero vector. Return a zero vector.
8790 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
8791 return getZeroVector(VT, Subtarget, DAG, dl);
8792
8793 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
8794 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
8795 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
8796 // and blend the FREEZE-UNDEF operands back in.
8797 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
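// For example, a v4i32 build_vector <x, freeze(undef), freeze(undef), y>
// becomes the thawed build_vector <x, undef, undef, y> blended with a
// freeze(undef) splat build_vector using the blend mask <0, 5, 6, 3>.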
8798 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
8799 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
8800 SmallVector<int, 16> BlendMask(NumElems, -1);
8801 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
8802 for (unsigned i = 0; i < NumElems; ++i) {
8803 if (UndefMask[i]) {
8804 BlendMask[i] = -1;
8805 continue;
8806 }
8807 BlendMask[i] = i;
8808 if (!FrozenUndefMask[i])
8809 Elts[i] = Op.getOperand(i);
8810 else
8811 BlendMask[i] += NumElems;
8812 }
8813 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
8814 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
8815 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
8816 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
8817 }
8818
8819 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
8820
8821 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
8822 // be better off lowering to a smaller build vector and padding with
8823 // undef/zero.
8824 if ((VT.is256BitVector() || VT.is512BitVector()) &&
8826 unsigned UpperElems = NumElems / 2;
8827 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
8828 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
8829 if (NumUpperUndefsOrZeros >= UpperElems) {
8830 if (VT.is512BitVector() &&
8831 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
8832 UpperElems = NumElems - (NumElems / 4);
8833 // If freeze(undef) is in any upper elements, force to zero.
8834 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
8835 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
8836 SDValue NewBV =
8837 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
8838 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
8839 }
8840 }
8841
8842 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
8843 return AddSub;
8844 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
8845 return HorizontalOp;
8846 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
8847 return Broadcast;
8848 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
8849 return BitOp;
8850
8851 unsigned NumZero = ZeroMask.popcount();
8852 unsigned NumNonZero = NonZeroMask.popcount();
8853
8854 // If we are inserting one variable into a vector of non-zero constants, try
8855 // to avoid loading each constant element as a scalar. Load the constants as a
8856 // vector and then insert the variable scalar element. If insertion is not
8857 // supported, fall back to a shuffle to get the scalar blended with the
8858 // constants. Insertion into a zero vector is handled as a special-case
8859 // somewhere below here.
8860 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
8861 FrozenUndefMask.isZero() &&
8864 // Create an all-constant vector. The variable element in the old
8865 // build vector is replaced by undef in the constant vector. Save the
8866 // variable scalar element and its index for use in the insertelement.
8867 LLVMContext &Context = *DAG.getContext();
8868 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
8869 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
8870 SDValue VarElt;
8871 SDValue InsIndex;
8872 for (unsigned i = 0; i != NumElems; ++i) {
8873 SDValue Elt = Op.getOperand(i);
8874 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
8875 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
8876 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
8877 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
8878 else if (!Elt.isUndef()) {
8879 assert(!VarElt.getNode() && !InsIndex.getNode() &&
8880 "Expected one variable element in this vector");
8881 VarElt = Elt;
8882 InsIndex = DAG.getVectorIdxConstant(i, dl);
8883 }
8884 }
8885 Constant *CV = ConstantVector::get(ConstVecOps);
8886 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
8887
8888 // The constants we just created may not be legal (e.g., floating point). We
8889 // must lower the vector right here because we cannot guarantee that we'll
8890 // legalize it before loading it. This is also why we could not just create
8891 // a new build vector here. If the build vector contains illegal constants,
8892 // it could get split back up into a series of insert elements.
8893 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8894 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8895 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(
8896 DAG.getMachineFunction());
8897 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8898 unsigned InsertC = InsIndex->getAsZExtVal();
8899 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
8900 if (InsertC < NumEltsInLow128Bits)
8901 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
8902
8903 // There's no good way to insert into the high elements of a >128-bit
8904 // vector, so use shuffles to avoid an extract/insert sequence.
8905 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
8906 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
8907 SmallVector<int, 8> ShuffleMask;
8908 unsigned NumElts = VT.getVectorNumElements();
8909 for (unsigned i = 0; i != NumElts; ++i)
8910 ShuffleMask.push_back(i == InsertC ? NumElts : i);
8911 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
8912 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
8913 }
8914
8915 // Special case for single non-zero, non-undef, element.
8916 if (NumNonZero == 1) {
8917 unsigned Idx = NonZeroMask.countr_zero();
8918 SDValue Item = Op.getOperand(Idx);
8919
8920 // If we have a constant or non-constant insertion into the low element of
8921 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8922 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8923 // depending on what the source datatype is.
8924 if (Idx == 0) {
8925 if (NumZero == 0)
8926 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8927
8928 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
8929 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
8930 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
8931 assert((VT.is128BitVector() || VT.is256BitVector() ||
8932 VT.is512BitVector()) &&
8933 "Expected an SSE value type!");
8934 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8935 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
8936 // zero vector.
8937 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8938 }
8939
8940 // We can't directly insert an i8 or i16 into a vector, so zero extend
8941 // it to i32 first.
8942 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
8943 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8944 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
8945 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8946 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8947 return DAG.getBitcast(VT, Item);
8948 }
8949 }
8950
8951 // Is it a vector logical left shift?
8952 if (NumElems == 2 && Idx == 1 &&
8953 X86::isZeroNode(Op.getOperand(0)) &&
8954 !X86::isZeroNode(Op.getOperand(1))) {
8955 unsigned NumBits = VT.getSizeInBits();
8956 return getVShift(true, VT,
8957 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8958 VT, Op.getOperand(1)),
8959 NumBits/2, DAG, *this, dl);
8960 }
8961
8962 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
8963 return SDValue();
8964
8965 // Otherwise, if this is a vector with i32 or f32 elements, and the element
8966 // is a non-constant being inserted into an element other than the low one,
8967 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
8968 // movd/movss) to move this into the low element, then shuffle it into
8969 // place.
8970 if (EVTBits == 32) {
8971 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8972 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
8973 }
8974 }
8975
8976 // Splat is obviously ok. Let legalizer expand it to a shuffle.
8977 if (Values.size() == 1) {
8978 if (EVTBits == 32) {
8979 // Instead of a shuffle like this:
8980 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
8981 // Check if it's possible to issue this instead.
8982 // shuffle (vload ptr), undef, <1, 1, 1, 1>
8983 unsigned Idx = NonZeroMask.countr_zero();
8984 SDValue Item = Op.getOperand(Idx);
8985 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
8986 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
8987 }
8988 return SDValue();
8989 }
8990
8991 // A vector full of immediates; various special cases are already
8992 // handled, so this is best done with a single constant-pool load.
8993 if (IsAllConstants)
8994 return SDValue();
8995
8996 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
8997 return V;
8998
8999 // See if we can use a vector load to get all of the elements.
9000 {
9001 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9002 if (SDValue LD =
9003 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9004 return LD;
9005 }
9006
9007 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9008 // build_vector and broadcast it.
9009 // TODO: We could probably generalize this more.
9010 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9011 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9012 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9013 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9014 // Make sure all the even/odd operands match.
9015 for (unsigned i = 2; i != NumElems; ++i)
9016 if (Ops[i % 2] != Op.getOperand(i))
9017 return false;
9018 return true;
9019 };
9020 if (CanSplat(Op, NumElems, Ops)) {
9021 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9022 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9023 // Create a new build vector and cast to v2i64/v2f64.
9024 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9025 DAG.getBuildVector(NarrowVT, dl, Ops));
9026 // Broadcast from v2i64/v2f64 and cast to final VT.
9027 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9028 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9029 NewBV));
9030 }
9031 }
9032
9033 // For AVX-length vectors, build the individual 128-bit pieces and use
9034 // shuffles to put them in place.
9035 if (VT.getSizeInBits() > 128) {
9036 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9037
9038 // Build both the lower and upper subvector.
9039 SDValue Lower =
9040 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9041 SDValue Upper = DAG.getBuildVector(
9042 HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
9043
9044 // Recreate the wider vector with the lower and upper part.
9045 return concatSubVectors(Lower, Upper, DAG, dl);
9046 }
9047
9048 // Let legalizer expand 2-wide build_vectors.
9049 if (EVTBits == 64) {
9050 if (NumNonZero == 1) {
9051 // One half is zero or undef.
9052 unsigned Idx = NonZeroMask.countr_zero();
9053 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9054 Op.getOperand(Idx));
9055 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9056 }
9057 return SDValue();
9058 }
9059
9060 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9061 if (EVTBits == 8 && NumElems == 16)
9062 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9063 NumZero, DAG, Subtarget))
9064 return V;
9065
9066 if (EltVT == MVT::i16 && NumElems == 8)
9067 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9068 NumZero, DAG, Subtarget))
9069 return V;
9070
9071 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9072 if (EVTBits == 32 && NumElems == 4)
9073 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9074 return V;
9075
9076 // If element VT is == 32 bits, turn it into a number of shuffles.
9077 if (NumElems == 4 && NumZero > 0) {
9078 SmallVector<SDValue, 8> Ops(NumElems);
9079 for (unsigned i = 0; i < 4; ++i) {
9080 bool isZero = !NonZeroMask[i];
9081 if (isZero)
9082 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9083 else
9084 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9085 }
9086
9087 for (unsigned i = 0; i < 2; ++i) {
9088 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9089 default: llvm_unreachable("Unexpected NonZero count");
9090 case 0:
9091 Ops[i] = Ops[i*2]; // Must be a zero vector.
9092 break;
9093 case 1:
9094 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9095 break;
9096 case 2:
9097 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9098 break;
9099 case 3:
9100 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9101 break;
9102 }
9103 }
9104
9105 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9106 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9107 int MaskVec[] = {
9108 Reverse1 ? 1 : 0,
9109 Reverse1 ? 0 : 1,
9110 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9111 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9112 };
9113 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9114 }
9115
9116 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9117
9118 // Check for a build vector from mostly shuffle plus few inserting.
9119 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9120 return Sh;
9121
9122 // For SSE 4.1, use insertps to put the high elements into the low element.
9123 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9124 SDValue Result;
9125 if (!Op.getOperand(0).isUndef())
9126 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9127 else
9128 Result = DAG.getUNDEF(VT);
9129
9130 for (unsigned i = 1; i < NumElems; ++i) {
9131 if (Op.getOperand(i).isUndef()) continue;
9132 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9133 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
9134 }
9135 return Result;
9136 }
9137
9138 // Otherwise, expand into a number of unpckl*, start by extending each of
9139 // our (non-undef) elements to the full vector width with the element in the
9140 // bottom slot of the vector (which generates no code for SSE).
9141 SmallVector<SDValue, 8> Ops(NumElems);
9142 for (unsigned i = 0; i < NumElems; ++i) {
9143 if (!Op.getOperand(i).isUndef())
9144 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9145 else
9146 Ops[i] = DAG.getUNDEF(VT);
9147 }
9148
9149 // Next, we iteratively mix elements, e.g. for v4f32:
9150 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9151 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9152 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9153 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9154 // Generate scaled UNPCKL shuffle mask.
9155 SmallVector<int, 16> Mask;
9156 for (unsigned i = 0; i != Scale; ++i)
9157 Mask.push_back(i);
9158 for (unsigned i = 0; i != Scale; ++i)
9159 Mask.push_back(NumElems+i);
9160 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9161
9162 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9163 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9164 }
9165 return Ops[0];
9166}
9167
9168// 256-bit AVX can use the vinsertf128 instruction
9169// to create 256-bit vectors from two other 128-bit ones.
9170// TODO: Detect subvector broadcast here instead of DAG combine?
9171 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9172 const X86Subtarget &Subtarget) {
9173 SDLoc dl(Op);
9174 MVT ResVT = Op.getSimpleValueType();
9175
9176 assert((ResVT.is256BitVector() ||
9177 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
9178
9179 unsigned NumOperands = Op.getNumOperands();
9180 unsigned NumFreezeUndef = 0;
9181 unsigned NumZero = 0;
9182 unsigned NumNonZero = 0;
9183 unsigned NonZeros = 0;
9184 for (unsigned i = 0; i != NumOperands; ++i) {
9185 SDValue SubVec = Op.getOperand(i);
9186 if (SubVec.isUndef())
9187 continue;
9188 if (ISD::isFreezeUndef(SubVec.getNode())) {
9189 // If the freeze(undef) has multiple uses then we must fold to zero.
9190 if (SubVec.hasOneUse())
9191 ++NumFreezeUndef;
9192 else
9193 ++NumZero;
9194 }
9195 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9196 ++NumZero;
9197 else {
9198 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9199 NonZeros |= 1 << i;
9200 ++NumNonZero;
9201 }
9202 }
9203
9204 // If we have more than 2 non-zeros, build each half separately.
9205 if (NumNonZero > 2) {
9206 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9207 ArrayRef<SDUse> Ops = Op->ops();
9208 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9209 Ops.slice(0, NumOperands/2));
9210 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9211 Ops.slice(NumOperands/2));
9212 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9213 }
9214
9215 // Otherwise, build it up through insert_subvectors.
9216 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9217 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9218 : DAG.getUNDEF(ResVT));
9219
9220 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9221 unsigned NumSubElems = SubVT.getVectorNumElements();
9222 for (unsigned i = 0; i != NumOperands; ++i) {
9223 if ((NonZeros & (1 << i)) == 0)
9224 continue;
9225
9226 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
9227 Op.getOperand(i),
9228 DAG.getIntPtrConstant(i * NumSubElems, dl));
9229 }
9230
9231 return Vec;
9232}
9233
9234 // Lowers a vXi1 CONCAT_VECTORS node. Concatenating with i1 zeros is a type
9235 // promotion of the non-zero operand, and can usually be done with a single
9236 // KSHIFTL or INSERT_SUBVECTOR of the widened k-register.
9237// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9238 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
9239 const X86Subtarget &Subtarget,
9240 SelectionDAG & DAG) {
9241 SDLoc dl(Op);
9242 MVT ResVT = Op.getSimpleValueType();
9243 unsigned NumOperands = Op.getNumOperands();
9244
9245 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9246 "Unexpected number of operands in CONCAT_VECTORS");
9247
9248 uint64_t Zeros = 0;
9249 uint64_t NonZeros = 0;
9250 for (unsigned i = 0; i != NumOperands; ++i) {
9251 SDValue SubVec = Op.getOperand(i);
9252 if (SubVec.isUndef())
9253 continue;
9254 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9255 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9256 Zeros |= (uint64_t)1 << i;
9257 else
9258 NonZeros |= (uint64_t)1 << i;
9259 }
9260
9261 unsigned NumElems = ResVT.getVectorNumElements();
9262
9263 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9264 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9265 // insert_subvector will give us two kshifts.
9266 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9267 Log2_64(NonZeros) != NumOperands - 1) {
9268 unsigned Idx = Log2_64(NonZeros);
9269 SDValue SubVec = Op.getOperand(Idx);
9270 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9271 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9272 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9273 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9274 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9275 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9276 DAG.getIntPtrConstant(0, dl));
9277 }
9278
9279 // If there are zero or one non-zeros we can handle this very simply.
9280 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9281 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9282 if (!NonZeros)
9283 return Vec;
9284 unsigned Idx = Log2_64(NonZeros);
9285 SDValue SubVec = Op.getOperand(Idx);
9286 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9287 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9288 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
9289 }
9290
9291 if (NumOperands > 2) {
9292 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9293 ArrayRef<SDUse> Ops = Op->ops();
9294 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9295 Ops.slice(0, NumOperands/2));
9296 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9297 Ops.slice(NumOperands/2));
9298 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9299 }
9300
9301 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9302
9303 if (ResVT.getVectorNumElements() >= 16)
9304 return Op; // The operation is legal with KUNPCK
9305
9306 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
9307 DAG.getUNDEF(ResVT), Op.getOperand(0),
9308 DAG.getIntPtrConstant(0, dl));
9309 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9310 DAG.getIntPtrConstant(NumElems/2, dl));
9311}
9312
9313 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9314 const X86Subtarget &Subtarget,
9315 SelectionDAG &DAG) {
9316 MVT VT = Op.getSimpleValueType();
9317 if (VT.getVectorElementType() == MVT::i1)
9318 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
9319
9320 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9321 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
9322 Op.getNumOperands() == 4)));
9323
9324 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9325 // from two other 128-bit ones.
9326
9327 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9328 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
9329}
9330
9331//===----------------------------------------------------------------------===//
9332// Vector shuffle lowering
9333//
9334// This is an experimental code path for lowering vector shuffles on x86. It is
9335// designed to handle arbitrary vector shuffles and blends, gracefully
9336// degrading performance as necessary. It works hard to recognize idiomatic
9337// shuffles and lower them to optimal instruction patterns without leaving
9338// a framework that allows reasonably efficient handling of all vector shuffle
9339// patterns.
9340//===----------------------------------------------------------------------===//
9341
9342/// Tiny helper function to identify a no-op mask.
9343///
9344/// This is a somewhat boring predicate function. It checks whether the mask
9345/// array input, which is assumed to be a single-input shuffle mask of the kind
9346/// used by the X86 shuffle instructions (not a fully general
9347/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9348/// in-place shuffle are 'no-op's.
9349 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9350 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9351 assert(Mask[i] >= -1 && "Out of bound mask element!");
9352 if (Mask[i] >= 0 && Mask[i] != i)
9353 return false;
9354 }
9355 return true;
9356}
9357
9358/// Test whether there are elements crossing LaneSizeInBits lanes in this
9359/// shuffle mask.
9360///
9361/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9362/// and we routinely test for these.
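/// For example, for v8i32 (two 128-bit lanes of 4 elements) the mask
/// <0,1,2,3,4,5,6,7> stays within lanes, whereas <4,5,6,7,0,1,2,3> crosses
/// them: result element 0 is sourced from lane 1.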
9363static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9364 unsigned ScalarSizeInBits,
9365 ArrayRef<int> Mask) {
9366 assert(LaneSizeInBits && ScalarSizeInBits &&
9367 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9368 "Illegal shuffle lane size");
9369 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9370 int Size = Mask.size();
9371 for (int i = 0; i < Size; ++i)
9372 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9373 return true;
9374 return false;
9375}
9376
9377/// Test whether there are elements crossing 128-bit lanes in this
9378/// shuffle mask.
9379 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9380 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9381}
9382
9383/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9384/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9385/// better support 'repeated mask + lane permute' style shuffles.
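/// For example, for v8i32 the mask <4,5,6,7,0,1,2,3> crosses lanes but each
/// destination lane reads from exactly one source lane, so this returns false
/// (a lane permute can repair it), whereas <0,4,1,5,2,6,3,7> mixes both source
/// lanes into each destination lane and returns true.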
9386static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9387 unsigned ScalarSizeInBits,
9388 ArrayRef<int> Mask) {
9389 assert(LaneSizeInBits && ScalarSizeInBits &&
9390 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9391 "Illegal shuffle lane size");
9392 int NumElts = Mask.size();
9393 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9394 int NumLanes = NumElts / NumEltsPerLane;
9395 if (NumLanes > 1) {
9396 for (int i = 0; i != NumLanes; ++i) {
9397 int SrcLane = -1;
9398 for (int j = 0; j != NumEltsPerLane; ++j) {
9399 int M = Mask[(i * NumEltsPerLane) + j];
9400 if (M < 0)
9401 continue;
9402 int Lane = (M % NumElts) / NumEltsPerLane;
9403 if (SrcLane >= 0 && SrcLane != Lane)
9404 return true;
9405 SrcLane = Lane;
9406 }
9407 }
9408 }
9409 return false;
9410}
9411
9412/// Test whether a shuffle mask is equivalent within each sub-lane.
9413///
9414/// This checks a shuffle mask to see if it is performing the same
9415/// lane-relative shuffle in each sub-lane. This trivially implies
9416/// that it is also not lane-crossing. It may however involve a blend from the
9417/// same lane of a second vector.
9418///
9419/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9420/// non-trivial to compute in the face of undef lanes. The representation is
9421/// suitable for use with existing 128-bit shuffles as entries from the second
9422/// vector have been remapped to [LaneSize, 2*LaneSize).
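/// For example, for v8f32 the two-input mask <0,8,1,9,4,12,5,13> repeats the
/// same pattern in both 128-bit lanes and yields RepeatedMask <0,4,1,5>, with
/// the second-vector elements remapped into [4,8).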
9423static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9424 ArrayRef<int> Mask,
9425 SmallVectorImpl<int> &RepeatedMask) {
9426 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9427 RepeatedMask.assign(LaneSize, -1);
9428 int Size = Mask.size();
9429 for (int i = 0; i < Size; ++i) {
9430 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9431 if (Mask[i] < 0)
9432 continue;
9433 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9434 // This entry crosses lanes, so there is no way to model this shuffle.
9435 return false;
9436
9437 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9438 // Adjust second vector indices to start at LaneSize instead of Size.
9439 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9440 : Mask[i] % LaneSize + LaneSize;
9441 if (RepeatedMask[i % LaneSize] < 0)
9442 // This is the first non-undef entry in this slot of a 128-bit lane.
9443 RepeatedMask[i % LaneSize] = LocalM;
9444 else if (RepeatedMask[i % LaneSize] != LocalM)
9445 // Found a mismatch with the repeated mask.
9446 return false;
9447 }
9448 return true;
9449}
9450
9451/// Test whether a shuffle mask is equivalent within each 128-bit lane.
9452static bool
9453 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9454 SmallVectorImpl<int> &RepeatedMask) {
9455 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9456}
9457
9458static bool
9459 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
9460 SmallVector<int, 32> RepeatedMask;
9461 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9462}
9463
9464/// Test whether a shuffle mask is equivalent within each 256-bit lane.
9465static bool
9466 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9467 SmallVectorImpl<int> &RepeatedMask) {
9468 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
9469}
9470
9471/// Test whether a target shuffle mask is equivalent within each sub-lane.
9472/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9473static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
9474 unsigned EltSizeInBits,
9475 ArrayRef<int> Mask,
9476 SmallVectorImpl<int> &RepeatedMask) {
9477 int LaneSize = LaneSizeInBits / EltSizeInBits;
9478 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
9479 int Size = Mask.size();
9480 for (int i = 0; i < Size; ++i) {
9481 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
9482 if (Mask[i] == SM_SentinelUndef)
9483 continue;
9484 if (Mask[i] == SM_SentinelZero) {
9485 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
9486 return false;
9487 RepeatedMask[i % LaneSize] = SM_SentinelZero;
9488 continue;
9489 }
9490 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9491 // This entry crosses lanes, so there is no way to model this shuffle.
9492 return false;
9493
9494 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9495 // later vector indices to start at multiples of LaneSize instead of Size.
9496 int LaneM = Mask[i] / Size;
9497 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
9498 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
9499 // This is the first non-undef entry in this slot of a 128-bit lane.
9500 RepeatedMask[i % LaneSize] = LocalM;
9501 else if (RepeatedMask[i % LaneSize] != LocalM)
9502 // Found a mismatch with the repeated mask.
9503 return false;
9504 }
9505 return true;
9506}
9507
9508/// Test whether a target shuffle mask is equivalent within each sub-lane.
9509/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9510static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
9511 ArrayRef<int> Mask,
9512 SmallVectorImpl<int> &RepeatedMask) {
9513 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
9514 Mask, RepeatedMask);
9515}
9516
9517/// Checks whether the vector elements referenced by two shuffle masks are
9518/// equivalent.
9519static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9520 int Idx, int ExpectedIdx) {
9521 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9522 ExpectedIdx < MaskSize && "Out of range element index");
9523 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9524 return false;
9525
9526 switch (Op.getOpcode()) {
9527 case ISD::BUILD_VECTOR:
9528 // If the values are build vectors, we can look through them to find
9529 // equivalent inputs that make the shuffles equivalent.
9530 // TODO: Handle MaskSize != Op.getNumOperands()?
9531 if (MaskSize == (int)Op.getNumOperands() &&
9532 MaskSize == (int)ExpectedOp.getNumOperands())
9533 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9534 break;
9535 case X86ISD::VBROADCAST:
9536 case X86ISD::VBROADCAST_LOAD:
9537 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
9538 return (Op == ExpectedOp &&
9539 (int)Op.getValueType().getVectorNumElements() == MaskSize);
9540 case X86ISD::HADD:
9541 case X86ISD::HSUB:
9542 case X86ISD::FHADD:
9543 case X86ISD::FHSUB:
9544 case X86ISD::PACKSS:
9545 case X86ISD::PACKUS:
9546 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9547 // TODO: Handle MaskSize != NumElts?
9548 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9549 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9550 MVT VT = Op.getSimpleValueType();
9551 int NumElts = VT.getVectorNumElements();
9552 if (MaskSize == NumElts) {
9553 int NumLanes = VT.getSizeInBits() / 128;
9554 int NumEltsPerLane = NumElts / NumLanes;
9555 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9556 bool SameLane =
9557 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9558 bool SameElt =
9559 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9560 return SameLane && SameElt;
9561 }
9562 }
9563 break;
9564 }
9565
9566 return false;
9567}
9568
9569/// Checks whether a shuffle mask is equivalent to an explicit list of
9570/// arguments.
9571///
9572/// This is a fast way to test a shuffle mask against a fixed pattern:
9573///
9574/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
9575///
9576/// It returns true if the mask is exactly as wide as the argument list, and
9577/// each element of the mask is either -1 (signifying undef) or the value given
9578/// in the argument.
9579static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
9580 SDValue V1 = SDValue(),
9581 SDValue V2 = SDValue()) {
9582 int Size = Mask.size();
9583 if (Size != (int)ExpectedMask.size())
9584 return false;
9585
9586 for (int i = 0; i < Size; ++i) {
9587 assert(Mask[i] >= -1 && "Out of bound mask element!");
9588 int MaskIdx = Mask[i];
9589 int ExpectedIdx = ExpectedMask[i];
9590 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
9591 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9592 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9593 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9594 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9595 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9596 return false;
9597 }
9598 }
9599 return true;
9600}
9601
9602/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9603///
9604/// The masks must be exactly the same width.
9605///
9606/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9607/// value in ExpectedMask is always accepted. Otherwise the indices must match.
9608///
9609/// SM_SentinelZero is accepted as a valid negative index but must match in
9610/// both, or via a known bits test.
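/// For example, Mask <-1,1,SM_SentinelZero,3> matches ExpectedMask <0,1,2,3>
/// only if element 2 of the corresponding source operand can be proven zero by
/// the known-bits check performed at the end.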
9611 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
9612 ArrayRef<int> ExpectedMask,
9613 const SelectionDAG &DAG,
9614 SDValue V1 = SDValue(),
9615 SDValue V2 = SDValue()) {
9616 int Size = Mask.size();
9617 if (Size != (int)ExpectedMask.size())
9618 return false;
9619 assert(llvm::all_of(ExpectedMask,
9620 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
9621 "Illegal target shuffle mask");
9622
9623 // Check for out-of-range target shuffle mask indices.
9624 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
9625 return false;
9626
9627 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
9628 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
9629 !V1.getValueType().isVector()))
9630 V1 = SDValue();
9631 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
9632 !V2.getValueType().isVector()))
9633 V2 = SDValue();
9634
9635 APInt ZeroV1 = APInt::getZero(Size);
9636 APInt ZeroV2 = APInt::getZero(Size);
9637
9638 for (int i = 0; i < Size; ++i) {
9639 int MaskIdx = Mask[i];
9640 int ExpectedIdx = ExpectedMask[i];
9641 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
9642 continue;
9643 if (MaskIdx == SM_SentinelZero) {
9644 // If we need this expected index to be a zero element, then update the
9645 // relevant zero mask and perform the known bits at the end to minimize
9646 // repeated computes.
9647 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9648 if (ExpectedV &&
9649 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
9650 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9651 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
9652 ZeroMask.setBit(BitIdx);
9653 continue;
9654 }
9655 }
9656 if (MaskIdx >= 0) {
9657 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9658 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9659 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9660 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9661 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9662 continue;
9663 }
9664 return false;
9665 }
9666 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
9667 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
9668}
9669
9670// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
9671// instructions.
9672 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
9673 const SelectionDAG &DAG) {
9674 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9675 return false;
9676
9677 SmallVector<int, 8> Unpcklwd;
9678 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9679 /* Unary = */ false);
9680 SmallVector<int, 8> Unpckhwd;
9681 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9682 /* Unary = */ false);
9683 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
9684 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
9685 return IsUnpackwdMask;
9686}
9687
9688 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
9689 const SelectionDAG &DAG) {
9690 // Create 128-bit vector type based on mask size.
9691 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
9692 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
9693
9694 // We can't assume a canonical shuffle mask, so try the commuted version too.
9695 SmallVector<int, 4> CommutedMask(Mask);
9696 ShuffleVectorSDNode::commuteMask(CommutedMask);
9697
9698 // Match any of unary/binary or low/high.
9699 for (unsigned i = 0; i != 4; ++i) {
9700 SmallVector<int, 16> UnpackMask;
9701 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
9702 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
9703 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
9704 return true;
9705 }
9706 return false;
9707}
9708
9709/// Return true if a shuffle mask chooses elements identically in its top and
9710/// bottom halves. For example, any splat mask has the same top and bottom
9711/// halves. If an element is undefined in only one half of the mask, the halves
9712/// are not considered identical.
9713 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
9714 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
9715 unsigned HalfSize = Mask.size() / 2;
9716 for (unsigned i = 0; i != HalfSize; ++i) {
9717 if (Mask[i] != Mask[i + HalfSize])
9718 return false;
9719 }
9720 return true;
9721}
9722
9723/// Get a 4-lane 8-bit shuffle immediate for a mask.
9724///
9725/// This helper function produces an 8-bit shuffle immediate corresponding to
9726/// the ubiquitous shuffle encoding scheme used in x86 instructions for
9727/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
9728/// example.
9729///
9730/// NB: We rely heavily on "undef" masks preserving the input lane.
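/// For example, the mask <1,0,3,2> encodes as 0xB1 (0b10'11'00'01): two bits
/// per result element, with element 0 in the lowest bits. A single-element
/// mask such as <1,-1,-1,-1> is splatted to 0x55 to aid later broadcast
/// matching.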
9731static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9732 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9733 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
9734 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
9735 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
9736 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
9737
9738 // If the mask only uses one non-undef element, then fully 'splat' it to
9739 // improve later broadcast matching.
9740 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
9741 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
9742
9743 int FirstElt = Mask[FirstIndex];
9744 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
9745 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
9746
9747 unsigned Imm = 0;
9748 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
9749 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
9750 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
9751 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
9752 return Imm;
9753}
9754
9755 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
9756 SelectionDAG &DAG) {
9757 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
9758}
9759
9760 // The shuffle result is as follows:
9761 // 0*a[0], 0*a[1], ..., 0*a[n], n >= 0, where the a[] elements are in
9762 // ascending order. Each element of Zeroable corresponds to a particular
9763 // element of Mask, as described in the computeZeroableShuffleElements function.
9764 //
9765 // The function looks for a sub-mask whose non-zero elements are in
9766 // increasing order; if such a sub-mask exists, the function returns true.
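// For example, for v4i32 with elements 1 and 3 zeroable, the mask <0,z,1,z>
// has its non-zero elements taking source elements 0 and 1 in order, so it
// can be lowered to an EXPAND with k-mask 0b0101 (IsZeroSideLeft == false).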
9767static bool isNonZeroElementsInOrder(const APInt &Zeroable,
9768 ArrayRef<int> Mask, const EVT &VectorType,
9769 bool &IsZeroSideLeft) {
9770 int NextElement = -1;
9771 // Check if the Mask's nonzero elements are in increasing order.
9772 for (int i = 0, e = Mask.size(); i < e; i++) {
9773 // Checks if the mask's zero elements are built from only zeros.
9774 assert(Mask[i] >= -1 && "Out of bound mask element!");
9775 if (Mask[i] < 0)
9776 return false;
9777 if (Zeroable[i])
9778 continue;
9779 // Find the lowest non zero element
9780 if (NextElement < 0) {
9781 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
9782 IsZeroSideLeft = NextElement != 0;
9783 }
9784 // Exit if the mask's non zero elements are not in increasing order.
9785 if (NextElement != Mask[i])
9786 return false;
9787 NextElement++;
9788 }
9789 return true;
9790}
9791
9792/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
9793 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
9794 ArrayRef<int> Mask, SDValue V1,
9795 SDValue V2, const APInt &Zeroable,
9796 const X86Subtarget &Subtarget,
9797 SelectionDAG &DAG) {
9798 int Size = Mask.size();
9799 int LaneSize = 128 / VT.getScalarSizeInBits();
9800 const int NumBytes = VT.getSizeInBits() / 8;
9801 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
9802
9803 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
9804 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
9805 (Subtarget.hasBWI() && VT.is512BitVector()));
9806
9807 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
9808 // Sign bit set in i8 mask means zero element.
9809 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
9810
9811 SDValue V;
9812 for (int i = 0; i < NumBytes; ++i) {
9813 int M = Mask[i / NumEltBytes];
9814 if (M < 0) {
9815 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
9816 continue;
9817 }
9818 if (Zeroable[i / NumEltBytes]) {
9819 PSHUFBMask[i] = ZeroMask;
9820 continue;
9821 }
9822
9823 // We can only use a single input of V1 or V2.
9824 SDValue SrcV = (M >= Size ? V2 : V1);
9825 if (V && V != SrcV)
9826 return SDValue();
9827 V = SrcV;
9828 M %= Size;
9829
9830 // PSHUFB can't cross lanes, ensure this doesn't happen.
9831 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
9832 return SDValue();
9833
9834 M = M % LaneSize;
9835 M = M * NumEltBytes + (i % NumEltBytes);
9836 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
9837 }
9838 assert(V && "Failed to find a source input");
9839
9840 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
9841 return DAG.getBitcast(
9842 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
9843 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
9844}
9845
9846static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
9847 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9848 const SDLoc &dl);
9849
9850 // X86 has a dedicated shuffle that can be lowered to VEXPAND
9851 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
9852 const APInt &Zeroable,
9853 ArrayRef<int> Mask, SDValue &V1,
9854 SDValue &V2, SelectionDAG &DAG,
9855 const X86Subtarget &Subtarget) {
9856 bool IsLeftZeroSide = true;
9857 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
9858 IsLeftZeroSide))
9859 return SDValue();
9860 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
9861 MVT IntegerType =
9862 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9863 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
9864 unsigned NumElts = VT.getVectorNumElements();
9865 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
9866 "Unexpected number of vector elements");
9867 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
9868 Subtarget, DAG, DL);
9869 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
9870 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
9871 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
9872}
9873
9874static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
9875 unsigned &UnpackOpcode, bool IsUnary,
9876 ArrayRef<int> TargetMask, const SDLoc &DL,
9877 SelectionDAG &DAG,
9878 const X86Subtarget &Subtarget) {
9879 int NumElts = VT.getVectorNumElements();
9880
9881 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
9882 for (int i = 0; i != NumElts; i += 2) {
9883 int M1 = TargetMask[i + 0];
9884 int M2 = TargetMask[i + 1];
9885 Undef1 &= (SM_SentinelUndef == M1);
9886 Undef2 &= (SM_SentinelUndef == M2);
9887 Zero1 &= isUndefOrZero(M1);
9888 Zero2 &= isUndefOrZero(M2);
9889 }
9890 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
9891 "Zeroable shuffle detected");
9892
9893 // Attempt to match the target mask against the unpack lo/hi mask patterns.
9894 SmallVector<int, 64> Unpckl, Unpckh;
9895 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
9896 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
9897 (IsUnary ? V1 : V2))) {
9898 UnpackOpcode = X86ISD::UNPCKL;
9899 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9900 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9901 return true;
9902 }
9903
9904 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
9905 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
9906 (IsUnary ? V1 : V2))) {
9907 UnpackOpcode = X86ISD::UNPCKH;
9908 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9909 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9910 return true;
9911 }
9912
9913 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
9914 if (IsUnary && (Zero1 || Zero2)) {
9915 // Don't bother if we can blend instead.
9916 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
9917 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
9918 return false;
9919
9920 bool MatchLo = true, MatchHi = true;
9921 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
9922 int M = TargetMask[i];
9923
9924 // Ignore if the input is known to be zero or the index is undef.
9925 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
9926 (M == SM_SentinelUndef))
9927 continue;
9928
9929 MatchLo &= (M == Unpckl[i]);
9930 MatchHi &= (M == Unpckh[i]);
9931 }
9932
9933 if (MatchLo || MatchHi) {
9934 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
9935 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9936 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9937 return true;
9938 }
9939 }
9940
9941 // If a binary shuffle, commute and try again.
9942 if (!IsUnary) {
9943 ShuffleVectorSDNode::commuteMask(Unpckl);
9944 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
9945 UnpackOpcode = X86ISD::UNPCKL;
9946 std::swap(V1, V2);
9947 return true;
9948 }
9949
9950 ShuffleVectorSDNode::commuteMask(Unpckh);
9951 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
9952 UnpackOpcode = X86ISD::UNPCKH;
9953 std::swap(V1, V2);
9954 return true;
9955 }
9956 }
9957
9958 return false;
9959}
9960
9961// X86 has dedicated unpack instructions that can handle specific blend
9962// operations: UNPCKH and UNPCKL.
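// For example, for v4i32 UNPCKL corresponds to the blend mask <0,4,1,5>
// (interleaving the low halves of both inputs) and UNPCKH to <2,6,3,7>
// (interleaving the high halves).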
9963 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
9964 ArrayRef<int> Mask, SDValue V1, SDValue V2,
9965 SelectionDAG &DAG) {
9966 SmallVector<int, 8> Unpckl;
9967 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
9968 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
9969 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
9970
9971 SmallVector<int, 8> Unpckh;
9972 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
9973 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
9974 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
9975
9976 // Commute and try again.
9977 ShuffleVectorSDNode::commuteMask(Unpckl);
9978 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
9979 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
9980
9982 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
9983 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
9984
9985 return SDValue();
9986}
9987
9988/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
9989/// followed by unpack 256-bit.
9990 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
9991 ArrayRef<int> Mask, SDValue V1,
9992 SDValue V2, SelectionDAG &DAG) {
9993 SmallVector<int, 32> Unpckl, Unpckh;
9994 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
9995 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
9996
9997 unsigned UnpackOpcode;
9998 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
9999 UnpackOpcode = X86ISD::UNPCKL;
10000 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10001 UnpackOpcode = X86ISD::UNPCKH;
10002 else
10003 return SDValue();
10004
10005 // This is a "natural" unpack operation (rather than the 128-bit sectored
10006 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10007 // input in order to use the x86 instruction.
10008 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10009 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10010 V1 = DAG.getBitcast(VT, V1);
10011 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10012}
10013
10014// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10015// source into the lower elements and zeroing the upper elements.
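// For example, a v16i8 mask <0,2,4,6,8,10,12,14,z,z,z,z,z,z,z,z> with a
// zeroable upper half matches at Scale == 2: SrcVT becomes v8i16 and, since
// only 64 bits of results are produced, DstVT becomes the 128-bit v16i8 form
// used by X86ISD::VTRUNC.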
10016static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10017 ArrayRef<int> Mask, const APInt &Zeroable,
10018 const X86Subtarget &Subtarget) {
10019 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10020 return false;
10021
10022 unsigned NumElts = Mask.size();
10023 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10024 unsigned MaxScale = 64 / EltSizeInBits;
10025
10026 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10027 unsigned SrcEltBits = EltSizeInBits * Scale;
10028 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10029 continue;
10030 unsigned NumSrcElts = NumElts / Scale;
10031 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10032 continue;
10033 unsigned UpperElts = NumElts - NumSrcElts;
10034 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10035 continue;
10036 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10037 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10038 DstVT = MVT::getIntegerVT(EltSizeInBits);
10039 if ((NumSrcElts * EltSizeInBits) >= 128) {
10040 // ISD::TRUNCATE
10041 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10042 } else {
10043 // X86ISD::VTRUNC
10044 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10045 }
10046 return true;
10047 }
10048
10049 return false;
10050}
10051
10052// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10053// element padding to the final DstVT.
10054static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10055 const X86Subtarget &Subtarget,
10056 SelectionDAG &DAG, bool ZeroUppers) {
10057 MVT SrcVT = Src.getSimpleValueType();
10058 MVT DstSVT = DstVT.getScalarType();
10059 unsigned NumDstElts = DstVT.getVectorNumElements();
10060 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10061 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10062
10063 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10064 return SDValue();
10065
10066 // Perform a direct ISD::TRUNCATE if possible.
10067 if (NumSrcElts == NumDstElts)
10068 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10069
10070 if (NumSrcElts > NumDstElts) {
10071 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10072 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10073 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10074 }
10075
10076 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10077 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10078 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10079 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10080 DstVT.getSizeInBits());
10081 }
10082
10083 // Non-VLX targets must truncate from a 512-bit type, so we need to
10084 // widen, truncate and then possibly extract the original subvector.
10085 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10086 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10087 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10088 }
10089
10090 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10091 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10092 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10093 if (DstVT != TruncVT)
10094 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10095 DstVT.getSizeInBits());
10096 return Trunc;
10097}
10098
10099// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10100//
10101// An example is the following:
10102//
10103// t0: ch = EntryToken
10104// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10105// t25: v4i32 = truncate t2
10106// t41: v8i16 = bitcast t25
10107// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10108// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10109// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10110// t18: v2i64 = bitcast t51
10111//
10112 // One can just use a single vpmovdw instruction; without avx512vl we need to
10113// use the zmm variant and extract the lower subvector, padding with zeroes.
10114// TODO: Merge with lowerShuffleAsVTRUNC.
10115 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10116 SDValue V2, ArrayRef<int> Mask,
10117 const APInt &Zeroable,
10118 const X86Subtarget &Subtarget,
10119 SelectionDAG &DAG) {
10120 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10121 if (!Subtarget.hasAVX512())
10122 return SDValue();
10123
10124 unsigned NumElts = VT.getVectorNumElements();
10125 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10126 unsigned MaxScale = 64 / EltSizeInBits;
10127 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10128 unsigned SrcEltBits = EltSizeInBits * Scale;
10129 unsigned NumSrcElts = NumElts / Scale;
10130 unsigned UpperElts = NumElts - NumSrcElts;
10131 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10132 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10133 continue;
10134
10135 // Attempt to find a matching source truncation, but as a fall back VLX
10136 // cases can use the VPMOV directly.
10137 SDValue Src = peekThroughBitcasts(V1);
10138 if (Src.getOpcode() == ISD::TRUNCATE &&
10139 Src.getScalarValueSizeInBits() == SrcEltBits) {
10140 Src = Src.getOperand(0);
10141 } else if (Subtarget.hasVLX()) {
10142 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10143 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10144 Src = DAG.getBitcast(SrcVT, Src);
10145 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10146 if (Scale == 2 &&
10147 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10148 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10149 return SDValue();
10150 } else
10151 return SDValue();
10152
10153 // VPMOVWB is only available with avx512bw.
10154 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10155 return SDValue();
10156
10157 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10158 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10159 }
10160
10161 return SDValue();
10162}
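// Illustrative walk-through (not part of the original source): for the DAG in
// the comment above, VT is v8i16 and Scale == 2 matches the sequential mask
// <0,2,4,6> while the upper four lanes are zeroable. The v4i32 truncate is
// peeked through to its v4i64 source, so the whole trunc+shuffle collapses to
// a single getAVX512TruncNode() call: one VPMOV-style truncation, widened to
// 512 bits and re-extracted when VLX is unavailable.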
10163
10164// Attempt to match binary shuffle patterns as a truncate.
10165static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10166 SDValue V2, ArrayRef<int> Mask,
10167 const APInt &Zeroable,
10168 const X86Subtarget &Subtarget,
10169 SelectionDAG &DAG) {
10170 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10171 "Unexpected VTRUNC type");
10172 if (!Subtarget.hasAVX512())
10173 return SDValue();
10174
10175 unsigned NumElts = VT.getVectorNumElements();
10176 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10177 unsigned MaxScale = 64 / EltSizeInBits;
10178 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10179 // TODO: Support non-BWI VPMOVWB truncations?
10180 unsigned SrcEltBits = EltSizeInBits * Scale;
10181 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10182 continue;
10183
10184 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10185 // Bail if the V2 elements are undef.
10186 unsigned NumHalfSrcElts = NumElts / Scale;
10187 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10188 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10189 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10190 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10191 continue;
10192
10193 // The elements beyond the truncation must be undef/zero.
10194 unsigned UpperElts = NumElts - NumSrcElts;
10195 if (UpperElts > 0 &&
10196 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10197 continue;
10198 bool UndefUppers =
10199 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10200
10201 // For offset truncations, ensure that the concat is cheap.
10202 if (Offset) {
10203 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
10204 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
10205 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
10206 return Lo.getOperand(0) == Hi.getOperand(0);
10207 if (ISD::isNormalLoad(Lo.getNode()) &&
10208 ISD::isNormalLoad(Hi.getNode())) {
10209 auto *LDLo = cast<LoadSDNode>(Lo);
10210 auto *LDHi = cast<LoadSDNode>(Hi);
10211 return DAG.areNonVolatileConsecutiveLoads(
10212 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
10213 }
10214 return false;
10215 };
10216 if (!IsCheapConcat(V1, V2))
10217 continue;
10218 }
10219
10220 // As we're using both sources then we need to concat them together
10221 // and truncate from the double-sized src.
10222 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
10223 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10224
10225 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10226 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10227 Src = DAG.getBitcast(SrcVT, Src);
10228
10229 // Shift the offset'd elements into place for the truncation.
10230 // TODO: Use getTargetVShiftByConstNode.
10231 if (Offset)
10232 Src = DAG.getNode(
10233 X86ISD::VSRLI, DL, SrcVT, Src,
10234 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10235
10236 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10237 }
10238 }
10239
10240 return SDValue();
10241}
10242
10243/// Check whether a compaction lowering can be done by dropping even/odd
10244/// elements and compute how many times even/odd elements must be dropped.
10245///
10246/// This handles shuffles which take every Nth element where N is a power of
10247/// two. Example shuffle masks:
10248///
10249/// (even)
10250/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10251/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10252/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10253/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10254/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10255/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10256///
10257/// (odd)
10258/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10259/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10260///
10261/// Any of these lanes can of course be undef.
10262///
10263/// This routine only supports N <= 3.
10264/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10265/// for larger N.
10266///
10267/// \returns N above, or the number of times even/odd elements must be dropped
10268/// if there is such a number. Otherwise returns zero.
10269static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10270 bool IsSingleInput) {
10271 // The modulus for the shuffle vector entries is based on whether this is
10272 // a single input or not.
10273 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10274 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10275 "We should only be called with masks with a power-of-2 size!");
10276
10277 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10278 int Offset = MatchEven ? 0 : 1;
10279
10280 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10281 // and 2^3 simultaneously. This is because we may have ambiguity with
10282 // partially undef inputs.
10283 bool ViableForN[3] = {true, true, true};
10284
10285 for (int i = 0, e = Mask.size(); i < e; ++i) {
10286 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10287 // want.
10288 if (Mask[i] < 0)
10289 continue;
10290
10291 bool IsAnyViable = false;
10292 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10293 if (ViableForN[j]) {
10294 uint64_t N = j + 1;
10295
10296 // The shuffle mask must be equal to (i * 2^N) % M.
10297 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10298 IsAnyViable = true;
10299 else
10300 ViableForN[j] = false;
10301 }
10302 // Early exit if we exhaust the possible powers of two.
10303 if (!IsAnyViable)
10304 break;
10305 }
10306
10307 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10308 if (ViableForN[j])
10309 return j + 1;
10310
10311 // Return 0 as there is no viable power of two.
10312 return 0;
10313}
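// Worked example (illustrative): for the even N = 1 mask shown above,
// <0, 2, 4, ..., 30>, ShuffleModulus is 32 and every defined entry satisfies
// Mask[i] == (i << 1) & 31, so only ViableForN[0] survives and the function
// returns 1. The N = 2 masks above only satisfy the (i << 2) & 31 form and
// return 2.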
10314
10315// X86 has dedicated pack instructions that can handle specific truncation
10316// operations: PACKSS and PACKUS.
10317// Checks for compaction shuffle masks if MaxStages > 1.
10318// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10319static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10320 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10321 const SelectionDAG &DAG,
10322 const X86Subtarget &Subtarget,
10323 unsigned MaxStages = 1) {
10324 unsigned NumElts = VT.getVectorNumElements();
10325 unsigned BitSize = VT.getScalarSizeInBits();
10326 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10327 "Illegal maximum compaction");
10328
10329 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10330 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10331 unsigned NumPackedBits = NumSrcBits - BitSize;
10332 N1 = peekThroughBitcasts(N1);
10333 N2 = peekThroughBitcasts(N2);
10334 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10335 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10336 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10337 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10338 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10339 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10340 return false;
10341 if (Subtarget.hasSSE41() || BitSize == 8) {
10342 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10343 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10344 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10345 V1 = N1;
10346 V2 = N2;
10347 SrcVT = PackVT;
10348 PackOpcode = X86ISD::PACKUS;
10349 return true;
10350 }
10351 }
10352 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10353 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10354 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10355 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10356 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10357 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10358 V1 = N1;
10359 V2 = N2;
10360 SrcVT = PackVT;
10361 PackOpcode = X86ISD::PACKSS;
10362 return true;
10363 }
10364 return false;
10365 };
10366
10367 // Attempt to match against wider and wider compaction patterns.
10368 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10369 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10370 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10371
10372 // Try binary shuffle.
10373 SmallVector<int, 32> BinaryMask;
10374 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10375 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10376 if (MatchPACK(V1, V2, PackVT))
10377 return true;
10378
10379 // Try unary shuffle.
10380 SmallVector<int, 32> UnaryMask;
10381 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10382 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10383 if (MatchPACK(V1, V1, PackVT))
10384 return true;
10385 }
10386
10387 return false;
10388}
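// Illustrative case (not from the original source): for VT = v16i8 and a
// target mask equal to the single-stage binary compaction <0,2,4,...,30>,
// PackVT is v8i16. If both (bitcast) inputs have their upper 8 bits known
// zero, MatchPACK selects X86ISD::PACKUS; if instead each input has more than
// 8 sign bits, it selects X86ISD::PACKSS. SrcVT is returned as v8i16 either
// way.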
10389
10390static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
10391 SDValue V1, SDValue V2, SelectionDAG &DAG,
10392 const X86Subtarget &Subtarget) {
10393 MVT PackVT;
10394 unsigned PackOpcode;
10395 unsigned SizeBits = VT.getSizeInBits();
10396 unsigned EltBits = VT.getScalarSizeInBits();
10397 unsigned MaxStages = Log2_32(64 / EltBits);
10398 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10399 Subtarget, MaxStages))
10400 return SDValue();
10401
10402 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10403 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10404
10405 // Don't lower multi-stage packs on AVX512, truncation is better.
10406 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10407 return SDValue();
10408
10409 // Pack to the largest type possible:
10410 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10411 unsigned MaxPackBits = 16;
10412 if (CurrentEltBits > 16 &&
10413 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10414 MaxPackBits = 32;
10415
10416 // Repeatedly pack down to the target size.
10417 SDValue Res;
10418 for (unsigned i = 0; i != NumStages; ++i) {
10419 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10420 unsigned NumSrcElts = SizeBits / SrcEltBits;
10421 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10422 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10423 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10424 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10425 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10426 DAG.getBitcast(SrcVT, V2));
10427 V1 = V2 = Res;
10428 CurrentEltBits /= 2;
10429 }
10430 assert(Res && Res.getValueType() == VT &&
10431 "Failed to lower compaction shuffle");
10432 return Res;
10433}
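// Worked example (illustrative): packing a v16i8 result from v4i32 sources
// needs NumStages == 2. With PACKSS (or PACKUS plus SSE41) MaxPackBits is 32,
// so the loop emits PACK*SDW (v4i32 -> v8i16) followed by PACK*SWB
// (v8i16 -> v16i8). On 128-bit types with VLX the multi-stage case bails out
// above, since a single truncation is preferable.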
10434
10435/// Try to emit a bitmask instruction for a shuffle.
10436///
10437/// This handles cases where we can model a blend exactly as a bitmask due to
10438/// one of the inputs being zeroable.
10439static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10440 SDValue V2, ArrayRef<int> Mask,
10441 const APInt &Zeroable,
10442 const X86Subtarget &Subtarget,
10443 SelectionDAG &DAG) {
10444 MVT MaskVT = VT;
10445 MVT EltVT = VT.getVectorElementType();
10446 SDValue Zero, AllOnes;
10447 // Use f64 if i64 isn't legal.
10448 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10449 EltVT = MVT::f64;
10450 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10451 }
10452
10453 MVT LogicVT = VT;
10454 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10455 Zero = DAG.getConstantFP(0.0, DL, EltVT);
10456 APFloat AllOnesValue =
10457 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
10458 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
10459 LogicVT =
10460 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
10461 } else {
10462 Zero = DAG.getConstant(0, DL, EltVT);
10463 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10464 }
10465
10466 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
10467 SDValue V;
10468 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10469 if (Zeroable[i])
10470 continue;
10471 if (Mask[i] % Size != i)
10472 return SDValue(); // Not a blend.
10473 if (!V)
10474 V = Mask[i] < Size ? V1 : V2;
10475 else if (V != (Mask[i] < Size ? V1 : V2))
10476 return SDValue(); // Can only let one input through the mask.
10477
10478 VMaskOps[i] = AllOnes;
10479 }
10480 if (!V)
10481 return SDValue(); // No non-zeroable elements!
10482
10483 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
10484 VMask = DAG.getBitcast(LogicVT, VMask);
10485 V = DAG.getBitcast(LogicVT, V);
10486 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
10487 return DAG.getBitcast(VT, And);
10488}
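// Illustrative case: for a v4i32 shuffle <0, zz, 2, zz> where lanes 1 and 3
// are zeroable and the remaining lanes come from V1, the build vector becomes
// <-1, 0, -1, 0> and the result is bitcast(and(V1, VMask)). For f32/f64
// element types the all-ones constant is materialized as an FP constant and
// the AND itself is performed in an equivalent integer vector type.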
10489
10490/// Try to emit a blend instruction for a shuffle using bit math.
10491///
10492/// This is used as a fallback approach when first class blend instructions are
10493/// unavailable. Currently it is only suitable for integer vectors, but could
10494/// be generalized for floating point vectors if desirable.
10495static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
10496 SDValue V2, ArrayRef<int> Mask,
10497 SelectionDAG &DAG) {
10498 assert(VT.isInteger() && "Only supports integer vector types!");
10499 MVT EltVT = VT.getVectorElementType();
10500 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10501 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10502 SmallVector<SDValue, 16> MaskOps;
10503 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10504 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
10505 return SDValue(); // Shuffled input!
10506 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
10507 }
10508
10509 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
10510 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
10511}
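// Illustrative case: a v4i32 mask <0, 5, 2, 7> takes lanes 0 and 2 from V1
// and lanes 1 and 3 from V2, so MaskOps becomes <-1, 0, -1, 0> and the result
// is getBitSelect(V1, V2, V1Mask), i.e. roughly (V1 & M) | (V2 & ~M). With
// VLX this kind of bit-select can be matched to VPTERNLOG (see the caller in
// the blend lowering below).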
10512
10513static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
10514 SDValue PreservedSrc,
10515 const X86Subtarget &Subtarget,
10516 SelectionDAG &DAG);
10517
10518static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
10519 MutableArrayRef<int> Mask,
10520 const APInt &Zeroable, bool &ForceV1Zero,
10521 bool &ForceV2Zero, uint64_t &BlendMask) {
10522 bool V1IsZeroOrUndef =
10523 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
10524 bool V2IsZeroOrUndef =
10525 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
10526
10527 BlendMask = 0;
10528 ForceV1Zero = false, ForceV2Zero = false;
10529 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
10530
10531 int NumElts = Mask.size();
10532 int NumLanes = VT.getSizeInBits() / 128;
10533 int NumEltsPerLane = NumElts / NumLanes;
10534 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
10535
10536 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10537 // then ensure the blend mask part for that lane just references that input.
10538 bool ForceWholeLaneMasks =
10539 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
10540
10541 // Attempt to generate the binary blend mask. If an input is zero then
10542 // we can use any lane.
10543 for (int Lane = 0; Lane != NumLanes; ++Lane) {
10544 // Keep track of the inputs used per lane.
10545 bool LaneV1InUse = false;
10546 bool LaneV2InUse = false;
10547 uint64_t LaneBlendMask = 0;
10548 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
10549 int Elt = (Lane * NumEltsPerLane) + LaneElt;
10550 int M = Mask[Elt];
10551 if (M == SM_SentinelUndef)
10552 continue;
10553 if (M == Elt || (0 <= M && M < NumElts &&
10554 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
10555 Mask[Elt] = Elt;
10556 LaneV1InUse = true;
10557 continue;
10558 }
10559 if (M == (Elt + NumElts) ||
10560 (NumElts <= M &&
10561 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10562 LaneBlendMask |= 1ull << LaneElt;
10563 Mask[Elt] = Elt + NumElts;
10564 LaneV2InUse = true;
10565 continue;
10566 }
10567 if (Zeroable[Elt]) {
10568 if (V1IsZeroOrUndef) {
10569 ForceV1Zero = true;
10570 Mask[Elt] = Elt;
10571 LaneV1InUse = true;
10572 continue;
10573 }
10574 if (V2IsZeroOrUndef) {
10575 ForceV2Zero = true;
10576 LaneBlendMask |= 1ull << LaneElt;
10577 Mask[Elt] = Elt + NumElts;
10578 LaneV2InUse = true;
10579 continue;
10580 }
10581 }
10582 return false;
10583 }
10584
10585 // If we only used V2 then splat the lane blend mask to avoid any demanded
10586 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
10587 // blend mask bit).
10588 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
10589 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10590
10591 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
10592 }
10593 return true;
10594}
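// Worked example (illustrative): for v8i32 with Mask <0,9,2,11,4,13,6,15>,
// every odd element comes from V2, so each 128-bit lane contributes a lane
// blend mask of 0b1010 and the combined BlendMask is 0b10101010. The
// ForceWholeLaneMasks splat only applies to 256-bit vectors with 32/64-bit
// elements when a lane references V2 exclusively.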
10595
10596/// Try to emit a blend instruction for a shuffle.
10597///
10598/// This doesn't do any checks for the availability of instructions for blending
10599/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
10600/// be matched in the backend with the type given. What it does check for is
10601/// that the shuffle mask is a blend, or convertible into a blend with zero.
10602static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
10603 SDValue V2, ArrayRef<int> Original,
10604 const APInt &Zeroable,
10605 const X86Subtarget &Subtarget,
10606 SelectionDAG &DAG) {
10607 uint64_t BlendMask = 0;
10608 bool ForceV1Zero = false, ForceV2Zero = false;
10609 SmallVector<int, 64> Mask(Original);
10610 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
10611 BlendMask))
10612 return SDValue();
10613
10614 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10615 if (ForceV1Zero)
10616 V1 = getZeroVector(VT, Subtarget, DAG, DL);
10617 if (ForceV2Zero)
10618 V2 = getZeroVector(VT, Subtarget, DAG, DL);
10619
10620 unsigned NumElts = VT.getVectorNumElements();
10621
10622 switch (VT.SimpleTy) {
10623 case MVT::v4i64:
10624 case MVT::v8i32:
10625 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10626 [[fallthrough]];
10627 case MVT::v4f64:
10628 case MVT::v8f32:
10629 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10630 [[fallthrough]];
10631 case MVT::v2f64:
10632 case MVT::v2i64:
10633 case MVT::v4f32:
10634 case MVT::v4i32:
10635 case MVT::v8i16:
10636 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10637 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
10638 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10639 case MVT::v16i16: {
10640 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
10641 SmallVector<int, 8> RepeatedMask;
10642 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
10643 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10644 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
10645 BlendMask = 0;
10646 for (int i = 0; i < 8; ++i)
10647 if (RepeatedMask[i] >= 8)
10648 BlendMask |= 1ull << i;
10649 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10650 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10651 }
10652 // Use PBLENDW for lower/upper lanes and then blend lanes.
10653 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10654 // merge to VSELECT where useful.
10655 uint64_t LoMask = BlendMask & 0xFF;
10656 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
10657 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
10658 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10659 DAG.getTargetConstant(LoMask, DL, MVT::i8));
10660 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10661 DAG.getTargetConstant(HiMask, DL, MVT::i8));
10662 return DAG.getVectorShuffle(
10663 MVT::v16i16, DL, Lo, Hi,
10664 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
10665 }
10666 [[fallthrough]];
10667 }
10668 case MVT::v32i8:
10669 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10670 [[fallthrough]];
10671 case MVT::v16i8: {
10672 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
10673
10674 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
10675 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10676 Subtarget, DAG))
10677 return Masked;
10678
10679 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
10680 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10681 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10682 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10683 }
10684
10685 // If we have VPTERNLOG, we can use that as a bit blend.
10686 if (Subtarget.hasVLX())
10687 if (SDValue BitBlend =
10688 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
10689 return BitBlend;
10690
10691 // Scale the blend by the number of bytes per element.
10692 int Scale = VT.getScalarSizeInBits() / 8;
10693
10694 // This form of blend is always done on bytes. Compute the byte vector
10695 // type.
10696 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10697
10698 // x86 allows load folding with blendvb from the 2nd source operand. But
10699 // we are still using LLVM select here (see comment below), so that's V1.
10700 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
10701 // allow that load-folding possibility.
10702 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
10703 ShuffleVectorSDNode::commuteMask(Mask);
10704 std::swap(V1, V2);
10705 }
10706
10707 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
10708 // mix of LLVM's code generator and the x86 backend. We tell the code
10709 // generator that boolean values in the elements of an x86 vector register
10710 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
10711 // mapping a select to operand #1, and 'false' mapping to operand #2. The
10712 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
10713 // of the element (the remaining are ignored) and 0 in that high bit would
10714 // mean operand #1 while 1 in the high bit would mean operand #2. So while
10715 // the LLVM model for boolean values in vector elements gets the relevant
10716 // bit set, it is set backwards and over-constrained relative to x86's
10717 // actual model.
10718 SmallVector<SDValue, 32> VSELECTMask;
10719 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10720 for (int j = 0; j < Scale; ++j)
10721 VSELECTMask.push_back(
10722 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
10723 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
10724 MVT::i8));
10725
10726 V1 = DAG.getBitcast(BlendVT, V1);
10727 V2 = DAG.getBitcast(BlendVT, V2);
10728 return DAG.getBitcast(
10729 VT,
10730 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
10731 V1, V2));
10732 }
10733 case MVT::v16f32:
10734 case MVT::v8f64:
10735 case MVT::v8i64:
10736 case MVT::v16i32:
10737 case MVT::v32i16:
10738 case MVT::v64i8: {
10739 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
10740 bool OptForSize = DAG.shouldOptForSize();
10741 if (!OptForSize) {
10742 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10743 Subtarget, DAG))
10744 return Masked;
10745 }
10746
10747 // Otherwise load an immediate into a GPR, cast to k-register, and use a
10748 // masked move.
10749 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10750 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10751 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10752 }
10753 default:
10754 llvm_unreachable("Not a supported integer vector type!");
10755 }
10756}
10757
10758/// Try to lower as a blend of elements from two inputs followed by
10759/// a single-input permutation.
10760///
10761/// This matches the pattern where we can blend elements from two inputs and
10762/// then reduce the shuffle to a single-input permutation.
10763static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
10764 SDValue V1, SDValue V2,
10765 ArrayRef<int> Mask,
10766 SelectionDAG &DAG,
10767 bool ImmBlends = false) {
10768 // We build up the blend mask while checking whether a blend is a viable way
10769 // to reduce the shuffle.
10770 SmallVector<int, 32> BlendMask(Mask.size(), -1);
10771 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
10772
10773 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10774 if (Mask[i] < 0)
10775 continue;
10776
10777 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
10778
10779 if (BlendMask[Mask[i] % Size] < 0)
10780 BlendMask[Mask[i] % Size] = Mask[i];
10781 else if (BlendMask[Mask[i] % Size] != Mask[i])
10782 return SDValue(); // Can't blend in the needed input!
10783
10784 PermuteMask[i] = Mask[i] % Size;
10785 }
10786
10787 // If only immediate blends, then bail if the blend mask can't be widened to
10788 // i16.
10789 unsigned EltSize = VT.getScalarSizeInBits();
10790 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
10791 return SDValue();
10792
10793 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
10794 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
10795}
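// Worked example (illustrative): for a v4i32 mask <2, 5, 0, 7>, the blend
// mask is built as <0, 5, 2, 7> (element i taken from V1 if the entry is i,
// from V2 if it is i + 4) and the permute mask as <2, 1, 0, 3>, so the
// shuffle becomes permute(blend(V1, V2)).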
10796
10797/// Try to lower as an unpack of elements from two inputs followed by
10798/// a single-input permutation.
10799///
10800/// This matches the pattern where we can unpack elements from two inputs and
10801/// then reduce the shuffle to a single-input (wider) permutation.
10802static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
10803 SDValue V1, SDValue V2,
10804 ArrayRef<int> Mask,
10805 SelectionDAG &DAG) {
10806 int NumElts = Mask.size();
10807 int NumLanes = VT.getSizeInBits() / 128;
10808 int NumLaneElts = NumElts / NumLanes;
10809 int NumHalfLaneElts = NumLaneElts / 2;
10810
10811 bool MatchLo = true, MatchHi = true;
10812 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
10813
10814 // Determine UNPCKL/UNPCKH type and operand order.
10815 for (int Elt = 0; Elt != NumElts; ++Elt) {
10816 int M = Mask[Elt];
10817 if (M < 0)
10818 continue;
10819
10820 // Normalize the mask value depending on whether it's V1 or V2.
10821 int NormM = M;
10822 SDValue &Op = Ops[Elt & 1];
10823 if (M < NumElts && (Op.isUndef() || Op == V1))
10824 Op = V1;
10825 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
10826 Op = V2;
10827 NormM -= NumElts;
10828 } else
10829 return SDValue();
10830
10831 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
10832 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
10833 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
10834 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
10835 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
10836 if (MatchLoAnyLane || MatchHiAnyLane) {
10837 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
10838 "Failed to match UNPCKLO/UNPCKHI");
10839 break;
10840 }
10841 }
10842 MatchLo &= MatchLoAnyLane;
10843 MatchHi &= MatchHiAnyLane;
10844 if (!MatchLo && !MatchHi)
10845 return SDValue();
10846 }
10847 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
10848
10849 // Element indices have changed after unpacking. Calculate permute mask
10850 // so that they will be put back to the position as dictated by the
10851 // original shuffle mask indices.
10852 SmallVector<int, 32> PermuteMask(NumElts, -1);
10853 for (int Elt = 0; Elt != NumElts; ++Elt) {
10854 int M = Mask[Elt];
10855 if (M < 0)
10856 continue;
10857 int NormM = M;
10858 if (NumElts <= M)
10859 NormM -= NumElts;
10860 bool IsFirstOp = M < NumElts;
10861 int BaseMaskElt =
10862 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
10863 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
10864 PermuteMask[Elt] = BaseMaskElt;
10865 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
10866 PermuteMask[Elt] = BaseMaskElt + 1;
10867 assert(PermuteMask[Elt] != -1 &&
10868 "Input mask element is defined but failed to assign permute mask");
10869 }
10870
10871 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10872 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
10873 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
10874}
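// Worked example (illustrative): for a v4i32 mask <1, 5, 0, 4>, even result
// elements come from V1 and odd ones from V2, all from the low half of the
// lane, so this matches UNPCKL(V1, V2) = <V1[0], V2[0], V1[1], V2[1]>
// followed by the permute mask <2, 3, 0, 1>.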
10875
10876/// Try to lower a shuffle as a permute of the inputs followed by an
10877/// UNPCK instruction.
10878///
10879/// This specifically targets cases where we end up with alternating between
10880/// the two inputs, and so can permute them into something that feeds a single
10881/// UNPCK instruction. Note that this routine only targets integer vectors
10882/// because for floating point vectors we have a generalized SHUFPS lowering
10883/// strategy that handles everything that doesn't *exactly* match an unpack,
10884/// making this clever lowering unnecessary.
10885static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10886 SDValue V1, SDValue V2,
10887 ArrayRef<int> Mask,
10888 const X86Subtarget &Subtarget,
10889 SelectionDAG &DAG) {
10890 int Size = Mask.size();
10891 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10892
10893 // This routine only supports 128-bit integer dual input vectors.
10894 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
10895 return SDValue();
10896
10897 int NumLoInputs =
10898 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10899 int NumHiInputs =
10900 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10901
10902 bool UnpackLo = NumLoInputs >= NumHiInputs;
10903
10904 auto TryUnpack = [&](int ScalarSize, int Scale) {
10905 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10906 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10907
10908 for (int i = 0; i < Size; ++i) {
10909 if (Mask[i] < 0)
10910 continue;
10911
10912 // Each element of the unpack contains Scale elements from this mask.
10913 int UnpackIdx = i / Scale;
10914
10915 // We only handle the case where V1 feeds the first slots of the unpack.
10916 // We rely on canonicalization to ensure this is the case.
10917 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10918 return SDValue();
10919
10920 // Setup the mask for this input. The indexing is tricky as we have to
10921 // handle the unpack stride.
10922 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10923 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10924 Mask[i] % Size;
10925 }
10926
10927 // If we will have to shuffle both inputs to use the unpack, check whether
10928 // we can just unpack first and shuffle the result. If so, skip this unpack.
10929 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10930 !isNoopShuffleMask(V2Mask))
10931 return SDValue();
10932
10933 // Shuffle the inputs into place.
10934 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10935 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10936
10937 // Cast the inputs to the type we will use to unpack them.
10938 MVT UnpackVT =
10939 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10940 V1 = DAG.getBitcast(UnpackVT, V1);
10941 V2 = DAG.getBitcast(UnpackVT, V2);
10942
10943 // Unpack the inputs and cast the result back to the desired type.
10944 return DAG.getBitcast(
10945 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10946 UnpackVT, V1, V2));
10947 };
10948
10949 // We try each unpack from the largest to the smallest to try and find one
10950 // that fits this mask.
10951 int OrigScalarSize = VT.getScalarSizeInBits();
10952 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10953 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10954 return Unpack;
10955
10956 // If we're shuffling with a zero vector then we're better off not doing
10957 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
10958 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
10959 ISD::isBuildVectorAllZeros(V2.getNode()))
10960 return SDValue();
10961
10962 // If none of the unpack-rooted lowerings worked (or were profitable) try an
10963 // initial unpack.
10964 if (NumLoInputs == 0 || NumHiInputs == 0) {
10965 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10966 "We have to have *some* inputs!");
10967 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10968
10969 // FIXME: We could consider the total complexity of the permute of each
10970 // possible unpacking. Or at the least we should consider how many
10971 // half-crossings are created.
10972 // FIXME: We could consider commuting the unpacks.
10973
10974 SmallVector<int, 32> PermMask((unsigned)Size, -1);
10975 for (int i = 0; i < Size; ++i) {
10976 if (Mask[i] < 0)
10977 continue;
10978
10979 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10980
10981 PermMask[i] =
10982 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
10983 }
10984 return DAG.getVectorShuffle(
10985 VT, DL,
10986 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
10987 V1, V2),
10988 DAG.getUNDEF(VT), PermMask);
10989 }
10990
10991 return SDValue();
10992}
10993
10994/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
10995/// permuting the elements of the result in place.
10997 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10998 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10999 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11000 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11001 (VT.is512BitVector() && !Subtarget.hasBWI()))
11002 return SDValue();
11003
11004 // We don't currently support lane crossing permutes.
11005 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11006 return SDValue();
11007
11008 int Scale = VT.getScalarSizeInBits() / 8;
11009 int NumLanes = VT.getSizeInBits() / 128;
11010 int NumElts = VT.getVectorNumElements();
11011 int NumEltsPerLane = NumElts / NumLanes;
11012
11013 // Determine range of mask elts.
11014 bool Blend1 = true;
11015 bool Blend2 = true;
11016 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11017 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11018 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11019 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11020 int M = Mask[Lane + Elt];
11021 if (M < 0)
11022 continue;
11023 if (M < NumElts) {
11024 Blend1 &= (M == (Lane + Elt));
11025 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11026 M = M % NumEltsPerLane;
11027 Range1.first = std::min(Range1.first, M);
11028 Range1.second = std::max(Range1.second, M);
11029 } else {
11030 M -= NumElts;
11031 Blend2 &= (M == (Lane + Elt));
11032 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11033 M = M % NumEltsPerLane;
11034 Range2.first = std::min(Range2.first, M);
11035 Range2.second = std::max(Range2.second, M);
11036 }
11037 }
11038 }
11039
11040 // Bail if we don't need both elements.
11041 // TODO - it might be worth doing this for unary shuffles if the permute
11042 // can be widened.
11043 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11044 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11045 return SDValue();
11046
11047 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11048 return SDValue();
11049
11050 // Rotate the 2 ops so we can access both ranges, then permute the result.
11051 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11052 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11053 SDValue Rotate = DAG.getBitcast(
11054 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11055 DAG.getBitcast(ByteVT, Lo),
11056 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11057 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11058 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11059 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11060 int M = Mask[Lane + Elt];
11061 if (M < 0)
11062 continue;
11063 if (M < NumElts)
11064 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11065 else
11066 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11067 }
11068 }
11069 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11070 };
11071
11072 // Check if the ranges are small enough to rotate from either direction.
11073 if (Range2.second < Range1.first)
11074 return RotateAndPermute(V1, V2, Range1.first, 0);
11075 if (Range1.second < Range2.first)
11076 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11077 return SDValue();
11078}
11079
11080static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11081 return isUndefOrEqual(Mask, 0);
11082}
11083
11084static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11085 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11086}
11087
11088/// Check if the Mask consists of the same element repeated multiple times.
11089static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11090 size_t NumUndefs = 0;
11091 std::optional<int> UniqueElt;
11092 for (int Elt : Mask) {
11093 if (Elt == SM_SentinelUndef) {
11094 NumUndefs++;
11095 continue;
11096 }
11097 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11098 return false;
11099 UniqueElt = Elt;
11100 }
11101 // Make sure the element is repeated enough times by checking the number of
11102 // undefs is small.
11103 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11104}
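// Illustrative examples: <5, 5, -1, 5, 5, -1, 5, 5> returns true (one unique
// defined element, only two undefs), while <5, -1, -1, -1, -1, -1, -1, -1>
// returns false because more than half of the mask is undef.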
11105
11106/// Generic routine to decompose a shuffle and blend into independent
11107/// blends and permutes.
11108///
11109/// This matches the extremely common pattern for handling combined
11110/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11111/// operations. It will try to pick the best arrangement of shuffles and
11112/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
11113static SDValue lowerShuffleAsDecomposedShuffleMerge(
11114 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11115 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11116 int NumElts = Mask.size();
11117 int NumLanes = VT.getSizeInBits() / 128;
11118 int NumEltsPerLane = NumElts / NumLanes;
11119
11120 // Shuffle the input elements into the desired positions in V1 and V2 and
11121 // unpack/blend them together.
11122 bool IsAlternating = true;
11123 SmallVector<int, 32> V1Mask(NumElts, -1);
11124 SmallVector<int, 32> V2Mask(NumElts, -1);
11125 SmallVector<int, 32> FinalMask(NumElts, -1);
11126 for (int i = 0; i < NumElts; ++i) {
11127 int M = Mask[i];
11128 if (M >= 0 && M < NumElts) {
11129 V1Mask[i] = M;
11130 FinalMask[i] = i;
11131 IsAlternating &= (i & 1) == 0;
11132 } else if (M >= NumElts) {
11133 V2Mask[i] = M - NumElts;
11134 FinalMask[i] = i + NumElts;
11135 IsAlternating &= (i & 1) == 1;
11136 }
11137 }
11138
11139 // If we effectively demand only the 0'th element of \p Input (though not
11140 // necessarily only in the 0'th position), then broadcast said input
11141 // and change \p InputMask to be a no-op (identity) mask.
11142 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11143 &DAG](SDValue &Input,
11144 MutableArrayRef<int> InputMask) {
11145 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11146 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11147 !X86::mayFoldLoad(Input, Subtarget)))
11148 return;
11149 if (isNoopShuffleMask(InputMask))
11150 return;
11151 assert(isBroadcastShuffleMask(InputMask) &&
11152 "Expected to demand only the 0'th element.");
11153 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11154 for (auto I : enumerate(InputMask)) {
11155 int &InputMaskElt = I.value();
11156 if (InputMaskElt >= 0)
11157 InputMaskElt = I.index();
11158 }
11159 };
11160
11161 // Currently, we may need to produce one shuffle per input, and blend results.
11162 // It is possible that the shuffle for one of the inputs is already a no-op.
11163 // See if we can simplify non-no-op shuffles into broadcasts,
11164 // which we consider to be strictly better than an arbitrary shuffle.
11165 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11166 isNoopOrBroadcastShuffleMask(V2Mask)) {
11167 canonicalizeBroadcastableInput(V1, V1Mask);
11168 canonicalizeBroadcastableInput(V2, V2Mask);
11169 }
11170
11171 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11172 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11173 // the shuffle may be able to fold with a load or other benefit. However, when
11174 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11175 // pre-shuffle first is a better strategy.
11176 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11177 // Only prefer immediate blends to unpack/rotate.
11178 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11179 DAG, true))
11180 return BlendPerm;
11181 // If either input vector provides only a single element which is repeated
11182 // multiple times, unpacking from both input vectors would generate worse
11183 // code. e.g. for
11184 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11185 // it is better to process t4 first to create a vector of t4[0], then unpack
11186 // that vector with t2.
11187 if (!isSingleElementRepeatedMask(V1Mask) &&
11188 !isSingleElementRepeatedMask(V2Mask))
11189 if (SDValue UnpackPerm =
11190 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11191 return UnpackPerm;
11192 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11193 DL, VT, V1, V2, Mask, Subtarget, DAG))
11194 return RotatePerm;
11195 // Unpack/rotate failed - try again with variable blends.
11196 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11197 DAG))
11198 return BlendPerm;
11199 if (VT.getScalarSizeInBits() >= 32)
11200 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11201 DL, VT, V1, V2, Mask, Subtarget, DAG))
11202 return PermUnpack;
11203 }
11204
11205 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11206 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11207 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11208 // than half the elements coming from each source.
11209 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11210 V1Mask.assign(NumElts, -1);
11211 V2Mask.assign(NumElts, -1);
11212 FinalMask.assign(NumElts, -1);
11213 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11214 for (int j = 0; j != NumEltsPerLane; ++j) {
11215 int M = Mask[i + j];
11216 if (M >= 0 && M < NumElts) {
11217 V1Mask[i + (j / 2)] = M;
11218 FinalMask[i + j] = i + (j / 2);
11219 } else if (M >= NumElts) {
11220 V2Mask[i + (j / 2)] = M - NumElts;
11221 FinalMask[i + j] = i + (j / 2) + NumElts;
11222 }
11223 }
11224 }
11225
11226 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11227 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11228 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11229}
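// Worked example (illustrative): if a v8i16 mask strictly alternates between
// the sources, e.g. <0,9,5,12,1,8,4,13>, and none of the simpler strategies
// above apply, the alternating rewrite compresses each source into the low
// half of the lane (V1Mask <0,5,1,4,...>, V2Mask <1,4,0,5,...>) and the final
// mask becomes the interleave <0,8,1,9,2,10,3,11>, i.e. an UNPCKL of the two
// pre-shuffled inputs.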
11230
11231static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11232 const X86Subtarget &Subtarget,
11233 ArrayRef<int> Mask) {
11234 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11235 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11236
11237 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11238 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11239 int MaxSubElts = 64 / EltSizeInBits;
11240 unsigned RotateAmt, NumSubElts;
11241 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11242 MaxSubElts, NumSubElts, RotateAmt))
11243 return -1;
11244 unsigned NumElts = Mask.size();
11245 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11246 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11247 return RotateAmt;
11248}
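// Worked example (illustrative): the v16i8 mask
// <3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14>
// rotates each group of 4 bytes, so isBitRotateMask reports NumSubElts = 4
// and RotateAmt = 8; RotateVT becomes v4i32 and the caller can emit a single
// 32-bit rotate-by-8 (VPROLD on AVX512).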
11249
11250/// Lower shuffle using X86ISD::VROTLI rotations.
11251static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11252 ArrayRef<int> Mask,
11253 const X86Subtarget &Subtarget,
11254 SelectionDAG &DAG) {
11255 // Only XOP + AVX512 targets have bit rotation instructions.
11256 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11257 bool IsLegal =
11258 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11259 if (!IsLegal && Subtarget.hasSSE3())
11260 return SDValue();
11261
11262 MVT RotateVT;
11263 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11264 Subtarget, Mask);
11265 if (RotateAmt < 0)
11266 return SDValue();
11267
11268 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11269 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11270 // widen to vXi16 or more then the existing lowering will be better.
11271 if (!IsLegal) {
11272 if ((RotateAmt % 16) == 0)
11273 return SDValue();
11274 // TODO: Use getTargetVShiftByConstNode.
11275 unsigned ShlAmt = RotateAmt;
11276 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11277 V1 = DAG.getBitcast(RotateVT, V1);
11278 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11279 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11280 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11281 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11282 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11283 return DAG.getBitcast(VT, Rot);
11284 }
11285
11286 SDValue Rot =
11287 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11288 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11289 return DAG.getBitcast(VT, Rot);
11290}
11291
11292/// Try to match a vector shuffle as an element rotation.
11293///
11294/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11295static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11296 ArrayRef<int> Mask) {
11297 int NumElts = Mask.size();
11298
11299 // We need to detect various ways of spelling a rotation:
11300 // [11, 12, 13, 14, 15, 0, 1, 2]
11301 // [-1, 12, 13, 14, -1, -1, 1, -1]
11302 // [-1, -1, -1, -1, -1, -1, 1, 2]
11303 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11304 // [-1, 4, 5, 6, -1, -1, 9, -1]
11305 // [-1, 4, 5, 6, -1, -1, -1, -1]
11306 int Rotation = 0;
11307 SDValue Lo, Hi;
11308 for (int i = 0; i < NumElts; ++i) {
11309 int M = Mask[i];
11310 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11311 "Unexpected mask index.");
11312 if (M < 0)
11313 continue;
11314
11315 // Determine where a rotated vector would have started.
11316 int StartIdx = i - (M % NumElts);
11317 if (StartIdx == 0)
11318 // The identity rotation isn't interesting, stop.
11319 return -1;
11320
11321 // If we found the tail of a vector the rotation must be the missing
11322 // front. If we found the head of a vector, it must be how much of the
11323 // head.
11324 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11325
11326 if (Rotation == 0)
11327 Rotation = CandidateRotation;
11328 else if (Rotation != CandidateRotation)
11329 // The rotations don't match, so we can't match this mask.
11330 return -1;
11331
11332 // Compute which value this mask is pointing at.
11333 SDValue MaskV = M < NumElts ? V1 : V2;
11334
11335 // Compute which of the two target values this index should be assigned
11336 // to. This reflects whether the high elements are remaining or the low
11337 // elements are remaining.
11338 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11339
11340 // Either set up this value if we've not encountered it before, or check
11341 // that it remains consistent.
11342 if (!TargetV)
11343 TargetV = MaskV;
11344 else if (TargetV != MaskV)
11345 // This may be a rotation, but it pulls from the inputs in some
11346 // unsupported interleaving.
11347 return -1;
11348 }
11349
11350 // Check that we successfully analyzed the mask, and normalize the results.
11351 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11352 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11353 if (!Lo)
11354 Lo = Hi;
11355 else if (!Hi)
11356 Hi = Lo;
11357
11358 V1 = Lo;
11359 V2 = Hi;
11360
11361 return Rotation;
11362}
11363
11364/// Try to lower a vector shuffle as a byte rotation.
11365///
11366/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11367/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11368/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11369/// try to generically lower a vector shuffle through such an pattern. It
11370/// does not check for the profitability of lowering either as PALIGNR or
11371/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11372/// This matches shuffle vectors that look like:
11373///
11374/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11375///
11376/// Essentially it concatenates V1 and V2, shifts right by some number of
11377/// elements, and takes the low elements as the result. Note that while this is
11378/// specified as a *right shift* because x86 is little-endian, it is a *left
11379/// rotate* of the vector lanes.
11380static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11381 ArrayRef<int> Mask) {
11382 // Don't accept any shuffles with zero elements.
11383 if (isAnyZero(Mask))
11384 return -1;
11385
11386 // PALIGNR works on 128-bit lanes.
11387 SmallVector<int, 16> RepeatedMask;
11388 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11389 return -1;
11390
11391 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11392 if (Rotation <= 0)
11393 return -1;
11394
11395 // PALIGNR rotates bytes, so we need to scale the
11396 // rotation based on how many bytes are in the vector lane.
11397 int NumElts = RepeatedMask.size();
11398 int Scale = 16 / NumElts;
11399 return Rotation * Scale;
11400}
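// Worked example (illustrative): for the v8i16 mask [11,12,13,14,15,0,1,2]
// used in the comment above, matchShuffleAsElementRotate returns a rotation
// of 3 elements, and with 8 x i16 per 128-bit lane the scale is 2, so the
// returned byte rotation is 6 (PALIGNR $6).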
11401
11402static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11403 SDValue V2, ArrayRef<int> Mask,
11404 const X86Subtarget &Subtarget,
11405 SelectionDAG &DAG) {
11406 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11407
11408 SDValue Lo = V1, Hi = V2;
11409 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11410 if (ByteRotation <= 0)
11411 return SDValue();
11412
11413 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11414 // PSLLDQ/PSRLDQ.
11415 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11416 Lo = DAG.getBitcast(ByteVT, Lo);
11417 Hi = DAG.getBitcast(ByteVT, Hi);
11418
11419 // SSSE3 targets can use the palignr instruction.
11420 if (Subtarget.hasSSSE3()) {
11421 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11422 "512-bit PALIGNR requires BWI instructions");
11423 return DAG.getBitcast(
11424 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11425 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11426 }
11427
11428 assert(VT.is128BitVector() &&
11429 "Rotate-based lowering only supports 128-bit lowering!");
11430 assert(Mask.size() <= 16 &&
11431 "Can shuffle at most 16 bytes in a 128-bit vector!");
11432 assert(ByteVT == MVT::v16i8 &&
11433 "SSE2 rotate lowering only needed for v16i8!");
11434
11435 // Default SSE2 implementation
11436 int LoByteShift = 16 - ByteRotation;
11437 int HiByteShift = ByteRotation;
11438
11439 SDValue LoShift =
11440 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11441 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11442 SDValue HiShift =
11443 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11444 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11445 return DAG.getBitcast(VT,
11446 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11447}
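// Illustrative SSE2 fallback (no SSSE3): for a byte rotation of 5, the result
// is OR(VSHLDQ(Lo, 11), VSRLDQ(Hi, 5)) on v16i8, i.e. the PSLLDQ/PSRLDQ/POR
// pattern described above; with SSSE3 the same rotation is a single
// PALIGNR $5.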
11448
11449/// Try to lower a vector shuffle as a dword/qword rotation.
11450///
11451/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11452/// rotation of the concatenation of two vectors; this routine will
11453/// try to generically lower a vector shuffle through such a pattern.
11454///
11455/// Essentially it concatenates V1 and V2, shifts right by some number of
11456/// elements, and takes the low elements as the result. Note that while this is
11457/// specified as a *right shift* because x86 is little-endian, it is a *left
11458/// rotate* of the vector lanes.
11459static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
11460 SDValue V2, ArrayRef<int> Mask,
11461 const APInt &Zeroable,
11462 const X86Subtarget &Subtarget,
11463 SelectionDAG &DAG) {
11464 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
11465 "Only 32-bit and 64-bit elements are supported!");
11466
11467 // 128/256-bit vectors are only supported with VLX.
11468 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
11469 && "VLX required for 128/256-bit vectors");
11470
11471 SDValue Lo = V1, Hi = V2;
11472 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
11473 if (0 < Rotation)
11474 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
11475 DAG.getTargetConstant(Rotation, DL, MVT::i8));
11476
11477 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
11478 // TODO: Pull this out as a matchShuffleAsElementShift helper?
11479 // TODO: We can probably make this more aggressive and use shift-pairs like
11480 // lowerShuffleAsByteShiftMask.
11481 unsigned NumElts = Mask.size();
11482 unsigned ZeroLo = Zeroable.countr_one();
11483 unsigned ZeroHi = Zeroable.countl_one();
11484 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
11485 if (!ZeroLo && !ZeroHi)
11486 return SDValue();
11487
11488 if (ZeroLo) {
11489 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11490 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
11491 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
11492 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
11493 getZeroVector(VT, Subtarget, DAG, DL),
11494 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
11495 }
11496
11497 if (ZeroHi) {
11498 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
11499 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
11500 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
11501 return DAG.getNode(X86ISD::VALIGN, DL, VT,
11502 getZeroVector(VT, Subtarget, DAG, DL), Src,
11503 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
11504 }
11505
11506 return SDValue();
11507}
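// Illustrative cases: a v8i32 mask <3,4,5,6,7,8,9,10> (VLX) is an element
// rotation of the V1:V2 concatenation and becomes a single VALIGND with a
// rotation immediate of 3, while a mask like <2,3,4,5,6,7,zz,zz> (upper lanes
// zeroable) takes the ZeroHi path and pairs the source with a zero vector
// using immediate 2.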
11508
11509/// Try to lower a vector shuffle as a byte shift sequence.
11510static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
11511 SDValue V2, ArrayRef<int> Mask,
11512 const APInt &Zeroable,
11513 const X86Subtarget &Subtarget,
11514 SelectionDAG &DAG) {
11515 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11516 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11517
11518 // We need a shuffle that has zeros at one/both ends and a sequential
11519 // shuffle from one source within.
11520 unsigned ZeroLo = Zeroable.countr_one();
11521 unsigned ZeroHi = Zeroable.countl_one();
11522 if (!ZeroLo && !ZeroHi)
11523 return SDValue();
11524
11525 unsigned NumElts = Mask.size();
11526 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11527 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
11528 return SDValue();
11529
11530 unsigned Scale = VT.getScalarSizeInBits() / 8;
11531 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
11532 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
11533 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
11534 return SDValue();
11535
11536 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11537 Res = DAG.getBitcast(MVT::v16i8, Res);
11538
11539 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
11540 // inner sequential set of elements, possibly offset:
11541 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11542 // 01234567 --> 4567zzzz --> zzzzz456
11543 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11544 if (ZeroLo == 0) {
11545 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11546 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11547 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11548 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11549 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
11550 } else if (ZeroHi == 0) {
11551 unsigned Shift = Mask[ZeroLo] % NumElts;
11552 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11553 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11554 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11555 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11556 } else if (!Subtarget.hasSSSE3()) {
11557 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
11558 // by performing 3 byte shifts. Shuffle combining can kick in above that.
11559 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
11560 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11561 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11562 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11563 Shift += Mask[ZeroLo] % NumElts;
11564 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11565 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11566 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11567 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11568 } else
11569 return SDValue();
11570
11571 return DAG.getBitcast(VT, Res);
11572}
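// Worked example (illustrative): a v8i16 mask <zz,zz,0,1,2,3,4,5> has
// ZeroLo = 2 and ZeroHi = 0, so the routine bitcasts to v16i8 and emits a
// VSRLDQ of 0 followed by a VSHLDQ of Scale * ZeroLo = 4 bytes, which is
// effectively a single PSLLDQ $4 that shifts the low elements up and
// zero-fills the bottom of the vector.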
11573
11574/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
11575///
11576/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
11577/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11578/// matches elements from one of the input vectors shuffled to the left or
11579/// right with zeroable elements 'shifted in'. It handles both the strictly
11580/// bit-wise element shifts and the byte shift across an entire 128-bit double
11581/// quad word lane.
11582///
11583/// PSLL : (little-endian) left bit shift.
11584/// [ zz, 0, zz, 2 ]
11585/// [ -1, 4, zz, -1 ]
11586/// PSRL : (little-endian) right bit shift.
11587/// [ 1, zz, 3, zz]
11588/// [ -1, -1, 7, zz]
11589/// PSLLDQ : (little-endian) left byte shift
11590/// [ zz, 0, 1, 2, 3, 4, 5, 6]
11591/// [ zz, zz, -1, -1, 2, 3, 4, -1]
11592/// [ zz, zz, zz, zz, zz, zz, -1, 1]
11593/// PSRLDQ : (little-endian) right byte shift
11594/// [ 5, 6, 7, zz, zz, zz, zz, zz]
11595/// [ -1, 5, 6, 7, zz, zz, zz, zz]
11596/// [ 1, 2, -1, -1, -1, -1, zz, zz]
11597static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
11598 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
11599 int MaskOffset, const APInt &Zeroable,
11600 const X86Subtarget &Subtarget) {
11601 int Size = Mask.size();
11602 unsigned SizeInBits = Size * ScalarSizeInBits;
11603
11604 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
11605 for (int i = 0; i < Size; i += Scale)
11606 for (int j = 0; j < Shift; ++j)
11607 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11608 return false;
11609
11610 return true;
11611 };
11612
11613 auto MatchShift = [&](int Shift, int Scale, bool Left) {
11614 for (int i = 0; i != Size; i += Scale) {
11615 unsigned Pos = Left ? i + Shift : i;
11616 unsigned Low = Left ? i : i + Shift;
11617 unsigned Len = Scale - Shift;
11618 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
11619 return -1;
11620 }
11621
11622 int ShiftEltBits = ScalarSizeInBits * Scale;
11623 bool ByteShift = ShiftEltBits > 64;
11624 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
11625 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
11626 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
11627
11628 // Normalize the scale for byte shifts to still produce an i64 element
11629 // type.
11630 Scale = ByteShift ? Scale / 2 : Scale;
11631
11632 // We need to round trip through the appropriate type for the shift.
11633 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
11634 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
11635 : MVT::getVectorVT(ShiftSVT, Size / Scale);
11636 return (int)ShiftAmt;
11637 };
11638
11639 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11640 // keep doubling the size of the integer elements up to that. We can
11641 // then shift the elements of the integer vector by whole multiples of
11642 // their width within the elements of the larger integer vector. Test each
11643 // multiple to see if we can find a match with the moved element indices
11644 // and that the shifted in elements are all zeroable.
11645 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
11646 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
11647 for (int Shift = 1; Shift != Scale; ++Shift)
11648 for (bool Left : {true, false})
11649 if (CheckZeros(Shift, Scale, Left)) {
11650 int ShiftAmt = MatchShift(Shift, Scale, Left);
11651 if (0 < ShiftAmt)
11652 return ShiftAmt;
11653 }
11654
11655 // no match
11656 return -1;
11657}
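// As a worked example of the matching above: a v8i16 shuffle of V1 with mask
// <zz,0,zz,2,zz,4,zz,6> (the even elements are zeroable) matches with
// Scale == 2, Shift == 1, Left == true. ShiftEltBits is 32, so this is not a
// byte shift: Opcode becomes X86ISD::VSHLI, ShiftVT becomes v4i32 and the
// returned shift amount is 16, i.e. the shuffle should lower to a
// PSLLD $16 of the bitcast input.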
11658
11659 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
11660                                    SDValue V2, ArrayRef<int> Mask,
11661 const APInt &Zeroable,
11662 const X86Subtarget &Subtarget,
11663 SelectionDAG &DAG, bool BitwiseOnly) {
11664 int Size = Mask.size();
11665 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11666
11667 MVT ShiftVT;
11668 SDValue V = V1;
11669 unsigned Opcode;
11670
11671 // Try to match shuffle against V1 shift.
11672 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11673 Mask, 0, Zeroable, Subtarget);
11674
11675 // If V1 failed, try to match shuffle against V2 shift.
11676 if (ShiftAmt < 0) {
11677 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11678 Mask, Size, Zeroable, Subtarget);
11679 V = V2;
11680 }
11681
11682 if (ShiftAmt < 0)
11683 return SDValue();
11684
11685 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
11686 return SDValue();
11687
11688 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
11689 "Illegal integer vector type");
11690 V = DAG.getBitcast(ShiftVT, V);
11691 V = DAG.getNode(Opcode, DL, ShiftVT, V,
11692 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
11693 return DAG.getBitcast(VT, V);
11694}
11695
11696// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
11697// Remainder of lower half result is zero and upper half is all undef.
11698static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
11699 ArrayRef<int> Mask, uint64_t &BitLen,
11700 uint64_t &BitIdx, const APInt &Zeroable) {
11701 int Size = Mask.size();
11702 int HalfSize = Size / 2;
11703 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11704 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
11705
11706 // Upper half must be undefined.
11707 if (!isUndefUpperHalf(Mask))
11708 return false;
11709
11710 // Determine the extraction length from the part of the
11711 // lower half that isn't zeroable.
11712 int Len = HalfSize;
11713 for (; Len > 0; --Len)
11714 if (!Zeroable[Len - 1])
11715 break;
11716 assert(Len > 0 && "Zeroable shuffle mask");
11717
11718 // Attempt to match first Len sequential elements from the lower half.
11719 SDValue Src;
11720 int Idx = -1;
11721 for (int i = 0; i != Len; ++i) {
11722 int M = Mask[i];
11723 if (M == SM_SentinelUndef)
11724 continue;
11725 SDValue &V = (M < Size ? V1 : V2);
11726 M = M % Size;
11727
11728 // The extracted elements must start at a valid index and all mask
11729 // elements must be in the lower half.
11730 if (i > M || M >= HalfSize)
11731 return false;
11732
11733 if (Idx < 0 || (Src == V && Idx == (M - i))) {
11734 Src = V;
11735 Idx = M - i;
11736 continue;
11737 }
11738 return false;
11739 }
11740
11741 if (!Src || Idx < 0)
11742 return false;
11743
11744 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
11745 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11746 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11747 V1 = Src;
11748 return true;
11749}
11750
11751// INSERTQ: Extract lowest Len elements from lower half of second source and
11752// insert over first source, starting at Idx.
11753// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
11754static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
11755 ArrayRef<int> Mask, uint64_t &BitLen,
11756 uint64_t &BitIdx) {
11757 int Size = Mask.size();
11758 int HalfSize = Size / 2;
11759 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11760
11761 // Upper half must be undefined.
11762 if (!isUndefUpperHalf(Mask))
11763 return false;
11764
11765 for (int Idx = 0; Idx != HalfSize; ++Idx) {
11766 SDValue Base;
11767
11768 // Attempt to match first source from mask before insertion point.
11769 if (isUndefInRange(Mask, 0, Idx)) {
11770 /* EMPTY */
11771 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
11772 Base = V1;
11773 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
11774 Base = V2;
11775 } else {
11776 continue;
11777 }
11778
11779 // Extend the extraction length looking to match both the insertion of
11780 // the second source and the remaining elements of the first.
11781 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
11782 SDValue Insert;
11783 int Len = Hi - Idx;
11784
11785 // Match insertion.
11786 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
11787 Insert = V1;
11788 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
11789 Insert = V2;
11790 } else {
11791 continue;
11792 }
11793
11794 // Match the remaining elements of the lower half.
11795 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
11796 /* EMPTY */
11797 } else if ((!Base || (Base == V1)) &&
11798 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
11799 Base = V1;
11800 } else if ((!Base || (Base == V2)) &&
11801 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
11802 Size + Hi)) {
11803 Base = V2;
11804 } else {
11805 continue;
11806 }
11807
11808 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11809 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11810 V1 = Base;
11811 V2 = Insert;
11812 return true;
11813 }
11814 }
11815
11816 return false;
11817}
11818
11819/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
11820 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
11821                                      SDValue V2, ArrayRef<int> Mask,
11822 const APInt &Zeroable, SelectionDAG &DAG) {
11823 uint64_t BitLen, BitIdx;
11824 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
11825 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
11826 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11827 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11828
11829 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
11830 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
11831 V2 ? V2 : DAG.getUNDEF(VT),
11832 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11833 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11834
11835 return SDValue();
11836}
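// Worked example for the EXTRQ path: a v8i16 shuffle of V1 with mask
// <1,2,3,zz,u,u,u,u> (upper half undef, element 3 zeroable) gives Len == 3
// and Idx == 1, so BitLen == 48 and BitIdx == 16, i.e. an EXTRQI that
// extracts 48 bits starting at bit 16 of V1 and zeroes the rest of the low
// half.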
11837
11838/// Lower a vector shuffle as a zero or any extension.
11839///
11840/// Given a specific number of elements, element bit width, and extension
11841/// stride, produce either a zero or any extension based on the available
11842/// features of the subtarget. The extended elements are consecutive and
11843 /// can start from an offset element index in the input; to avoid excess
11844 /// shuffling the offset must either be in the bottom lane
11845/// or at the start of a higher lane. All extended elements must be from
11846/// the same lane.
11847 static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
11848     const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
11849 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11850 assert(Scale > 1 && "Need a scale to extend.");
11851 int EltBits = VT.getScalarSizeInBits();
11852 int NumElements = VT.getVectorNumElements();
11853 int NumEltsPerLane = 128 / EltBits;
11854 int OffsetLane = Offset / NumEltsPerLane;
11855 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
11856 "Only 8, 16, and 32 bit elements can be extended.");
11857 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
11858 assert(0 <= Offset && "Extension offset must be positive.");
11859 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
11860 "Extension offset must be in the first lane or start an upper lane.");
11861
11862 // Check that an index is in same lane as the base offset.
11863 auto SafeOffset = [&](int Idx) {
11864 return OffsetLane == (Idx / NumEltsPerLane);
11865 };
11866
11867 // Shift along an input so that the offset base moves to the first element.
11868 auto ShuffleOffset = [&](SDValue V) {
11869 if (!Offset)
11870 return V;
11871
11872 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11873 for (int i = 0; i * Scale < NumElements; ++i) {
11874 int SrcIdx = i + Offset;
11875 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
11876 }
11877 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
11878 };
11879
11880 // Found a valid a/zext mask! Try various lowering strategies based on the
11881 // input type and available ISA extensions.
11882 if (Subtarget.hasSSE41()) {
11883     // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
11884     // PUNPCK will catch this in a later shuffle match.
11885 if (Offset && Scale == 2 && VT.is128BitVector())
11886 return SDValue();
11887 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
11888 NumElements / Scale);
11889 InputV = DAG.getBitcast(VT, InputV);
11890 InputV = ShuffleOffset(InputV);
11891     InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
11892                                     DL, ExtVT, InputV, DAG);
11893 return DAG.getBitcast(VT, InputV);
11894 }
11895
11896 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
11897 InputV = DAG.getBitcast(VT, InputV);
11898
11899 // For any extends we can cheat for larger element sizes and use shuffle
11900 // instructions that can fold with a load and/or copy.
11901 if (AnyExt && EltBits == 32) {
11902 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
11903 -1};
11904 return DAG.getBitcast(
11905 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11906 DAG.getBitcast(MVT::v4i32, InputV),
11907 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11908 }
11909 if (AnyExt && EltBits == 16 && Scale > 2) {
11910 int PSHUFDMask[4] = {Offset / 2, -1,
11911 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
11912 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11913 DAG.getBitcast(MVT::v4i32, InputV),
11914 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11915 int PSHUFWMask[4] = {1, -1, -1, -1};
11916 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
11917 return DAG.getBitcast(
11918 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
11919 DAG.getBitcast(MVT::v8i16, InputV),
11920 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
11921 }
11922
11923 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
11924 // to 64-bits.
11925 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
11926 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
11927 assert(VT.is128BitVector() && "Unexpected vector width!");
11928
11929 int LoIdx = Offset * EltBits;
11930 SDValue Lo = DAG.getBitcast(
11931 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11932 DAG.getTargetConstant(EltBits, DL, MVT::i8),
11933 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
11934
11935 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
11936 return DAG.getBitcast(VT, Lo);
11937
11938 int HiIdx = (Offset + 1) * EltBits;
11939 SDValue Hi = DAG.getBitcast(
11940 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11941 DAG.getTargetConstant(EltBits, DL, MVT::i8),
11942 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
11943 return DAG.getBitcast(VT,
11944 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
11945 }
11946
11947 // If this would require more than 2 unpack instructions to expand, use
11948 // pshufb when available. We can only use more than 2 unpack instructions
11949 // when zero extending i8 elements which also makes it easier to use pshufb.
11950 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
11951 assert(NumElements == 16 && "Unexpected byte vector width!");
11952 SDValue PSHUFBMask[16];
11953 for (int i = 0; i < 16; ++i) {
11954 int Idx = Offset + (i / Scale);
11955 if ((i % Scale == 0 && SafeOffset(Idx))) {
11956 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
11957 continue;
11958 }
11959 PSHUFBMask[i] =
11960 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
11961 }
11962 InputV = DAG.getBitcast(MVT::v16i8, InputV);
11963 return DAG.getBitcast(
11964 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
11965 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
11966 }
11967
11968 // If we are extending from an offset, ensure we start on a boundary that
11969 // we can unpack from.
11970 int AlignToUnpack = Offset % (NumElements / Scale);
11971 if (AlignToUnpack) {
11972 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11973 for (int i = AlignToUnpack; i < NumElements; ++i)
11974 ShMask[i - AlignToUnpack] = i;
11975 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
11976 Offset -= AlignToUnpack;
11977 }
11978
11979 // Otherwise emit a sequence of unpacks.
11980 do {
11981 unsigned UnpackLoHi = X86ISD::UNPCKL;
11982 if (Offset >= (NumElements / 2)) {
11983 UnpackLoHi = X86ISD::UNPCKH;
11984 Offset -= (NumElements / 2);
11985 }
11986
11987 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
11988 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
11989 : getZeroVector(InputVT, Subtarget, DAG, DL);
11990 InputV = DAG.getBitcast(InputVT, InputV);
11991 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
11992 Scale /= 2;
11993 EltBits *= 2;
11994 NumElements /= 2;
11995 } while (Scale > 1);
11996 return DAG.getBitcast(VT, InputV);
11997}
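// As a small example of the unpack fallback above: zero-extending the low
// eight bytes of a v16i8 input (Scale == 2, Offset == 0) without SSE4.1
// should take a single UNPCKL of the input with a zero vector, producing
// <0,zz,1,zz,...,7,zz>, which read as v8i16 is the zero-extension of bytes
// 0..7.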
11998
11999/// Try to lower a vector shuffle as a zero extension on any microarch.
12000///
12001/// This routine will try to do everything in its power to cleverly lower
12002/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12003/// check for the profitability of this lowering, it tries to aggressively
12004/// match this pattern. It will use all of the micro-architectural details it
12005/// can to emit an efficient lowering. It handles both blends with all-zero
12006/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12007/// masking out later).
12008///
12009/// The reason we have dedicated lowering for zext-style shuffles is that they
12010/// are both incredibly common and often quite performance sensitive.
12011 static SDValue lowerShuffleAsZeroOrAnyExtend(
12012     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12013 const APInt &Zeroable, const X86Subtarget &Subtarget,
12014 SelectionDAG &DAG) {
12015 int Bits = VT.getSizeInBits();
12016 int NumLanes = Bits / 128;
12017 int NumElements = VT.getVectorNumElements();
12018 int NumEltsPerLane = NumElements / NumLanes;
12019 assert(VT.getScalarSizeInBits() <= 32 &&
12020 "Exceeds 32-bit integer zero extension limit");
12021 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12022
12023 // Define a helper function to check a particular ext-scale and lower to it if
12024 // valid.
12025 auto Lower = [&](int Scale) -> SDValue {
12026 SDValue InputV;
12027 bool AnyExt = true;
12028 int Offset = 0;
12029 int Matches = 0;
12030 for (int i = 0; i < NumElements; ++i) {
12031 int M = Mask[i];
12032 if (M < 0)
12033 continue; // Valid anywhere but doesn't tell us anything.
12034 if (i % Scale != 0) {
12035         // Each of the extended elements needs to be zeroable.
12036 if (!Zeroable[i])
12037 return SDValue();
12038
12039         // We are no longer in the anyext case.
12040 AnyExt = false;
12041 continue;
12042 }
12043
12044       // The base elements must be consecutive indices into the same
12045       // input vector.
12046 SDValue V = M < NumElements ? V1 : V2;
12047 M = M % NumElements;
12048 if (!InputV) {
12049 InputV = V;
12050 Offset = M - (i / Scale);
12051 } else if (InputV != V)
12052 return SDValue(); // Flip-flopping inputs.
12053
12054 // Offset must start in the lowest 128-bit lane or at the start of an
12055 // upper lane.
12056 // FIXME: Is it ever worth allowing a negative base offset?
12057 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12058 (Offset % NumEltsPerLane) == 0))
12059 return SDValue();
12060
12061 // If we are offsetting, all referenced entries must come from the same
12062 // lane.
12063 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12064 return SDValue();
12065
12066 if ((M % NumElements) != (Offset + (i / Scale)))
12067 return SDValue(); // Non-consecutive strided elements.
12068 Matches++;
12069 }
12070
12071 // If we fail to find an input, we have a zero-shuffle which should always
12072 // have already been handled.
12073 // FIXME: Maybe handle this here in case during blending we end up with one?
12074 if (!InputV)
12075 return SDValue();
12076
12077 // If we are offsetting, don't extend if we only match a single input, we
12078 // can always do better by using a basic PSHUF or PUNPCK.
12079 if (Offset != 0 && Matches < 2)
12080 return SDValue();
12081
12082 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
12083 InputV, Mask, Subtarget, DAG);
12084 };
12085
12086 // The widest scale possible for extending is to a 64-bit integer.
12087 assert(Bits % 64 == 0 &&
12088 "The number of bits in a vector must be divisible by 64 on x86!");
12089 int NumExtElements = Bits / 64;
12090
12091 // Each iteration, try extending the elements half as much, but into twice as
12092 // many elements.
12093 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12094 assert(NumElements % NumExtElements == 0 &&
12095 "The input vector size must be divisible by the extended size.");
12096 if (SDValue V = Lower(NumElements / NumExtElements))
12097 return V;
12098 }
12099
12100 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12101 if (Bits != 128)
12102 return SDValue();
12103
12104 // Returns one of the source operands if the shuffle can be reduced to a
12105 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12106 auto CanZExtLowHalf = [&]() {
12107 for (int i = NumElements / 2; i != NumElements; ++i)
12108 if (!Zeroable[i])
12109 return SDValue();
12110 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12111 return V1;
12112 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12113 return V2;
12114 return SDValue();
12115 };
12116
12117 if (SDValue V = CanZExtLowHalf()) {
12118 V = DAG.getBitcast(MVT::v2i64, V);
12119 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12120 return DAG.getBitcast(VT, V);
12121 }
12122
12123 // No viable ext lowering found.
12124 return SDValue();
12125}
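// For instance, a v4i32 shuffle with mask <0,1,zz,zz> (elements 2 and 3
// zeroable) is not a per-element extension, but CanZExtLowHalf matches V1
// and the shuffle should lower to X86ISD::VZEXT_MOVL on v2i64, i.e. a MOVQ
// that copies the low 64 bits and zeroes the upper 64 bits.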
12126
12127/// Try to get a scalar value for a specific element of a vector.
12128///
12129/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12130 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12131                                               SelectionDAG &DAG) {
12132 MVT VT = V.getSimpleValueType();
12133 MVT EltVT = VT.getVectorElementType();
12134 V = peekThroughBitcasts(V);
12135
12136 // If the bitcasts shift the element size, we can't extract an equivalent
12137 // element from it.
12138 MVT NewVT = V.getSimpleValueType();
12139 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12140 return SDValue();
12141
12142 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12143 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12144 // Ensure the scalar operand is the same size as the destination.
12145 // FIXME: Add support for scalar truncation where possible.
12146 SDValue S = V.getOperand(Idx);
12147 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12148 return DAG.getBitcast(EltVT, S);
12149 }
12150
12151 return SDValue();
12152}
12153
12154/// Helper to test for a load that can be folded with x86 shuffles.
12155///
12156/// This is particularly important because the set of instructions varies
12157/// significantly based on whether the operand is a load or not.
12158 static bool isShuffleFoldableLoad(SDValue V) {
12159   return V->hasOneUse() &&
12160          ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12161 }
12162
12163template<typename T>
12164static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12165 T EltVT = VT.getScalarType();
12166 return EltVT == MVT::bf16 || (EltVT == MVT::f16 && !Subtarget.hasFP16());
12167}
12168
12169/// Try to lower insertion of a single element into a zero vector.
12170///
12171 /// This is a common pattern for which we have especially efficient lowerings
12172 /// across all subtarget feature sets.
12173 static SDValue lowerShuffleAsElementInsertion(
12174     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12175 const APInt &Zeroable, const X86Subtarget &Subtarget,
12176 SelectionDAG &DAG) {
12177 MVT ExtVT = VT;
12178 MVT EltVT = VT.getVectorElementType();
12179 unsigned NumElts = VT.getVectorNumElements();
12180 unsigned EltBits = VT.getScalarSizeInBits();
12181
12182 if (isSoftF16(EltVT, Subtarget))
12183 return SDValue();
12184
12185 int V2Index =
12186 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12187 Mask.begin();
12188 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12189 bool IsV1Zeroable = true;
12190 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12191 if (i != V2Index && !Zeroable[i]) {
12192 IsV1Zeroable = false;
12193 break;
12194 }
12195
12196 // Bail if a non-zero V1 isn't used in place.
12197 if (!IsV1Zeroable) {
12198 SmallVector<int, 8> V1Mask(Mask);
12199 V1Mask[V2Index] = -1;
12200 if (!isNoopShuffleMask(V1Mask))
12201 return SDValue();
12202 }
12203
12204 // Check for a single input from a SCALAR_TO_VECTOR node.
12205 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12206 // all the smarts here sunk into that routine. However, the current
12207 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12208 // vector shuffle lowering is dead.
12209 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12210 DAG);
12211 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12212 // We need to zext the scalar if it is smaller than an i32.
12213 V2S = DAG.getBitcast(EltVT, V2S);
12214 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12215 // Using zext to expand a narrow element won't work for non-zero
12216 // insertions. But we can use a masked constant vector if we're
12217 // inserting V2 into the bottom of V1.
12218 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12219 return SDValue();
12220
12221 // Zero-extend directly to i32.
12222 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12223 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12224
12225 // If we're inserting into a constant, mask off the inserted index
12226 // and OR with the zero-extended scalar.
12227 if (!IsV1Zeroable) {
12228 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12229 Bits[V2Index] = APInt::getZero(EltBits);
12230 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12231 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12232 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12233 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12234 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12235 }
12236 }
12237 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12238 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12239 EltVT == MVT::i16) {
12240 // Either not inserting from the low element of the input or the input
12241 // element size is too small to use VZEXT_MOVL to clear the high bits.
12242 return SDValue();
12243 }
12244
12245 if (!IsV1Zeroable) {
12246 // If V1 can't be treated as a zero vector we have fewer options to lower
12247 // this. We can't support integer vectors or non-zero targets cheaply.
12248 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12249 if (!VT.isFloatingPoint() || V2Index != 0)
12250 return SDValue();
12251 if (!VT.is128BitVector())
12252 return SDValue();
12253
12254 // Otherwise, use MOVSD, MOVSS or MOVSH.
12255 unsigned MovOpc = 0;
12256 if (EltVT == MVT::f16)
12257 MovOpc = X86ISD::MOVSH;
12258 else if (EltVT == MVT::f32)
12259 MovOpc = X86ISD::MOVSS;
12260 else if (EltVT == MVT::f64)
12261 MovOpc = X86ISD::MOVSD;
12262 else
12263 llvm_unreachable("Unsupported floating point element type to handle!");
12264 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12265 }
12266
12267 // This lowering only works for the low element with floating point vectors.
12268 if (VT.isFloatingPoint() && V2Index != 0)
12269 return SDValue();
12270
12271 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12272 if (ExtVT != VT)
12273 V2 = DAG.getBitcast(VT, V2);
12274
12275 if (V2Index != 0) {
12276 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12277 // the desired position. Otherwise it is more efficient to do a vector
12278 // shift left. We know that we can do a vector shift left because all
12279 // the inputs are zero.
12280 if (VT.isFloatingPoint() || NumElts <= 4) {
12281 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12282 V2Shuffle[V2Index] = 0;
12283 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12284 } else {
12285 V2 = DAG.getBitcast(MVT::v16i8, V2);
12286 V2 = DAG.getNode(
12287 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12288 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12289 V2 = DAG.getBitcast(VT, V2);
12290 }
12291 }
12292 return V2;
12293}
12294
12295/// Try to lower broadcast of a single - truncated - integer element,
12296/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12297///
12298/// This assumes we have AVX2.
12299 static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12300                                             int BroadcastIdx,
12301 const X86Subtarget &Subtarget,
12302 SelectionDAG &DAG) {
12303 assert(Subtarget.hasAVX2() &&
12304 "We can only lower integer broadcasts with AVX2!");
12305
12306 MVT EltVT = VT.getVectorElementType();
12307 MVT V0VT = V0.getSimpleValueType();
12308
12309 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12310 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12311
12312 MVT V0EltVT = V0VT.getVectorElementType();
12313 if (!V0EltVT.isInteger())
12314 return SDValue();
12315
12316 const unsigned EltSize = EltVT.getSizeInBits();
12317 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12318
12319 // This is only a truncation if the original element type is larger.
12320 if (V0EltSize <= EltSize)
12321 return SDValue();
12322
12323 assert(((V0EltSize % EltSize) == 0) &&
12324 "Scalar type sizes must all be powers of 2 on x86!");
12325
12326 const unsigned V0Opc = V0.getOpcode();
12327 const unsigned Scale = V0EltSize / EltSize;
12328 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12329
12330 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12331 V0Opc != ISD::BUILD_VECTOR)
12332 return SDValue();
12333
12334 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12335
12336 // If we're extracting non-least-significant bits, shift so we can truncate.
12337 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12338 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12339 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12340 if (const int OffsetIdx = BroadcastIdx % Scale)
12341 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12342 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12343
12344 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12345 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12346}
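// Worked example: broadcasting element 1 of a v8i16 value whose source is a
// v4i32 BUILD_VECTOR gives Scale == 2, V0BroadcastIdx == 0 and
// OffsetIdx == 1, so the i32 scalar is shifted right by 16, truncated to i16
// and then broadcast with X86ISD::VBROADCAST (a VPBROADCASTW under AVX2).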
12347
12348/// Test whether this can be lowered with a single SHUFPS instruction.
12349///
12350/// This is used to disable more specialized lowerings when the shufps lowering
12351/// will happen to be efficient.
12352 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12353   // This routine only handles 128-bit shufps.
12354 assert(Mask.size() == 4 && "Unsupported mask size!");
12355 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12356 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12357 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12358 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12359
12360 // To lower with a single SHUFPS we need to have the low half and high half
12361 // each requiring a single input.
12362 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12363 return false;
12364 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12365 return false;
12366
12367 return true;
12368}
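// For example, <0,1,4,5> is a single-SHUFPS mask (the low half reads only V1
// and the high half only V2), while <0,4,1,5> is not, because its low half
// mixes both inputs.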
12369
12370/// Test whether the specified input (0 or 1) is in-place blended by the
12371/// given mask.
12372///
12373/// This returns true if the elements from a particular input are already in the
12374/// slot required by the given mask and require no permutation.
12375static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12376 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12377 int Size = Mask.size();
12378 for (int i = 0; i < Size; ++i)
12379 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12380 return false;
12381
12382 return true;
12383}
12384
12385/// If we are extracting two 128-bit halves of a vector and shuffling the
12386/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12387/// multi-shuffle lowering.
12388 static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12389                                              SDValue N1, ArrayRef<int> Mask,
12390 SelectionDAG &DAG) {
12391 MVT VT = N0.getSimpleValueType();
12392 assert((VT.is128BitVector() &&
12393 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12394 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12395
12396 // Check that both sources are extracts of the same source vector.
12397 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12398       N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12399       N0.getOperand(0) != N1.getOperand(0) ||
12400 !N0.hasOneUse() || !N1.hasOneUse())
12401 return SDValue();
12402
12403 SDValue WideVec = N0.getOperand(0);
12404 MVT WideVT = WideVec.getSimpleValueType();
12405 if (!WideVT.is256BitVector())
12406 return SDValue();
12407
12408 // Match extracts of each half of the wide source vector. Commute the shuffle
12409 // if the extract of the low half is N1.
12410 unsigned NumElts = VT.getVectorNumElements();
12411 SmallVector<int, 4> NewMask(Mask);
12412 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12413 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12414 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12415     ShuffleVectorSDNode::commuteMask(NewMask);
12416   else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12417 return SDValue();
12418
12419 // Final bailout: if the mask is simple, we are better off using an extract
12420 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12421 // because that avoids a constant load from memory.
12422 if (NumElts == 4 &&
12423 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12424 return SDValue();
12425
12426 // Extend the shuffle mask with undef elements.
12427 NewMask.append(NumElts, -1);
12428
12429 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12430 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12431 NewMask);
12432 // This is free: ymm -> xmm.
12433 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12434 DAG.getIntPtrConstant(0, DL));
12435}
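// Illustrative case: a v4f32 shuffle of (extract_subvector X, 0) and
// (extract_subvector X, 4) with mask <0,4,1,6> is neither a single-SHUFPS
// nor an unpack mask, so it becomes a v8f32 shuffle of X with mask
// <0,4,1,6,u,u,u,u> (a single VPERMPS on AVX2) followed by a free ymm->xmm
// extract.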
12436
12437/// Try to lower broadcast of a single element.
12438///
12439/// For convenience, this code also bundles all of the subtarget feature set
12440/// filtering. While a little annoying to re-dispatch on type here, there isn't
12441/// a convenient way to factor it out.
12442 static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
12443                                        SDValue V2, ArrayRef<int> Mask,
12444 const X86Subtarget &Subtarget,
12445 SelectionDAG &DAG) {
12446 MVT EltVT = VT.getVectorElementType();
12447 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12448 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12449 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
12450 return SDValue();
12451
12452 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12453 // we can only broadcast from a register with AVX2.
12454 unsigned NumEltBits = VT.getScalarSizeInBits();
12455 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12456                         ? X86ISD::MOVDDUP
12457                         : X86ISD::VBROADCAST;
12458   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12459
12460 // Check that the mask is a broadcast.
12461 int BroadcastIdx = getSplatIndex(Mask);
12462 if (BroadcastIdx < 0)
12463 return SDValue();
12464 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12465 "a sorted mask where the broadcast "
12466 "comes from V1.");
12467
12468 // Go up the chain of (vector) values to find a scalar load that we can
12469 // combine with the broadcast.
12470 // TODO: Combine this logic with findEltLoadSrc() used by
12471 // EltsFromConsecutiveLoads().
12472 int BitOffset = BroadcastIdx * NumEltBits;
12473 SDValue V = V1;
12474 for (;;) {
12475 switch (V.getOpcode()) {
12476 case ISD::BITCAST: {
12477 V = V.getOperand(0);
12478 continue;
12479 }
12480 case ISD::CONCAT_VECTORS: {
12481 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12482 int OpIdx = BitOffset / OpBitWidth;
12483 V = V.getOperand(OpIdx);
12484 BitOffset %= OpBitWidth;
12485 continue;
12486 }
12487     case ISD::EXTRACT_SUBVECTOR: {
12488       // The extraction index adds to the existing offset.
12489 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12490 unsigned Idx = V.getConstantOperandVal(1);
12491 unsigned BeginOffset = Idx * EltBitWidth;
12492 BitOffset += BeginOffset;
12493 V = V.getOperand(0);
12494 continue;
12495 }
12496 case ISD::INSERT_SUBVECTOR: {
12497 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12498 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12499 int Idx = (int)V.getConstantOperandVal(2);
12500 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12501 int BeginOffset = Idx * EltBitWidth;
12502 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12503 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12504 BitOffset -= BeginOffset;
12505 V = VInner;
12506 } else {
12507 V = VOuter;
12508 }
12509 continue;
12510 }
12511 }
12512 break;
12513 }
12514 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12515 BroadcastIdx = BitOffset / NumEltBits;
12516
12517 // Do we need to bitcast the source to retrieve the original broadcast index?
12518 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12519
12520 // Check if this is a broadcast of a scalar. We special case lowering
12521 // for scalars so that we can more effectively fold with loads.
12522 // If the original value has a larger element type than the shuffle, the
12523 // broadcast element is in essence truncated. Make that explicit to ease
12524 // folding.
12525 if (BitCastSrc && VT.isInteger())
12526 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12527 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12528 return TruncBroadcast;
12529
12530 // Also check the simpler case, where we can directly reuse the scalar.
12531 if (!BitCastSrc &&
12532 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12533 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12534 V = V.getOperand(BroadcastIdx);
12535
12536 // If we can't broadcast from a register, check that the input is a load.
12537 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
12538 return SDValue();
12539 } else if (ISD::isNormalLoad(V.getNode()) &&
12540 cast<LoadSDNode>(V)->isSimple()) {
12541 // We do not check for one-use of the vector load because a broadcast load
12542 // is expected to be a win for code size, register pressure, and possibly
12543 // uops even if the original vector load is not eliminated.
12544
12545 // Reduce the vector load and shuffle to a broadcasted scalar load.
12546 LoadSDNode *Ld = cast<LoadSDNode>(V);
12547 SDValue BaseAddr = Ld->getOperand(1);
12548 MVT SVT = VT.getScalarType();
12549 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
12550 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12551 SDValue NewAddr =
12552         DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
12553
12554 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
12555 // than MOVDDUP.
12556 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12557 if (Opcode == X86ISD::VBROADCAST) {
12558 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
12559 SDValue Ops[] = {Ld->getChain(), NewAddr};
12560 V = DAG.getMemIntrinsicNode(
12561 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
12562           DAG.getMachineFunction().getMachineMemOperand(
12563               Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12564       DAG.makeEquivalentMemoryOrdering(Ld, V);
12565 return DAG.getBitcast(VT, V);
12566 }
12567 assert(SVT == MVT::f64 && "Unexpected VT!");
12568 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12569                     DAG.getMachineFunction().getMachineMemOperand(
12570                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12571     DAG.makeEquivalentMemoryOrdering(Ld, V);
12572 } else if (!BroadcastFromReg) {
12573 // We can't broadcast from a vector register.
12574 return SDValue();
12575 } else if (BitOffset != 0) {
12576 // We can only broadcast from the zero-element of a vector register,
12577 // but it can be advantageous to broadcast from the zero-element of a
12578 // subvector.
12579 if (!VT.is256BitVector() && !VT.is512BitVector())
12580 return SDValue();
12581
12582 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12583 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12584 return SDValue();
12585
12586 // Only broadcast the zero-element of a 128-bit subvector.
12587 if ((BitOffset % 128) != 0)
12588 return SDValue();
12589
12590 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12591 "Unexpected bit-offset");
12592 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12593 "Unexpected vector size");
12594 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12595 V = extract128BitVector(V, ExtractIdx, DAG, DL);
12596 }
12597
12598 // On AVX we can use VBROADCAST directly for scalar sources.
12599 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
12600 V = DAG.getBitcast(MVT::f64, V);
12601 if (Subtarget.hasAVX()) {
12602 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
12603 return DAG.getBitcast(VT, V);
12604 }
12605 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
12606 }
12607
12608 // If this is a scalar, do the broadcast on this type and bitcast.
12609 if (!V.getValueType().isVector()) {
12610 assert(V.getScalarValueSizeInBits() == NumEltBits &&
12611 "Unexpected scalar size");
12612 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
12613                                        VT.getVectorNumElements());
12614     return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
12615 }
12616
12617 // We only support broadcasting from 128-bit vectors to minimize the
12618 // number of patterns we need to deal with in isel. So extract down to
12619 // 128-bits, removing as many bitcasts as possible.
12620 if (V.getValueSizeInBits() > 128)
12621     V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
12622
12623 // Otherwise cast V to a vector with the same element type as VT, but
12624 // possibly narrower than VT. Then perform the broadcast.
12625 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
12626 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
12627 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
12628}
12629
12630// Check for whether we can use INSERTPS to perform the shuffle. We only use
12631// INSERTPS when the V1 elements are already in the correct locations
12632// because otherwise we can just always use two SHUFPS instructions which
12633// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
12634// perform INSERTPS if a single V1 element is out of place and all V2
12635// elements are zeroable.
12636 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
12637                                    unsigned &InsertPSMask,
12638 const APInt &Zeroable,
12639 ArrayRef<int> Mask, SelectionDAG &DAG) {
12640 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
12641 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
12642 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12643
12644 // Attempt to match INSERTPS with one element from VA or VB being
12645 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
12646 // are updated.
12647 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
12648 ArrayRef<int> CandidateMask) {
12649 unsigned ZMask = 0;
12650 int VADstIndex = -1;
12651 int VBDstIndex = -1;
12652 bool VAUsedInPlace = false;
12653
12654 for (int i = 0; i < 4; ++i) {
12655 // Synthesize a zero mask from the zeroable elements (includes undefs).
12656 if (Zeroable[i]) {
12657 ZMask |= 1 << i;
12658 continue;
12659 }
12660
12661 // Flag if we use any VA inputs in place.
12662 if (i == CandidateMask[i]) {
12663 VAUsedInPlace = true;
12664 continue;
12665 }
12666
12667 // We can only insert a single non-zeroable element.
12668 if (VADstIndex >= 0 || VBDstIndex >= 0)
12669 return false;
12670
12671 if (CandidateMask[i] < 4) {
12672 // VA input out of place for insertion.
12673 VADstIndex = i;
12674 } else {
12675 // VB input for insertion.
12676 VBDstIndex = i;
12677 }
12678 }
12679
12680 // Don't bother if we have no (non-zeroable) element for insertion.
12681 if (VADstIndex < 0 && VBDstIndex < 0)
12682 return false;
12683
12684 // Determine element insertion src/dst indices. The src index is from the
12685 // start of the inserted vector, not the start of the concatenated vector.
12686 unsigned VBSrcIndex = 0;
12687 if (VADstIndex >= 0) {
12688 // If we have a VA input out of place, we use VA as the V2 element
12689 // insertion and don't use the original V2 at all.
12690 VBSrcIndex = CandidateMask[VADstIndex];
12691 VBDstIndex = VADstIndex;
12692 VB = VA;
12693 } else {
12694 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
12695 }
12696
12697 // If no V1 inputs are used in place, then the result is created only from
12698 // the zero mask and the V2 insertion - so remove V1 dependency.
12699 if (!VAUsedInPlace)
12700 VA = DAG.getUNDEF(MVT::v4f32);
12701
12702 // Update V1, V2 and InsertPSMask accordingly.
12703 V1 = VA;
12704 V2 = VB;
12705
12706 // Insert the V2 element into the desired position.
12707 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
12708 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
12709 return true;
12710 };
12711
12712 if (matchAsInsertPS(V1, V2, Mask))
12713 return true;
12714
12715 // Commute and try again.
12716 SmallVector<int, 4> CommutedMask(Mask);
12717   ShuffleVectorSDNode::commuteMask(CommutedMask);
12718   if (matchAsInsertPS(V2, V1, CommutedMask))
12719 return true;
12720
12721 return false;
12722}
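// The INSERTPS immediate built above packs the V2 source lane in bits [7:6],
// the destination lane in bits [5:4] and the zero mask in bits [3:0]. For
// example, the v4f32 mask <0,6,2,3> (insert element 2 of V2 into lane 1 of
// V1, nothing zeroable) yields InsertPSMask == (2 << 6) | (1 << 4) == 0xA0.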
12723
12724 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
12725                                       ArrayRef<int> Mask, const APInt &Zeroable,
12726 SelectionDAG &DAG) {
12727 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12728 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12729
12730 // Attempt to match the insertps pattern.
12731 unsigned InsertPSMask = 0;
12732 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
12733 return SDValue();
12734
12735 // Insert the V2 element into the desired position.
12736 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
12737 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
12738}
12739
12740/// Handle lowering of 2-lane 64-bit floating point shuffles.
12741///
12742/// This is the basis function for the 2-lane 64-bit shuffles as we have full
12743/// support for floating point shuffles but not integer shuffles. These
12744/// instructions will incur a domain crossing penalty on some chips though so
12745/// it is better to avoid lowering through this for integer vectors where
12746/// possible.
12747 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12748                                  const APInt &Zeroable, SDValue V1, SDValue V2,
12749 const X86Subtarget &Subtarget,
12750 SelectionDAG &DAG) {
12751 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12752 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12753 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12754
12755 if (V2.isUndef()) {
12756 // Check for being able to broadcast a single element.
12757 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
12758 Mask, Subtarget, DAG))
12759 return Broadcast;
12760
12761 // Straight shuffle of a single input vector. Simulate this by using the
12762     // single input as both of the "inputs" to this instruction.
12763 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
12764
12765 if (Subtarget.hasAVX()) {
12766 // If we have AVX, we can use VPERMILPS which will allow folding a load
12767 // into the shuffle.
12768 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
12769 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12770 }
12771
12772 return DAG.getNode(
12773 X86ISD::SHUFP, DL, MVT::v2f64,
12774 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12775 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12776 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12777 }
12778 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12779 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12780 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12781 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12782
12783 if (Subtarget.hasAVX2())
12784 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12785 return Extract;
12786
12787 // When loading a scalar and then shuffling it into a vector we can often do
12788 // the insertion cheaply.
12789   if (SDValue Insertion = lowerShuffleAsElementInsertion(
12790           DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12791 return Insertion;
12792 // Try inverting the insertion since for v2 masks it is easy to do and we
12793 // can't reliably sort the mask one way or the other.
12794 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
12795 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
12796   if (SDValue Insertion = lowerShuffleAsElementInsertion(
12797           DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12798 return Insertion;
12799
12800 // Try to use one of the special instruction patterns to handle two common
12801 // blend patterns if a zero-blend above didn't work.
12802 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
12803 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
12804 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
12805 // We can either use a special instruction to load over the low double or
12806 // to move just the low double.
12807 return DAG.getNode(
12808 X86ISD::MOVSD, DL, MVT::v2f64, V2,
12809 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
12810
12811 if (Subtarget.hasSSE41())
12812 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
12813 Zeroable, Subtarget, DAG))
12814 return Blend;
12815
12816 // Use dedicated unpack instructions for masks that match their pattern.
12817 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
12818 return V;
12819
12820 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
12821 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
12822 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12823}
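// As an example of the final SHUFPD fallback above: if no earlier pattern
// matched, the two-input mask <0,3> gives SHUFPDMask == 2, i.e. SHUFPD takes
// element 0 from V1 and element 1 from V2.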
12824
12825/// Handle lowering of 2-lane 64-bit integer shuffles.
12826///
12827/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
12828/// the integer unit to minimize domain crossing penalties. However, for blends
12829/// it falls back to the floating point shuffle operation with appropriate bit
12830/// casting.
12831 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12832                                  const APInt &Zeroable, SDValue V1, SDValue V2,
12833 const X86Subtarget &Subtarget,
12834 SelectionDAG &DAG) {
12835 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12836 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12837 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12838
12839 if (V2.isUndef()) {
12840 // Check for being able to broadcast a single element.
12841 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
12842 Mask, Subtarget, DAG))
12843 return Broadcast;
12844
12845 // Straight shuffle of a single input vector. For everything from SSE2
12846 // onward this has a single fast instruction with no scary immediates.
12847 // We have to map the mask as it is actually a v4i32 shuffle instruction.
12848 V1 = DAG.getBitcast(MVT::v4i32, V1);
12849 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
12850 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
12851 Mask[1] < 0 ? -1 : (Mask[1] * 2),
12852 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
12853 return DAG.getBitcast(
12854 MVT::v2i64,
12855 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
12856 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
12857 }
12858 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
12859 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
12860 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12861 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12862
12863 if (Subtarget.hasAVX2())
12864 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12865 return Extract;
12866
12867 // Try to use shift instructions.
12868 if (SDValue Shift =
12869 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
12870 DAG, /*BitwiseOnly*/ false))
12871 return Shift;
12872
12873 // When loading a scalar and then shuffling it into a vector we can often do
12874 // the insertion cheaply.
12875   if (SDValue Insertion = lowerShuffleAsElementInsertion(
12876           DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12877 return Insertion;
12878 // Try inverting the insertion since for v2 masks it is easy to do and we
12879 // can't reliably sort the mask one way or the other.
12880 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
12881   if (SDValue Insertion = lowerShuffleAsElementInsertion(
12882           DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12883 return Insertion;
12884
12885 // We have different paths for blend lowering, but they all must use the
12886 // *exact* same predicate.
12887 bool IsBlendSupported = Subtarget.hasSSE41();
12888 if (IsBlendSupported)
12889 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
12890 Zeroable, Subtarget, DAG))
12891 return Blend;
12892
12893 // Use dedicated unpack instructions for masks that match their pattern.
12894 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
12895 return V;
12896
12897 // Try to use byte rotation instructions.
12898   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
12899 if (Subtarget.hasSSSE3()) {
12900 if (Subtarget.hasVLX())
12901 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
12902 Zeroable, Subtarget, DAG))
12903 return Rotate;
12904
12905 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
12906 Subtarget, DAG))
12907 return Rotate;
12908 }
12909
12910 // If we have direct support for blends, we should lower by decomposing into
12911 // a permute. That will be faster than the domain cross.
12912 if (IsBlendSupported)
12913 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
12914 Subtarget, DAG);
12915
12916 // We implement this with SHUFPD which is pretty lame because it will likely
12917 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
12918 // However, all the alternatives are still more cycles and newer chips don't
12919 // have this problem. It would be really nice if x86 had better shuffles here.
12920 V1 = DAG.getBitcast(MVT::v2f64, V1);
12921 V2 = DAG.getBitcast(MVT::v2f64, V2);
12922 return DAG.getBitcast(MVT::v2i64,
12923 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
12924}
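// For the single-input path above, the v2i64 mask <1,0> widens to the v4i32
// mask <2,3,0,1> and should lower to PSHUFD $0x4E, the usual swap of the two
// quadwords.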
12925
12926/// Lower a vector shuffle using the SHUFPS instruction.
12927///
12928/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
12929 /// It makes no assumptions about whether this is the *best* lowering; it
12930 /// simply uses it.
12931 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
12932                                       ArrayRef<int> Mask, SDValue V1,
12933 SDValue V2, SelectionDAG &DAG) {
12934 SDValue LowV = V1, HighV = V2;
12935 SmallVector<int, 4> NewMask(Mask);
12936 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
12937
12938 if (NumV2Elements == 1) {
12939 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
12940
12941 // Compute the index adjacent to V2Index and in the same half by toggling
12942 // the low bit.
12943 int V2AdjIndex = V2Index ^ 1;
12944
12945 if (Mask[V2AdjIndex] < 0) {
12946 // Handles all the cases where we have a single V2 element and an undef.
12947 // This will only ever happen in the high lanes because we commute the
12948 // vector otherwise.
12949 if (V2Index < 2)
12950 std::swap(LowV, HighV);
12951 NewMask[V2Index] -= 4;
12952 } else {
12953 // Handle the case where the V2 element ends up adjacent to a V1 element.
12954 // To make this work, blend them together as the first step.
12955 int V1Index = V2AdjIndex;
12956 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
12957 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
12958 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
12959
12960 // Now proceed to reconstruct the final blend as we have the necessary
12961 // high or low half formed.
12962 if (V2Index < 2) {
12963 LowV = V2;
12964 HighV = V1;
12965 } else {
12966 HighV = V2;
12967 }
12968 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
12969 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
12970 }
12971 } else if (NumV2Elements == 2) {
12972 if (Mask[0] < 4 && Mask[1] < 4) {
12973 // Handle the easy case where we have V1 in the low lanes and V2 in the
12974 // high lanes.
12975 NewMask[2] -= 4;
12976 NewMask[3] -= 4;
12977 } else if (Mask[2] < 4 && Mask[3] < 4) {
12978 // We also handle the reversed case because this utility may get called
12979 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
12980 // arrange things in the right direction.
12981 NewMask[0] -= 4;
12982 NewMask[1] -= 4;
12983 HighV = V1;
12984 LowV = V2;
12985 } else {
12986 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
12987 // trying to place elements directly, just blend them and set up the final
12988 // shuffle to place them.
12989
12990 // The first two blend mask elements are for V1, the second two are for
12991 // V2.
12992 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
12993 Mask[2] < 4 ? Mask[2] : Mask[3],
12994 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
12995 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
12996 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12997 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
12998
12999 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13000 // a blend.
13001 LowV = HighV = V1;
13002 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13003 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13004 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13005 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13006 }
13007 } else if (NumV2Elements == 3) {
13008 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13009     // we can get here due to other paths (e.g. repeated mask matching) where we
13010     // don't want to do another round of lowerVECTOR_SHUFFLE.
13011     ShuffleVectorSDNode::commuteMask(NewMask);
13012     return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13013 }
13014 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13015 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13016}
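// Concretely, for the two-elements-from-each-input case above: mask <0,1,4,5>
// with V1 supplying the low half becomes NewMask <0,1,0,1>, so a single
// SHUFPS of V1 and V2 produces <V1[0],V1[1],V2[0],V2[1]>.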
13017
13018/// Lower 4-lane 32-bit floating point shuffles.
13019///
13020/// Uses instructions exclusively from the floating point unit to minimize
13021/// domain crossing penalties, as these are sufficient to implement all v4f32
13022/// shuffles.
13023 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13024 const APInt &Zeroable, SDValue V1, SDValue V2,
13025 const X86Subtarget &Subtarget,
13026 SelectionDAG &DAG) {
13027 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13028 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13029 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13030
13031 if (Subtarget.hasSSE41())
13032 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13033 Zeroable, Subtarget, DAG))
13034 return Blend;
13035
13036 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13037
13038 if (NumV2Elements == 0) {
13039 // Check for being able to broadcast a single element.
13040 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13041 Mask, Subtarget, DAG))
13042 return Broadcast;
13043
13044 // Use even/odd duplicate instructions for masks that match their pattern.
13045 if (Subtarget.hasSSE3()) {
13046 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13047 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13048 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13049 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13050 }
13051
13052 if (Subtarget.hasAVX()) {
13053 // If we have AVX, we can use VPERMILPS which will allow folding a load
13054 // into the shuffle.
13055 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13056 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13057 }
13058
13059 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13060 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13061 if (!Subtarget.hasSSE2()) {
13062 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13063 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13064 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13065 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13066 }
13067
13068 // Otherwise, use a straight shuffle of a single input vector. We pass the
13069 // input vector to both operands to simulate this with a SHUFPS.
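// (getV4X86ShuffleImm8ForMask presumably packs the four 2-bit lane indices
// into the standard x86 shuffle immediate, element 0 in bits 1:0, so e.g.
// Mask = <3,1,2,0> encodes as 0x27.)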
13070 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13071 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13072 }
13073
13074 if (Subtarget.hasSSE2())
13075 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13076 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13077 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13078 return ZExt;
13079 }
13080
13081 if (Subtarget.hasAVX2())
13082 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13083 return Extract;
13084
13085 // There are special ways we can lower some single-element blends. However, we
13086 // have custom ways we can lower more complex single-element blends below that
13087 // we defer to if both this and BLENDPS fail to match, so restrict this to
13088 // when the V2 input is targeting element 0 of the mask -- that is the fast
13089 // case here.
13090 if (NumV2Elements == 1 && Mask[0] >= 4)
13091 if (SDValue V = lowerShuffleAsElementInsertion(
13092 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13093 return V;
13094
13095 if (Subtarget.hasSSE41()) {
13096 // Use INSERTPS if we can complete the shuffle efficiently.
13097 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13098 return V;
13099
13100 if (!isSingleSHUFPSMask(Mask))
13101 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13102 V2, Mask, DAG))
13103 return BlendPerm;
13104 }
13105
13106 // Use low/high mov instructions. These are only valid in SSE1 because
13107 // otherwise they are widened to v2f64 and never get here.
13108 if (!Subtarget.hasSSE2()) {
13109 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13110 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13111 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13112 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13113 }
13114
13115 // Use dedicated unpack instructions for masks that match their pattern.
13116 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
13117 return V;
13118
13119 // Otherwise fall back to a SHUFPS lowering strategy.
13120 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13121}
13122
13123/// Lower 4-lane i32 vector shuffles.
13124///
13125/// We try to handle these with integer-domain shuffles where we can, but for
13126/// blends we use the floating point domain blend instructions.
13127 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13128 const APInt &Zeroable, SDValue V1, SDValue V2,
13129 const X86Subtarget &Subtarget,
13130 SelectionDAG &DAG) {
13131 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13132 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13133 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13134
13135 // Whenever we can lower this as a zext, that instruction is strictly faster
13136 // than any alternative. It also allows us to fold memory operands into the
13137 // shuffle in many cases.
13138 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13139 Zeroable, Subtarget, DAG))
13140 return ZExt;
13141
13142 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13143
13144 // Try to use shift instructions if fast.
13145 if (Subtarget.preferLowerShuffleAsShift()) {
13146 if (SDValue Shift =
13147 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13148 Subtarget, DAG, /*BitwiseOnly*/ true))
13149 return Shift;
13150 if (NumV2Elements == 0)
13151 if (SDValue Rotate =
13152 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13153 return Rotate;
13154 }
13155
13156 if (NumV2Elements == 0) {
13157 // Try to use broadcast unless the mask only has one non-undef element.
13158 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13159 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13160 Mask, Subtarget, DAG))
13161 return Broadcast;
13162 }
13163
13164 // Straight shuffle of a single input vector. For everything from SSE2
13165 // onward this has a single fast instruction with no scary immediates.
13166 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13167 // but we aren't actually going to use the UNPCK instruction because doing
13168 // so prevents folding a load into this instruction or making a copy.
13169 const int UnpackLoMask[] = {0, 0, 1, 1};
13170 const int UnpackHiMask[] = {2, 2, 3, 3};
13171 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13172 Mask = UnpackLoMask;
13173 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13174 Mask = UnpackHiMask;
13175
13176 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13177 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13178 }
13179
13180 if (Subtarget.hasAVX2())
13181 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13182 return Extract;
13183
13184 // Try to use shift instructions.
13185 if (SDValue Shift =
13186 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13187 DAG, /*BitwiseOnly*/ false))
13188 return Shift;
13189
13190 // There are special ways we can lower some single-element blends.
13191 if (NumV2Elements == 1)
13192 if (SDValue V = lowerShuffleAsElementInsertion(
13193 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13194 return V;
13195
13196 // We have different paths for blend lowering, but they all must use the
13197 // *exact* same predicate.
13198 bool IsBlendSupported = Subtarget.hasSSE41();
13199 if (IsBlendSupported)
13200 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13201 Zeroable, Subtarget, DAG))
13202 return Blend;
13203
13204 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13205 Zeroable, Subtarget, DAG))
13206 return Masked;
13207
13208 // Use dedicated unpack instructions for masks that match their pattern.
13209 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
13210 return V;
13211
13212 // Try to use byte rotation instructions.
13213 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13214 if (Subtarget.hasSSSE3()) {
13215 if (Subtarget.hasVLX())
13216 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13217 Zeroable, Subtarget, DAG))
13218 return Rotate;
13219
13220 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13221 Subtarget, DAG))
13222 return Rotate;
13223 }
13224
13225 // Assume that a single SHUFPS is faster than an alternative sequence of
13226 // multiple instructions (even if the CPU has a domain penalty).
13227 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13228 if (!isSingleSHUFPSMask(Mask)) {
13229 // If we have direct support for blends, we should lower by decomposing into
13230 // a permute. That will be faster than the domain cross.
13231 if (IsBlendSupported)
13232 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13233 Subtarget, DAG);
13234
13235 // Try to lower by permuting the inputs into an unpack instruction.
13236 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13237 Mask, Subtarget, DAG))
13238 return Unpack;
13239 }
13240
13241 // We implement this with SHUFPS because it can blend from two vectors.
13242 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13243 // up the inputs, bypassing domain shift penalties that we would incur if we
13244 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13245 // relevant.
13246 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13247 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13248 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13249 return DAG.getBitcast(MVT::v4i32, ShufPS);
13250}
13251
13252/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13253/// shuffle lowering, and the most complex part.
13254///
13255/// The lowering strategy is to try to form pairs of input lanes which are
13256/// targeted at the same half of the final vector, and then use a dword shuffle
13257/// to place them onto the right half, and finally unpack the paired lanes into
13258/// their final position.
13259///
13260/// The exact breakdown of how to form these dword pairs and align them on the
13261/// correct sides is really tricky. See the comments within the function for
13262/// more of the details.
13263///
13264/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13265/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13266/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13267/// vector, form the analogous 128-bit 8-element Mask.
13268 static SDValue lowerV8I16GeneralSingleInputShuffle(
13269 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13270 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13271 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13272 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13273
13274 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13275 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13276 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13277
13278 // Attempt to directly match PSHUFLW or PSHUFHW.
13279 if (isUndefOrInRange(LoMask, 0, 4) &&
13280 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13281 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13282 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13283 }
13284 if (isUndefOrInRange(HiMask, 4, 8) &&
13285 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13286 for (int i = 0; i != 4; ++i)
13287 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13288 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13289 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13290 }
13291
13292 SmallVector<int, 4> LoInputs;
13293 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13294 array_pod_sort(LoInputs.begin(), LoInputs.end());
13295 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
13296 SmallVector<int, 4> HiInputs;
13297 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13298 array_pod_sort(HiInputs.begin(), HiInputs.end());
13299 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
13300 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13301 int NumHToL = LoInputs.size() - NumLToL;
13302 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13303 int NumHToH = HiInputs.size() - NumLToH;
13304 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13305 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13306 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13307 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13308
13309 // If we are shuffling values from one half - check how many different DWORD
13310 // pairs we need to create. If only 1 or 2 then we can perform this as a
13311 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
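// Illustrative sketch (assuming V = [a,b,c,d,e,f,g,h]): the low-half-only
// mask <2,3,0,1,2,3,0,1> collects just two dword pairs, (2,3) and (0,1), so
// a single PSHUFLW<2,3,0,1> followed by PSHUFD<0,1,0,1> reproduces the whole
// mask.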
13312 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13313 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13314 V = DAG.getNode(ShufWOp, DL, VT, V,
13315 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13316 V = DAG.getBitcast(PSHUFDVT, V);
13317 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13318 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13319 return DAG.getBitcast(VT, V);
13320 };
13321
13322 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13323 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13324 SmallVector<std::pair<int, int>, 4> DWordPairs;
13325 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13326
13327 // Collect the different DWORD pairs.
13328 for (int DWord = 0; DWord != 4; ++DWord) {
13329 int M0 = Mask[2 * DWord + 0];
13330 int M1 = Mask[2 * DWord + 1];
13331 M0 = (M0 >= 0 ? M0 % 4 : M0);
13332 M1 = (M1 >= 0 ? M1 % 4 : M1);
13333 if (M0 < 0 && M1 < 0)
13334 continue;
13335
13336 bool Match = false;
13337 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13338 auto &DWordPair = DWordPairs[j];
13339 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13340 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13341 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13342 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13343 PSHUFDMask[DWord] = DOffset + j;
13344 Match = true;
13345 break;
13346 }
13347 }
13348 if (!Match) {
13349 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13350 DWordPairs.push_back(std::make_pair(M0, M1));
13351 }
13352 }
13353
13354 if (DWordPairs.size() <= 2) {
13355 DWordPairs.resize(2, std::make_pair(-1, -1));
13356 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13357 DWordPairs[1].first, DWordPairs[1].second};
13358 if ((NumHToL + NumHToH) == 0)
13359 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13360 if ((NumLToL + NumLToH) == 0)
13361 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13362 }
13363 }
13364
13365 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13366 // such inputs we can swap two of the dwords across the half mark and end up
13367 // with <=2 inputs to each half in each half. Once there, we can fall through
13368 // to the generic code below. For example:
13369 //
13370 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13371 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13372 //
13373 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13374 // and an existing 2-into-2 on the other half. In this case we may have to
13375 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13376 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13377 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13378 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13379 // half than the one we target for fixing) will be fixed when we re-enter this
13380 // path. We will also combine away any sequence of PSHUFD instructions that
13381 // result into a single instruction. Here is an example of the tricky case:
13382 //
13383 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13384 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13385 //
13386 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13387 //
13388 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13389 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13390 //
13391 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13392 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13393 //
13394 // The result is fine to be handled by the generic logic.
13395 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13396 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13397 int AOffset, int BOffset) {
13398 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13399 "Must call this with A having 3 or 1 inputs from the A half.");
13400 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13401 "Must call this with B having 1 or 3 inputs from the B half.");
13402 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13403 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13404
13405 bool ThreeAInputs = AToAInputs.size() == 3;
13406
13407 // Compute the index of dword with only one word among the three inputs in
13408 // a half by taking the sum of the half with three inputs and subtracting
13409 // the sum of the actual three inputs. The difference is the remaining
13410 // slot.
13411 int ADWord = 0, BDWord = 0;
13412 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13413 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13414 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13415 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13416 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13417 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13418 int TripleNonInputIdx =
13419 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13420 TripleDWord = TripleNonInputIdx / 2;
13421
13422 // We use xor with one to compute the adjacent DWord to whichever one the
13423 // OneInput is in.
13424 OneInputDWord = (OneInput / 2) ^ 1;
13425
13426 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13427 // and BToA inputs. If there is also such a problem with the BToB and AToB
13428 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13429 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13430 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13431 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13432 // Compute how many inputs will be flipped by swapping these DWords. We
13433 // need to balance this to ensure we don't form a 3-1 shuffle in the
13434 // other half.
13436 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
13437 llvm::count(AToBInputs, 2 * ADWord + 1);
13438 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
13439 llvm::count(BToBInputs, 2 * BDWord + 1);
13440 if ((NumFlippedAToBInputs == 1 &&
13441 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13442 (NumFlippedBToBInputs == 1 &&
13443 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13444 // We choose whether to fix the A half or B half based on whether that
13445 // half has zero flipped inputs. At zero, we may not be able to fix it
13446 // with that half. We also bias towards fixing the B half because that
13447 // will more commonly be the high half, and we have to bias one way.
13448 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13449 ArrayRef<int> Inputs) {
13450 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13451 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13452 // Determine whether the free index is in the flipped dword or the
13453 // unflipped dword based on where the pinned index is. We use this bit
13454 // in an xor to conditionally select the adjacent dword.
13455 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13456 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13457 if (IsFixIdxInput == IsFixFreeIdxInput)
13458 FixFreeIdx += 1;
13459 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13460 assert(IsFixIdxInput != IsFixFreeIdxInput &&
13461 "We need to be changing the number of flipped inputs!");
13462 int PSHUFHalfMask[] = {0, 1, 2, 3};
13463 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13464 V = DAG.getNode(
13465 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
13466 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
13467 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13468
13469 for (int &M : Mask)
13470 if (M >= 0 && M == FixIdx)
13471 M = FixFreeIdx;
13472 else if (M >= 0 && M == FixFreeIdx)
13473 M = FixIdx;
13474 };
13475 if (NumFlippedBToBInputs != 0) {
13476 int BPinnedIdx =
13477 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13478 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13479 } else {
13480 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13481 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13482 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13483 }
13484 }
13485 }
13486
13487 int PSHUFDMask[] = {0, 1, 2, 3};
13488 PSHUFDMask[ADWord] = BDWord;
13489 PSHUFDMask[BDWord] = ADWord;
13490 V = DAG.getBitcast(
13491 VT,
13492 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13493 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13494
13495 // Adjust the mask to match the new locations of A and B.
13496 for (int &M : Mask)
13497 if (M >= 0 && M/2 == ADWord)
13498 M = 2 * BDWord + M % 2;
13499 else if (M >= 0 && M/2 == BDWord)
13500 M = 2 * ADWord + M % 2;
13501
13502 // Recurse back into this routine to re-compute state now that this isn't
13503 // a 3 and 1 problem.
13504 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
13505 };
13506 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13507 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13508 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13509 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13510
13511 // At this point there are at most two inputs to the low and high halves from
13512 // each half. That means the inputs can always be grouped into dwords and
13513 // those dwords can then be moved to the correct half with a dword shuffle.
13514 // We use at most one low and one high word shuffle to collect these paired
13515 // inputs into dwords, and finally a dword shuffle to place them.
13516 int PSHUFLMask[4] = {-1, -1, -1, -1};
13517 int PSHUFHMask[4] = {-1, -1, -1, -1};
13518 int PSHUFDMask[4] = {-1, -1, -1, -1};
13519
13520 // First fix the masks for all the inputs that are staying in their
13521 // original halves. This will then dictate the targets of the cross-half
13522 // shuffles.
13523 auto fixInPlaceInputs =
13524 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
13525 MutableArrayRef<int> SourceHalfMask,
13526 MutableArrayRef<int> HalfMask, int HalfOffset) {
13527 if (InPlaceInputs.empty())
13528 return;
13529 if (InPlaceInputs.size() == 1) {
13530 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13531 InPlaceInputs[0] - HalfOffset;
13532 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13533 return;
13534 }
13535 if (IncomingInputs.empty()) {
13536 // Just fix all of the in place inputs.
13537 for (int Input : InPlaceInputs) {
13538 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13539 PSHUFDMask[Input / 2] = Input / 2;
13540 }
13541 return;
13542 }
13543
13544 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13545 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13546 InPlaceInputs[0] - HalfOffset;
13547 // Put the second input next to the first so that they are packed into
13548 // a dword. We find the adjacent index by toggling the low bit.
13549 int AdjIndex = InPlaceInputs[0] ^ 1;
13550 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13551 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13552 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13553 };
13554 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13555 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13556
13557 // Now gather the cross-half inputs and place them into a free dword of
13558 // their target half.
13559 // FIXME: This operation could almost certainly be simplified dramatically to
13560 // look more like the 3-1 fixing operation.
13561 auto moveInputsToRightHalf = [&PSHUFDMask](
13562 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
13563 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
13564 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
13565 int DestOffset) {
13566 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13567 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13568 };
13569 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13570 int Word) {
13571 int LowWord = Word & ~1;
13572 int HighWord = Word | 1;
13573 return isWordClobbered(SourceHalfMask, LowWord) ||
13574 isWordClobbered(SourceHalfMask, HighWord);
13575 };
13576
13577 if (IncomingInputs.empty())
13578 return;
13579
13580 if (ExistingInputs.empty()) {
13581 // Map any dwords with inputs from them into the right half.
13582 for (int Input : IncomingInputs) {
13583 // If the source half mask maps over the inputs, turn those into
13584 // swaps and use the swapped lane.
13585 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13586 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13587 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13588 Input - SourceOffset;
13589 // We have to swap the uses in our half mask in one sweep.
13590 for (int &M : HalfMask)
13591 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13592 M = Input;
13593 else if (M == Input)
13594 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13595 } else {
13596 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13597 Input - SourceOffset &&
13598 "Previous placement doesn't match!");
13599 }
13600 // Note that this correctly re-maps both when we do a swap and when
13601 // we observe the other side of the swap above. We rely on that to
13602 // avoid swapping the members of the input list directly.
13603 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13604 }
13605
13606 // Map the input's dword into the correct half.
13607 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13608 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13609 else
13610 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13611 Input / 2 &&
13612 "Previous placement doesn't match!");
13613 }
13614
13615 // And just directly shift any other-half mask elements to be same-half
13616 // as we will have mirrored the dword containing the element into the
13617 // same position within that half.
13618 for (int &M : HalfMask)
13619 if (M >= SourceOffset && M < SourceOffset + 4) {
13620 M = M - SourceOffset + DestOffset;
13621 assert(M >= 0 && "This should never wrap below zero!");
13622 }
13623 return;
13624 }
13625
13626 // Ensure we have the input in a viable dword of its current half. This
13627 // is particularly tricky because the original position may be clobbered
13628 // by inputs being moved and *staying* in that half.
13629 if (IncomingInputs.size() == 1) {
13630 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13631 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13632 SourceOffset;
13633 SourceHalfMask[InputFixed - SourceOffset] =
13634 IncomingInputs[0] - SourceOffset;
13635 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13636 InputFixed);
13637 IncomingInputs[0] = InputFixed;
13638 }
13639 } else if (IncomingInputs.size() == 2) {
13640 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13641 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13642 // We have two non-adjacent or clobbered inputs we need to extract from
13643 // the source half. To do this, we need to map them into some adjacent
13644 // dword slot in the source mask.
13645 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13646 IncomingInputs[1] - SourceOffset};
13647
13648 // If there is a free slot in the source half mask adjacent to one of
13649 // the inputs, place the other input in it. We use (Index XOR 1) to
13650 // compute an adjacent index.
13651 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
13652 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
13653 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
13654 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13655 InputsFixed[1] = InputsFixed[0] ^ 1;
13656 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
13657 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
13658 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
13659 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
13660 InputsFixed[0] = InputsFixed[1] ^ 1;
13661 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
13662 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
13663 // The two inputs are in the same DWord but it is clobbered and the
13664 // adjacent DWord isn't used at all. Move both inputs to the free
13665 // slot.
13666 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
13667 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
13668 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
13669 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
13670 } else {
13671 // The only way we hit this point is if there is no clobbering
13672 // (because there are no off-half inputs to this half) and there is no
13673 // free slot adjacent to one of the inputs. In this case, we have to
13674 // swap an input with a non-input.
13675 for (int i = 0; i < 4; ++i)
13676 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13677 "We can't handle any clobbers here!");
13678 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13679 "Cannot have adjacent inputs here!");
13680
13681 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13682 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
13683
13684 // We also have to update the final source mask in this case because
13685 // it may need to undo the above swap.
13686 for (int &M : FinalSourceHalfMask)
13687 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
13688 M = InputsFixed[1] + SourceOffset;
13689 else if (M == InputsFixed[1] + SourceOffset)
13690 M = (InputsFixed[0] ^ 1) + SourceOffset;
13691
13692 InputsFixed[1] = InputsFixed[0] ^ 1;
13693 }
13694
13695 // Point everything at the fixed inputs.
13696 for (int &M : HalfMask)
13697 if (M == IncomingInputs[0])
13698 M = InputsFixed[0] + SourceOffset;
13699 else if (M == IncomingInputs[1])
13700 M = InputsFixed[1] + SourceOffset;
13701
13702 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
13703 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
13704 }
13705 } else {
13706 llvm_unreachable("Unhandled input size!");
13707 }
13708
13709 // Now hoist the DWord down to the right half.
13710 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
13711 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
13712 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
13713 for (int &M : HalfMask)
13714 for (int Input : IncomingInputs)
13715 if (M == Input)
13716 M = FreeDWord * 2 + Input % 2;
13717 };
13718 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
13719 /*SourceOffset*/ 4, /*DestOffset*/ 0);
13720 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
13721 /*SourceOffset*/ 0, /*DestOffset*/ 4);
13722
13723 // Now enact all the shuffles we've computed to move the inputs into their
13724 // target half.
13725 if (!isNoopShuffleMask(PSHUFLMask))
13726 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13727 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
13728 if (!isNoopShuffleMask(PSHUFHMask))
13729 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13730 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
13731 if (!isNoopShuffleMask(PSHUFDMask))
13732 V = DAG.getBitcast(
13733 VT,
13734 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13735 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13736
13737 // At this point, each half should contain all its inputs, and we can then
13738 // just shuffle them into their final position.
13739 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
13740 "Failed to lift all the high half inputs to the low mask!");
13741 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
13742 "Failed to lift all the low half inputs to the high mask!");
13743
13744 // Do a half shuffle for the low mask.
13745 if (!isNoopShuffleMask(LoMask))
13746 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13747 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13748
13749 // Do a half shuffle with the high mask after shifting its values down.
13750 for (int &M : HiMask)
13751 if (M >= 0)
13752 M -= 4;
13753 if (!isNoopShuffleMask(HiMask))
13754 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13755 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13756
13757 return V;
13758}
13759
13760/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
13761/// blend if only one input is used.
13762 static SDValue lowerShuffleAsBlendOfPSHUFBs(
13763 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13764 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
13766 "Lane crossing shuffle masks not supported");
13767
13768 int NumBytes = VT.getSizeInBits() / 8;
13769 int Size = Mask.size();
13770 int Scale = NumBytes / Size;
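// For example, with VT = MVT::v8i16 we have NumBytes = 16, Size = 8 and
// Scale = 2, so each mask element expands into two consecutive byte
// selectors (M * Scale and M * Scale + 1).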
13771
13772 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13773 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13774 V1InUse = false;
13775 V2InUse = false;
13776
13777 for (int i = 0; i < NumBytes; ++i) {
13778 int M = Mask[i / Scale];
13779 if (M < 0)
13780 continue;
13781
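// PSHUFB treats a selector byte with bit 7 set as "write zero", so 0x80
// below marks lanes that must not be taken from the corresponding input.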
13782 const int ZeroMask = 0x80;
13783 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
13784 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
13785 if (Zeroable[i / Scale])
13786 V1Idx = V2Idx = ZeroMask;
13787
13788 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
13789 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
13790 V1InUse |= (ZeroMask != V1Idx);
13791 V2InUse |= (ZeroMask != V2Idx);
13792 }
13793
13794 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
13795 if (V1InUse)
13796 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
13797 DAG.getBuildVector(ShufVT, DL, V1Mask));
13798 if (V2InUse)
13799 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
13800 DAG.getBuildVector(ShufVT, DL, V2Mask));
13801
13802 // If we need shuffled inputs from both, blend the two.
13803 SDValue V;
13804 if (V1InUse && V2InUse)
13805 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
13806 else
13807 V = V1InUse ? V1 : V2;
13808
13809 // Cast the result back to the correct type.
13810 return DAG.getBitcast(VT, V);
13811}
13812
13813/// Generic lowering of 8-lane i16 shuffles.
13814///
13815/// This handles both single-input shuffles and combined shuffle/blends with
13816/// two inputs. The single input shuffles are immediately delegated to
13817/// a dedicated lowering routine.
13818///
13819/// The blends are lowered in one of three fundamental ways. If there are few
13820/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
13821/// of the input is significantly cheaper when lowered as an interleaving of
13822/// the two inputs, try to interleave them. Otherwise, blend the low and high
13823/// halves of the inputs separately (making them have relatively few inputs)
13824/// and then concatenate them.
13825 static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13826 const APInt &Zeroable, SDValue V1, SDValue V2,
13827 const X86Subtarget &Subtarget,
13828 SelectionDAG &DAG) {
13829 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13830 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13831 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13832
13833 // Whenever we can lower this as a zext, that instruction is strictly faster
13834 // than any alternative.
13835 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
13836 Zeroable, Subtarget, DAG))
13837 return ZExt;
13838
13839 // Try to lower using a truncation.
13840 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13841 Subtarget, DAG))
13842 return V;
13843
13844 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
13845
13846 if (NumV2Inputs == 0) {
13847 // Try to use shift instructions.
13848 if (SDValue Shift =
13849 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
13850 Subtarget, DAG, /*BitwiseOnly*/ false))
13851 return Shift;
13852
13853 // Check for being able to broadcast a single element.
13854 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
13855 Mask, Subtarget, DAG))
13856 return Broadcast;
13857
13858 // Try to use bit rotation instructions.
13859 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
13860 Subtarget, DAG))
13861 return Rotate;
13862
13863 // Use dedicated unpack instructions for masks that match their pattern.
13864 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13865 return V;
13866
13867 // Use dedicated pack instructions for masks that match their pattern.
13868 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13869 Subtarget))
13870 return V;
13871
13872 // Try to use byte rotation instructions.
13873 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
13874 Subtarget, DAG))
13875 return Rotate;
13876
13877 // Make a copy of the mask so it can be modified.
13878 SmallVector<int, 8> MutableMask(Mask);
13879 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
13880 Subtarget, DAG);
13881 }
13882
13883 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
13884 "All single-input shuffles should be canonicalized to be V1-input "
13885 "shuffles.");
13886
13887 // Try to use shift instructions.
13888 if (SDValue Shift =
13889 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
13890 DAG, /*BitwiseOnly*/ false))
13891 return Shift;
13892
13893 // See if we can use SSE4A Extraction / Insertion.
13894 if (Subtarget.hasSSE4A())
13895 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
13896 Zeroable, DAG))
13897 return V;
13898
13899 // There are special ways we can lower some single-element blends.
13900 if (NumV2Inputs == 1)
13901 if (SDValue V = lowerShuffleAsElementInsertion(
13902 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13903 return V;
13904
13905 // We have different paths for blend lowering, but they all must use the
13906 // *exact* same predicate.
13907 bool IsBlendSupported = Subtarget.hasSSE41();
13908 if (IsBlendSupported)
13909 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
13910 Zeroable, Subtarget, DAG))
13911 return Blend;
13912
13913 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
13914 Zeroable, Subtarget, DAG))
13915 return Masked;
13916
13917 // Use dedicated unpack instructions for masks that match their pattern.
13918 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13919 return V;
13920
13921 // Use dedicated pack instructions for masks that match their pattern.
13922 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13923 Subtarget))
13924 return V;
13925
13926 // Try to lower using a truncation.
13927 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13928 Subtarget, DAG))
13929 return V;
13930
13931 // Try to use byte rotation instructions.
13932 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
13933 Subtarget, DAG))
13934 return Rotate;
13935
13936 if (SDValue BitBlend =
13937 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
13938 return BitBlend;
13939
13940 // Try to use byte shift instructions to mask.
13941 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
13942 Zeroable, Subtarget, DAG))
13943 return V;
13944
13945 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
13946 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
13947 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
13948 !Subtarget.hasVLX()) {
13949 // Check if this is part of a 256-bit vector truncation.
13950 unsigned PackOpc = 0;
13951 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
13952 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13953 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
13954 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
13955 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
13956 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
13957 DAG.getTargetConstant(0xEE, DL, MVT::i8));
13958 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
13959 V1 = extract128BitVector(V1V2, 0, DAG, DL);
13960 V2 = extract128BitVector(V1V2, 4, DAG, DL);
13961 PackOpc = X86ISD::PACKUS;
13962 } else if (Subtarget.hasSSE41()) {
13963 SmallVector<SDValue, 4> DWordClearOps(4,
13964 DAG.getConstant(0, DL, MVT::i32));
13965 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
13966 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
13967 SDValue DWordClearMask =
13968 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
13969 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
13970 DWordClearMask);
13971 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
13972 DWordClearMask);
13973 PackOpc = X86ISD::PACKUS;
13974 } else if (!Subtarget.hasSSSE3()) {
13975 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
13976 V1 = DAG.getBitcast(MVT::v4i32, V1);
13977 V2 = DAG.getBitcast(MVT::v4i32, V2);
13978 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
13979 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
13980 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
13981 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
13982 PackOpc = X86ISD::PACKSS;
13983 }
13984 if (PackOpc) {
13985 // Now pack things back together.
13986 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
13987 if (NumEvenDrops == 2) {
13988 Result = DAG.getBitcast(MVT::v4i32, Result);
13989 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
13990 }
13991 return Result;
13992 }
13993 }
13994
13995 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
13996 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
13997 if (NumOddDrops == 1) {
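// A logical right shift by 16 leaves each dword in [0, 0xFFFF], so SSE41's
// unsigned-saturating PACKUSDW passes the odd word through unchanged; without
// SSE41, an arithmetic shift preserves the sign so the signed-saturating
// PACKSSDW does the same.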
13998 bool HasSSE41 = Subtarget.hasSSE41();
13999 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14000 DAG.getBitcast(MVT::v4i32, V1),
14001 DAG.getTargetConstant(16, DL, MVT::i8));
14002 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14003 DAG.getBitcast(MVT::v4i32, V2),
14004 DAG.getTargetConstant(16, DL, MVT::i8));
14005 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14006 MVT::v8i16, V1, V2);
14007 }
14008
14009 // Try to lower by permuting the inputs into an unpack instruction.
14010 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14011 Mask, Subtarget, DAG))
14012 return Unpack;
14013
14014 // If we can't directly blend but can use PSHUFB, that will be better as it
14015 // can both shuffle and set up the inefficient blend.
14016 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14017 bool V1InUse, V2InUse;
14018 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14019 Zeroable, DAG, V1InUse, V2InUse);
14020 }
14021
14022 // We can always bit-blend if we have to so the fallback strategy is to
14023 // decompose into single-input permutes and blends/unpacks.
14024 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
14025 Mask, Subtarget, DAG);
14026}
14027
14028/// Lower 8-lane 16-bit floating point shuffles.
14029 static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14030 const APInt &Zeroable, SDValue V1, SDValue V2,
14031 const X86Subtarget &Subtarget,
14032 SelectionDAG &DAG) {
14033 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14034 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14035 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14036 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14037
14038 if (Subtarget.hasFP16()) {
14039 if (NumV2Elements == 0) {
14040 // Check for being able to broadcast a single element.
14041 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14042 Mask, Subtarget, DAG))
14043 return Broadcast;
14044 }
14045 if (NumV2Elements == 1 && Mask[0] >= 8)
14046 if (SDValue V = lowerShuffleAsElementInsertion(
14047 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14048 return V;
14049 }
14050
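// All remaining v8f16 cases are handled in the integer domain via the
// equivalent v8i16 shuffle.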
14051 V1 = DAG.getBitcast(MVT::v8i16, V1);
14052 V2 = DAG.getBitcast(MVT::v8i16, V2);
14053 return DAG.getBitcast(MVT::v8f16,
14054 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14055}
14056
14057// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
14058// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14059// the active subvector is extracted.
14060 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14061 ArrayRef<int> Mask, SDValue V1, SDValue V2,
14062 const X86Subtarget &Subtarget,
14063 SelectionDAG &DAG) {
14064 MVT MaskVT = VT.changeTypeToInteger();
14065 SDValue MaskNode;
14066 MVT ShuffleVT = VT;
14067 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14068 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14069 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14070 ShuffleVT = V1.getSimpleValueType();
14071
14072 // Adjust mask to correct indices for the second input.
14073 int NumElts = VT.getVectorNumElements();
14074 unsigned Scale = 512 / VT.getSizeInBits();
14075 SmallVector<int, 32> AdjustedMask(Mask);
14076 for (int &M : AdjustedMask)
14077 if (NumElts <= M)
14078 M += (Scale - 1) * NumElts;
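// e.g. (illustrative) for VT = MVT::v16i8 on a non-VLX target, Scale = 4 and
// NumElts = 16, so a V2 index of 16 becomes 64, the first byte of the
// widened V2 half in the VPERMV3 operand concatenation.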
14079 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14080 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14081 } else {
14082 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14083 }
14084
14085 SDValue Result;
14086 if (V2.isUndef())
14087 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14088 else
14089 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14090
14091 if (VT != ShuffleVT)
14092 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14093
14094 return Result;
14095}
14096
14097/// Generic lowering of v16i8 shuffles.
14098///
14099/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14100/// detect any complexity reducing interleaving. If that doesn't help, it uses
14101/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14102/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14103/// back together.
14104 static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14105 const APInt &Zeroable, SDValue V1, SDValue V2,
14106 const X86Subtarget &Subtarget,
14107 SelectionDAG &DAG) {
14108 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14109 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14110 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14111
14112 // Try to use shift instructions.
14113 if (SDValue Shift =
14114 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14115 DAG, /*BitwiseOnly*/ false))
14116 return Shift;
14117
14118 // Try to use byte rotation instructions.
14119 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14120 Subtarget, DAG))
14121 return Rotate;
14122
14123 // Use dedicated pack instructions for masks that match their pattern.
14124 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
14125 Subtarget))
14126 return V;
14127
14128 // Try to use a zext lowering.
14129 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14130 Zeroable, Subtarget, DAG))
14131 return ZExt;
14132
14133 // Try to lower using a truncation.
14134 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14135 Subtarget, DAG))
14136 return V;
14137
14138 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14139 Subtarget, DAG))
14140 return V;
14141
14142 // See if we can use SSE4A Extraction / Insertion.
14143 if (Subtarget.hasSSE4A())
14144 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14145 Zeroable, DAG))
14146 return V;
14147
14148 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14149
14150 // For single-input shuffles, there are some nicer lowering tricks we can use.
14151 if (NumV2Elements == 0) {
14152 // Check for being able to broadcast a single element.
14153 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14154 Mask, Subtarget, DAG))
14155 return Broadcast;
14156
14157 // Try to use bit rotation instructions.
14158 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14159 Subtarget, DAG))
14160 return Rotate;
14161
14162 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14163 return V;
14164
14165 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14166 // Notably, this handles splat and partial-splat shuffles more efficiently.
14167 // However, it only makes sense if the pre-duplication shuffle simplifies
14168 // things significantly. Currently, this means we need to be able to
14169 // express the pre-duplication shuffle as an i16 shuffle.
14170 //
14171 // FIXME: We should check for other patterns which can be widened into an
14172 // i16 shuffle as well.
14173 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14174 for (int i = 0; i < 16; i += 2)
14175 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14176 return false;
14177
14178 return true;
14179 };
14180 auto tryToWidenViaDuplication = [&]() -> SDValue {
14181 if (!canWidenViaDuplication(Mask))
14182 return SDValue();
14183 SmallVector<int, 4> LoInputs;
14184 copy_if(Mask, std::back_inserter(LoInputs),
14185 [](int M) { return M >= 0 && M < 8; });
14186 array_pod_sort(LoInputs.begin(), LoInputs.end());
14187 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
14188 LoInputs.end());
14189 SmallVector<int, 4> HiInputs;
14190 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14191 array_pod_sort(HiInputs.begin(), HiInputs.end());
14192 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
14193 HiInputs.end());
14194
14195 bool TargetLo = LoInputs.size() >= HiInputs.size();
14196 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14197 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14198
14199 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14200 SmallDenseMap<int, int, 8> LaneMap;
14201 for (int I : InPlaceInputs) {
14202 PreDupI16Shuffle[I/2] = I/2;
14203 LaneMap[I] = I;
14204 }
14205 int j = TargetLo ? 0 : 4, je = j + 4;
14206 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14207 // Check if j is already a shuffle of this input. This happens when
14208 // there are two adjacent bytes after we move the low one.
14209 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14210 // If we haven't yet mapped the input, search for a slot into which
14211 // we can map it.
14212 while (j < je && PreDupI16Shuffle[j] >= 0)
14213 ++j;
14214
14215 if (j == je)
14216 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14217 return SDValue();
14218
14219 // Map this input with the i16 shuffle.
14220 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14221 }
14222
14223 // Update the lane map based on the mapping we ended up with.
14224 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14225 }
14226 V1 = DAG.getBitcast(
14227 MVT::v16i8,
14228 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14229 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14230
14231 // Unpack the bytes to form the i16s that will be shuffled into place.
14232 bool EvenInUse = false, OddInUse = false;
14233 for (int i = 0; i < 16; i += 2) {
14234 EvenInUse |= (Mask[i + 0] >= 0);
14235 OddInUse |= (Mask[i + 1] >= 0);
14236 if (EvenInUse && OddInUse)
14237 break;
14238 }
14239 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14240 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14241 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
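// The unpack above pairs each selected byte with itself (or with undef), so
// every i16 lane now carries its byte duplicated and the following i16
// shuffle can place it regardless of which half of the lane is read.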
14242
14243 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14244 for (int i = 0; i < 16; ++i)
14245 if (Mask[i] >= 0) {
14246 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14247 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14248 if (PostDupI16Shuffle[i / 2] < 0)
14249 PostDupI16Shuffle[i / 2] = MappedMask;
14250 else
14251 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14252 "Conflicting entries in the original shuffle!");
14253 }
14254 return DAG.getBitcast(
14255 MVT::v16i8,
14256 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14257 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14258 };
14259 if (SDValue V = tryToWidenViaDuplication())
14260 return V;
14261 }
14262
14263 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14264 Zeroable, Subtarget, DAG))
14265 return Masked;
14266
14267 // Use dedicated unpack instructions for masks that match their pattern.
14268 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14269 return V;
14270
14271 // Try to use byte shift instructions to mask.
14272 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14273 Zeroable, Subtarget, DAG))
14274 return V;
14275
14276 // Check for compaction patterns.
14277 bool IsSingleInput = V2.isUndef();
14278 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14279
14280 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14281 // with PSHUFB. It is important to do this before we attempt to generate any
14282 // blends but after all of the single-input lowerings. If the single input
14283 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14284 // want to preserve that and we can DAG combine any longer sequences into
14285 // a PSHUFB in the end. But once we start blending from multiple inputs,
14286 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14287 // and there are *very* few patterns that would actually be faster than the
14288 // PSHUFB approach because of its ability to zero lanes.
14289 //
14290 // If the mask is a binary compaction, we can more efficiently perform this
14291 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14292 //
14293 // FIXME: The only exceptions to the above are blends which are exact
14294 // interleavings with direct instructions supporting them. We currently don't
14295 // handle those well here.
14296 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14297 bool V1InUse = false;
14298 bool V2InUse = false;
14299
14300 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14301 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14302
14303 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14304 // do so. This avoids using them to handle blends-with-zero which is
14305 // important as a single pshufb is significantly faster for that.
14306 if (V1InUse && V2InUse) {
14307 if (Subtarget.hasSSE41())
14308 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14309 Zeroable, Subtarget, DAG))
14310 return Blend;
14311
14312 // We can use an unpack to do the blending rather than an or in some
14313 // cases. Even though the or may be (very minorly) more efficient, we
14314 // prefer this lowering because there are common cases where part of
14315 // the complexity of the shuffles goes away when we do the final blend as
14316 // an unpack.
14317 // FIXME: It might be worth trying to detect if the unpack-feeding
14318 // shuffles will both be pshufb, in which case we shouldn't bother with
14319 // this.
14320 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14321 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14322 return Unpack;
14323
14324 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14325 if (Subtarget.hasVBMI())
14326 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14327 DAG);
14328
14329 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14330 if (Subtarget.hasXOP()) {
14331 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14332 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14333 }
14334
14335 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14336 // PALIGNR will be cheaper than the second PSHUFB+OR.
14337 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14338 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14339 return V;
14340 }
14341
14342 return PSHUFB;
14343 }
14344
14345 // There are special ways we can lower some single-element blends.
14346 if (NumV2Elements == 1)
14347 if (SDValue V = lowerShuffleAsElementInsertion(
14348 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14349 return V;
14350
14351 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14352 return Blend;
14353
14354 // Check whether a compaction lowering can be done. This handles shuffles
14355 // which take every Nth element for some even N. See the helper function for
14356 // details.
14357 //
14358 // We special case these as they can be particularly efficiently handled with
14359 // the PACKUSWB instruction on x86 and they show up in common patterns of
14360 // rearranging bytes to truncate wide elements.
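// For example, a mask that keeps every second byte (NumEvenDrops == 1) is
// handled below by clearing the high byte of each i16 word and compacting
// the result with a single PACKUSWB.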
14361 if (NumEvenDrops) {
14362 // NumEvenDrops is the power of two stride of the elements. Another way of
14363 // thinking about it is that we need to drop the even elements this many
14364 // times to get the original input.
14365
14366 // First we need to zero all the dropped bytes.
14367 assert(NumEvenDrops <= 3 &&
14368 "No support for dropping even elements more than 3 times.");
14369 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14370 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14371 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14372 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14373 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14374 WordClearMask);
14375 if (!IsSingleInput)
14376 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14377 WordClearMask);
14378
14379 // Now pack things back together.
14380 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14381 IsSingleInput ? V1 : V2);
14382 for (int i = 1; i < NumEvenDrops; ++i) {
14383 Result = DAG.getBitcast(MVT::v8i16, Result);
14384 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14385 }
14386 return Result;
14387 }
14388
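// Similarly, a mask that keeps every second byte starting at byte 1
// (NumOddDrops == 1) is handled below by shifting each i16 word right by 8
// bits and compacting with PACKUSWB.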
14389 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14390 if (NumOddDrops == 1) {
14391 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14392 DAG.getBitcast(MVT::v8i16, V1),
14393 DAG.getTargetConstant(8, DL, MVT::i8));
14394 if (!IsSingleInput)
14395 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14396 DAG.getBitcast(MVT::v8i16, V2),
14397 DAG.getTargetConstant(8, DL, MVT::i8));
14398 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14399 IsSingleInput ? V1 : V2);
14400 }
14401
14402 // Handle multi-input cases by blending/unpacking single-input shuffles.
14403 if (NumV2Elements > 0)
14404 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
14405 Subtarget, DAG);
14406
14407 // The fallback path for single-input shuffles widens this into two v8i16
14408 // vectors with unpacks, shuffles those, and then pulls them back together
14409 // with a pack.
14410 SDValue V = V1;
14411
14412 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14413 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14414 for (int i = 0; i < 16; ++i)
14415 if (Mask[i] >= 0)
14416 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14417
14418 SDValue VLoHalf, VHiHalf;
14419 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14420 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14421 // i16s.
14422 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14423 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14424 // Use a mask to drop the high bytes.
14425 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14426 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14427 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14428
14429 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14430 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14431
14432 // Squash the masks to point directly into VLoHalf.
14433 for (int &M : LoBlendMask)
14434 if (M >= 0)
14435 M /= 2;
14436 for (int &M : HiBlendMask)
14437 if (M >= 0)
14438 M /= 2;
14439 } else {
14440 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14441 // VHiHalf so that we can blend them as i16s.
14442 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14443
14444 VLoHalf = DAG.getBitcast(
14445 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14446 VHiHalf = DAG.getBitcast(
14447 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14448 }
14449
14450 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14451 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14452
14453 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14454}
14455
14456/// Dispatching routine to lower various 128-bit x86 vector shuffles.
14457///
14458/// This routine breaks down the specific type of 128-bit shuffle and
14459/// dispatches to the lowering routines accordingly.
14460 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14461 MVT VT, SDValue V1, SDValue V2,
14462 const APInt &Zeroable,
14463 const X86Subtarget &Subtarget,
14464 SelectionDAG &DAG) {
14465 if (VT == MVT::v8bf16) {
14466 V1 = DAG.getBitcast(MVT::v8i16, V1);
14467 V2 = DAG.getBitcast(MVT::v8i16, V2);
14468 return DAG.getBitcast(VT,
14469 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14470 }
14471
14472 switch (VT.SimpleTy) {
14473 case MVT::v2i64:
14474 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14475 case MVT::v2f64:
14476 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14477 case MVT::v4i32:
14478 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14479 case MVT::v4f32:
14480 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14481 case MVT::v8i16:
14482 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14483 case MVT::v8f16:
14484 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14485 case MVT::v16i8:
14486 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14487
14488 default:
14489 llvm_unreachable("Unimplemented!");
14490 }
14491}
14492
14493/// Generic routine to split vector shuffle into half-sized shuffles.
14494///
14495/// This routine just extracts two subvectors, shuffles them independently, and
14496/// then concatenates them back together. This should work effectively with all
14497/// AVX vector shuffle types.
14498 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14499 SDValue V2, ArrayRef<int> Mask,
14500 SelectionDAG &DAG, bool SimpleOnly) {
14501 assert(VT.getSizeInBits() >= 256 &&
14502 "Only for 256-bit or wider vector shuffles!");
14503 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14504 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14505
14506 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14507 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14508
14509 int NumElements = VT.getVectorNumElements();
14510 int SplitNumElements = NumElements / 2;
14511 MVT ScalarVT = VT.getVectorElementType();
14512 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
14513
14514 // Use splitVector/extractSubVector so that split build-vectors just build two
14515 // narrower build vectors. This helps shuffling with splats and zeros.
14516 auto SplitVector = [&](SDValue V) {
14517 SDValue LoV, HiV;
14518 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
14519 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14520 DAG.getBitcast(SplitVT, HiV));
14521 };
14522
14523 SDValue LoV1, HiV1, LoV2, HiV2;
14524 std::tie(LoV1, HiV1) = SplitVector(V1);
14525 std::tie(LoV2, HiV2) = SplitVector(V2);
14526
14527 // Now create two 4-way blends of these half-width vectors.
14528 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
14529 bool &UseHiV1, bool &UseLoV2,
14530 bool &UseHiV2) {
14531 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
14532 for (int i = 0; i < SplitNumElements; ++i) {
14533 int M = HalfMask[i];
14534 if (M >= NumElements) {
14535 if (M >= NumElements + SplitNumElements)
14536 UseHiV2 = true;
14537 else
14538 UseLoV2 = true;
14539 } else if (M >= 0) {
14540 if (M >= SplitNumElements)
14541 UseHiV1 = true;
14542 else
14543 UseLoV1 = true;
14544 }
14545 }
14546 };
14547
14548 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14549 if (!SimpleOnly)
14550 return true;
14551
14552 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14553 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14554
14555 return !(UseHiV1 || UseHiV2);
14556 };
14557
14558 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14559 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14560 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14561 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14562 for (int i = 0; i < SplitNumElements; ++i) {
14563 int M = HalfMask[i];
14564 if (M >= NumElements) {
14565 V2BlendMask[i] = M - NumElements;
14566 BlendMask[i] = SplitNumElements + i;
14567 } else if (M >= 0) {
14568 V1BlendMask[i] = M;
14569 BlendMask[i] = i;
14570 }
14571 }
14572
14573 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14574 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14575
14576 // Because the lowering happens after all combining takes place, we need to
14577 // manually combine these blend masks as much as possible so that we create
14578 // a minimal number of high-level vector shuffle nodes.
14579 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
14580
14581 // First try just blending the halves of V1 or V2.
14582 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14583 return DAG.getUNDEF(SplitVT);
14584 if (!UseLoV2 && !UseHiV2)
14585 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14586 if (!UseLoV1 && !UseHiV1)
14587 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14588
14589 SDValue V1Blend, V2Blend;
14590 if (UseLoV1 && UseHiV1) {
14591 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14592 } else {
14593 // We only use half of V1 so map the usage down into the final blend mask.
14594 V1Blend = UseLoV1 ? LoV1 : HiV1;
14595 for (int i = 0; i < SplitNumElements; ++i)
14596 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14597 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14598 }
14599 if (UseLoV2 && UseHiV2) {
14600 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14601 } else {
14602 // We only use half of V2 so map the usage down into the final blend mask.
14603 V2Blend = UseLoV2 ? LoV2 : HiV2;
14604 for (int i = 0; i < SplitNumElements; ++i)
14605 if (BlendMask[i] >= SplitNumElements)
14606 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
14607 }
14608 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
14609 };
14610
14611 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
14612 return SDValue();
14613
14614 SDValue Lo = HalfBlend(LoMask);
14615 SDValue Hi = HalfBlend(HiMask);
14616 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14617}
14618
14619/// Either split a vector in halves or decompose the shuffles and the
14620/// blend/unpack.
14621///
14622/// This is provided as a good fallback for many lowerings of non-single-input
14623/// shuffles with more than one 128-bit lane. In those cases, we want to select
14624/// between splitting the shuffle into 128-bit components and stitching those
14625/// back together vs. extracting the single-input shuffles and blending those
14626/// results.
14627 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14628 SDValue V2, ArrayRef<int> Mask,
14629 const X86Subtarget &Subtarget,
14630 SelectionDAG &DAG) {
14631 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14632 "shuffles as it could then recurse on itself.");
14633 int Size = Mask.size();
14634
14635 // If this can be modeled as a broadcast of two elements followed by a blend,
14636 // prefer that lowering. This is especially important because broadcasts can
14637 // often fold with memory operands.
14638 auto DoBothBroadcast = [&] {
14639 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14640 for (int M : Mask)
14641 if (M >= Size) {
14642 if (V2BroadcastIdx < 0)
14643 V2BroadcastIdx = M - Size;
14644 else if (M - Size != V2BroadcastIdx)
14645 return false;
14646 } else if (M >= 0) {
14647 if (V1BroadcastIdx < 0)
14648 V1BroadcastIdx = M;
14649 else if (M != V1BroadcastIdx)
14650 return false;
14651 }
14652 return true;
14653 };
14654 if (DoBothBroadcast())
14655 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14656 DAG);
14657
14658 // If the inputs all stem from a single 128-bit lane of each input, then we
14659 // split them rather than blending because the split will decompose to
14660 // unusually few instructions.
14661 int LaneCount = VT.getSizeInBits() / 128;
14662 int LaneSize = Size / LaneCount;
14663 SmallBitVector LaneInputs[2];
14664 LaneInputs[0].resize(LaneCount, false);
14665 LaneInputs[1].resize(LaneCount, false);
14666 for (int i = 0; i < Size; ++i)
14667 if (Mask[i] >= 0)
14668 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
14669 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
14670 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14671 /*SimpleOnly*/ false);
14672
14673 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
14674 // requires that the decomposed single-input shuffles don't end up here.
14675 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14676 DAG);
14677}
14678
14679// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14680// TODO: Extend to support v8f32 (+ 512-bit shuffles).
14681 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
14682 SDValue V1, SDValue V2,
14683 ArrayRef<int> Mask,
14684 SelectionDAG &DAG) {
14685 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
14686
14687 int LHSMask[4] = {-1, -1, -1, -1};
14688 int RHSMask[4] = {-1, -1, -1, -1};
14689 unsigned SHUFPMask = 0;
14690
14691 // As SHUFPD uses a single LHS/RHS element per lane, we can always
14692 // perform the shuffle once the lanes have been shuffled in place.
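// For example, the v4f64 mask <0,7,3,6> produces LHSMask <0,u,u,3>,
// RHSMask <u,7,6,u> and a SHUFPD immediate of 0b0110.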
14693 for (int i = 0; i != 4; ++i) {
14694 int M = Mask[i];
14695 if (M < 0)
14696 continue;
14697 int LaneBase = i & ~1;
14698 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
14699 LaneMask[LaneBase + (M & 1)] = M;
14700 SHUFPMask |= (M & 1) << i;
14701 }
14702
14703 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
14704 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
14705 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
14706 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
14707}
14708
14709/// Lower a vector shuffle crossing multiple 128-bit lanes as
14710/// a lane permutation followed by a per-lane permutation.
14711///
14712/// This is mainly for cases where we can have non-repeating permutes
14713/// in each lane.
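/// For example, the single-input v8f32 reversal <7,6,5,4,3,2,1,0> can be
/// lowered as a 128-bit lane swap followed by the in-lane permute
/// <3,2,1,0,7,6,5,4>.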
14714///
14715/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
14716/// we should investigate merging them.
14717 static SDValue lowerShuffleAsLanePermuteAndPermute(
14718 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14719 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14720 int NumElts = VT.getVectorNumElements();
14721 int NumLanes = VT.getSizeInBits() / 128;
14722 int NumEltsPerLane = NumElts / NumLanes;
14723 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
14724
14725 /// Attempts to find a sublane permute with the given size
14726 /// that gets all elements into their target lanes.
14727 ///
14728 /// If successful, fills CrossLaneMask and InLaneMask and returns true.
14729 /// If unsuccessful, returns false and may overwrite InLaneMask.
14730 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
14731 int NumSublanesPerLane = NumSublanes / NumLanes;
14732 int NumEltsPerSublane = NumElts / NumSublanes;
14733
14734 SmallVector<int, 16> CrossLaneMask;
14735 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
14736 // CrossLaneMask but one entry == one sublane.
14737 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
14738
14739 for (int i = 0; i != NumElts; ++i) {
14740 int M = Mask[i];
14741 if (M < 0)
14742 continue;
14743
14744 int SrcSublane = M / NumEltsPerSublane;
14745 int DstLane = i / NumEltsPerLane;
14746
14747 // We only need to get the elements into the right lane, not sublane.
14748 // So search all sublanes that make up the destination lane.
14749 bool Found = false;
14750 int DstSubStart = DstLane * NumSublanesPerLane;
14751 int DstSubEnd = DstSubStart + NumSublanesPerLane;
14752 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
14753 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
14754 continue;
14755
14756 Found = true;
14757 CrossLaneMaskLarge[DstSublane] = SrcSublane;
14758 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
14759 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
14760 break;
14761 }
14762 if (!Found)
14763 return SDValue();
14764 }
14765
14766 // Fill CrossLaneMask using CrossLaneMaskLarge.
14767 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
14768
14769 if (!CanUseSublanes) {
14770 // If we're only shuffling a single lowest lane and the rest are identity
14771 // then don't bother.
14772 // TODO - isShuffleMaskInputInPlace could be extended to something like
14773 // this.
14774 int NumIdentityLanes = 0;
14775 bool OnlyShuffleLowestLane = true;
14776 for (int i = 0; i != NumLanes; ++i) {
14777 int LaneOffset = i * NumEltsPerLane;
14778 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
14779 i * NumEltsPerLane))
14780 NumIdentityLanes++;
14781 else if (CrossLaneMask[LaneOffset] != 0)
14782 OnlyShuffleLowestLane = false;
14783 }
14784 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
14785 return SDValue();
14786 }
14787
14788 // Avoid returning the same shuffle operation. For example,
14789 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
14790 // undef:v16i16
14791 if (CrossLaneMask == Mask || InLaneMask == Mask)
14792 return SDValue();
14793
14794 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
14795 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
14796 InLaneMask);
14797 };
14798
14799 // First attempt a solution with full lanes.
14800 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
14801 return V;
14802
14803 // The rest of the solutions use sublanes.
14804 if (!CanUseSublanes)
14805 return SDValue();
14806
14807 // Then attempt a solution with 64-bit sublanes (vpermq).
14808 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
14809 return V;
14810
14811 // If that doesn't work and we have fast variable cross-lane shuffle,
14812 // attempt 32-bit sublanes (vpermd).
14813 if (!Subtarget.hasFastVariableCrossLaneShuffle())
14814 return SDValue();
14815
14816 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
14817}
14818
14819 /// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
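/// For example, with LaneSize 4 the v8f32 mask <0,1,6,7,4,5,2,3> becomes
/// <0,1,10,11,4,5,14,15>: in-lane references are kept as-is and cross-lane
/// references are redirected into the second operand so they stay within the
/// destination's 128-bit lane.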
14820static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
14821 SmallVector<int> &InLaneMask) {
14822 int Size = Mask.size();
14823 InLaneMask.assign(Mask.begin(), Mask.end());
14824 for (int i = 0; i < Size; ++i) {
14825 int &M = InLaneMask[i];
14826 if (M < 0)
14827 continue;
14828 if (((M % Size) / LaneSize) != (i / LaneSize))
14829 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
14830 }
14831}
14832
14833/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
14834/// source with a lane permutation.
14835///
14836/// This lowering strategy results in four instructions in the worst case for a
14837/// single-input cross lane shuffle which is lower than any other fully general
14838/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
14839/// shuffle pattern should be handled prior to trying this lowering.
14840 static SDValue lowerShuffleAsLanePermuteAndShuffle(
14841 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14842 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14843 // FIXME: This should probably be generalized for 512-bit vectors as well.
14844 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
14845 int Size = Mask.size();
14846 int LaneSize = Size / 2;
14847
14848 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14849 // Only do this if the elements aren't all from the lower lane,
14850 // otherwise we're (probably) better off doing a split.
14851 if (VT == MVT::v4f64 &&
14852 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
14853 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
14854
14855 // If there are only inputs from one 128-bit lane, splitting will in fact be
14856 // less expensive. The flags track whether the given lane contains an element
14857 // that crosses to another lane.
14858 bool AllLanes;
14859 if (!Subtarget.hasAVX2()) {
14860 bool LaneCrossing[2] = {false, false};
14861 for (int i = 0; i < Size; ++i)
14862 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
14863 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
14864 AllLanes = LaneCrossing[0] && LaneCrossing[1];
14865 } else {
14866 bool LaneUsed[2] = {false, false};
14867 for (int i = 0; i < Size; ++i)
14868 if (Mask[i] >= 0)
14869 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
14870 AllLanes = LaneUsed[0] && LaneUsed[1];
14871 }
14872
14873 // TODO - we could support shuffling V2 in the Flipped input.
14874 assert(V2.isUndef() &&
14875 "This last part of this routine only works on single input shuffles");
14876
14877 SmallVector<int> InLaneMask;
14878 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
14879
14880 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
14881 "In-lane shuffle mask expected");
14882
14883 // If we aren't using both lanes and the in-lane mask is not repeating,
14884 // then we're better off splitting.
14885 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
14886 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14887 /*SimpleOnly*/ false);
14888
14889 // Flip the lanes, and shuffle the results which should now be in-lane.
14890 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
14891 SDValue Flipped = DAG.getBitcast(PVT, V1);
14892 Flipped =
14893 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
14894 Flipped = DAG.getBitcast(VT, Flipped);
14895 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
14896}
14897
14898/// Handle lowering 2-lane 128-bit shuffles.
14899 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
14900 SDValue V2, ArrayRef<int> Mask,
14901 const APInt &Zeroable,
14902 const X86Subtarget &Subtarget,
14903 SelectionDAG &DAG) {
14904 if (V2.isUndef()) {
14905 // Attempt to match VBROADCAST*128 subvector broadcast load.
14906 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
14907 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
14908 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
14909 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
14910 MVT MemVT = VT.getHalfNumVectorElementsVT();
14911 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
14912 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
14913 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
14914 VT, MemVT, Ld, Ofs, DAG))
14915 return BcstLd;
14916 }
14917
14918 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
14919 if (Subtarget.hasAVX2())
14920 return SDValue();
14921 }
14922
14923 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
14924
14925 SmallVector<int, 4> WidenedMask;
14926 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
14927 return SDValue();
14928
14929 bool IsLowZero = (Zeroable & 0x3) == 0x3;
14930 bool IsHighZero = (Zeroable & 0xc) == 0xc;
14931
14932 // Try to use an insert into a zero vector.
14933 if (WidenedMask[0] == 0 && IsHighZero) {
14934 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14935 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
14936 DAG.getIntPtrConstant(0, DL));
14937 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14938 getZeroVector(VT, Subtarget, DAG, DL), LoV,
14939 DAG.getIntPtrConstant(0, DL));
14940 }
14941
14942 // TODO: If minimizing size and one of the inputs is a zero vector and the
14943 // zero vector has only one use, we could use a VPERM2X128 to save the
14944 // instruction bytes needed to explicitly generate the zero vector.
14945
14946 // Blends are faster and handle all the non-lane-crossing cases.
14947 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
14948 Subtarget, DAG))
14949 return Blend;
14950
14951 // If either input operand is a zero vector, use VPERM2X128 because its mask
14952 // allows us to replace the zero input with an implicit zero.
14953 if (!IsLowZero && !IsHighZero) {
14954 // Check for patterns which can be matched with a single insert of a 128-bit
14955 // subvector.
14956 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
14957 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
14958
14959 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
14960 // this will likely become vinsertf128 which can't fold a 256-bit memop.
14961 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
14962 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14963 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
14964 OnlyUsesV1 ? V1 : V2,
14965 DAG.getIntPtrConstant(0, DL));
14966 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
14967 DAG.getIntPtrConstant(2, DL));
14968 }
14969 }
14970
14971 // Try to use SHUF128 if possible.
14972 if (Subtarget.hasVLX()) {
14973 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
14974 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
14975 ((WidenedMask[1] % 2) << 1);
14976 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
14977 DAG.getTargetConstant(PermMask, DL, MVT::i8));
14978 }
14979 }
14980 }
14981
14982 // Otherwise form a 128-bit permutation. After accounting for undefs,
14983 // convert the 64-bit shuffle mask selection values into 128-bit
14984 // selection bits by dividing the indexes by 2 and shifting into positions
14985 // defined by a vperm2*128 instruction's immediate control byte.
14986
14987 // The immediate permute control byte looks like this:
14988 // [1:0] - select 128 bits from sources for low half of destination
14989 // [2] - ignore
14990 // [3] - zero low half of destination
14991 // [5:4] - select 128 bits from sources for high half of destination
14992 // [6] - ignore
14993 // [7] - zero high half of destination
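// For example, the widened mask <1, 2> (high 128 bits of V1, then low 128
// bits of V2) yields an immediate of 0x21, while a zeroable low or high half
// sets bit 3 (0x08) or bit 7 (0x80) respectively.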
14994
14995 assert((WidenedMask[0] >= 0 || IsLowZero) &&
14996 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
14997
14998 unsigned PermMask = 0;
14999 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15000 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15001
15002 // Check the immediate mask and replace unused sources with undef.
15003 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15004 V1 = DAG.getUNDEF(VT);
15005 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15006 V2 = DAG.getUNDEF(VT);
15007
15008 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15009 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15010}
15011
15012/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15013/// shuffling each lane.
15014///
15015/// This attempts to create a repeated lane shuffle where each lane uses one
15016/// or two of the lanes of the inputs. The lanes of the input vectors are
15017/// shuffled in one or two independent shuffles to get the lanes into the
15018/// position needed by the final shuffle.
15019 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15020 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15021 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15022 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15023
15024 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15025 return SDValue();
15026
15027 int NumElts = Mask.size();
15028 int NumLanes = VT.getSizeInBits() / 128;
15029 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15030 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15031 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15032
15033 // First pass will try to fill in the RepeatMask from lanes that need two
15034 // sources.
15035 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15036 int Srcs[2] = {-1, -1};
15037 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15038 for (int i = 0; i != NumLaneElts; ++i) {
15039 int M = Mask[(Lane * NumLaneElts) + i];
15040 if (M < 0)
15041 continue;
15042 // Determine which of the possible input lanes (NumLanes from each source)
15043 // this element comes from. Assign that as one of the sources for this
15044 // lane. We can assign up to 2 sources for this lane. If we run out
15045 // of sources we can't do anything.
15046 int LaneSrc = M / NumLaneElts;
15047 int Src;
15048 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15049 Src = 0;
15050 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15051 Src = 1;
15052 else
15053 return SDValue();
15054
15055 Srcs[Src] = LaneSrc;
15056 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15057 }
15058
15059 // If this lane has two sources, see if it fits with the repeat mask so far.
15060 if (Srcs[1] < 0)
15061 continue;
15062
15063 LaneSrcs[Lane][0] = Srcs[0];
15064 LaneSrcs[Lane][1] = Srcs[1];
15065
15066 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15067 assert(M1.size() == M2.size() && "Unexpected mask size");
15068 for (int i = 0, e = M1.size(); i != e; ++i)
15069 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15070 return false;
15071 return true;
15072 };
15073
15074 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15075 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15076 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15077 int M = Mask[i];
15078 if (M < 0)
15079 continue;
15080 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15081 "Unexpected mask element");
15082 MergedMask[i] = M;
15083 }
15084 };
15085
15086 if (MatchMasks(InLaneMask, RepeatMask)) {
15087 // Merge this lane mask into the final repeat mask.
15088 MergeMasks(InLaneMask, RepeatMask);
15089 continue;
15090 }
15091
15092 // Didn't find a match. Swap the operands and try again.
15093 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15094 ShuffleVectorSDNode::commuteShuffleMask(InLaneMask, NumElts);
15095
15096 if (MatchMasks(InLaneMask, RepeatMask)) {
15097 // Merge this lane mask into the final repeat mask.
15098 MergeMasks(InLaneMask, RepeatMask);
15099 continue;
15100 }
15101
15102 // Couldn't find a match with the operands in either order.
15103 return SDValue();
15104 }
15105
15106 // Now handle any lanes with only one source.
15107 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15108 // If this lane has already been processed, skip it.
15109 if (LaneSrcs[Lane][0] >= 0)
15110 continue;
15111
15112 for (int i = 0; i != NumLaneElts; ++i) {
15113 int M = Mask[(Lane * NumLaneElts) + i];
15114 if (M < 0)
15115 continue;
15116
15117 // If RepeatMask isn't defined yet we can define it ourselves.
15118 if (RepeatMask[i] < 0)
15119 RepeatMask[i] = M % NumLaneElts;
15120
15121 if (RepeatMask[i] < NumElts) {
15122 if (RepeatMask[i] != M % NumLaneElts)
15123 return SDValue();
15124 LaneSrcs[Lane][0] = M / NumLaneElts;
15125 } else {
15126 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15127 return SDValue();
15128 LaneSrcs[Lane][1] = M / NumLaneElts;
15129 }
15130 }
15131
15132 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15133 return SDValue();
15134 }
15135
15136 SmallVector<int, 16> NewMask(NumElts, -1);
15137 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15138 int Src = LaneSrcs[Lane][0];
15139 for (int i = 0; i != NumLaneElts; ++i) {
15140 int M = -1;
15141 if (Src >= 0)
15142 M = Src * NumLaneElts + i;
15143 NewMask[Lane * NumLaneElts + i] = M;
15144 }
15145 }
15146 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15147 // Ensure we didn't get back the shuffle we started with.
15148 // FIXME: This is a hack to make up for some splat handling code in
15149 // getVectorShuffle.
15150 if (isa<ShuffleVectorSDNode>(NewV1) &&
15151 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15152 return SDValue();
15153
15154 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15155 int Src = LaneSrcs[Lane][1];
15156 for (int i = 0; i != NumLaneElts; ++i) {
15157 int M = -1;
15158 if (Src >= 0)
15159 M = Src * NumLaneElts + i;
15160 NewMask[Lane * NumLaneElts + i] = M;
15161 }
15162 }
15163 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15164 // Ensure we didn't get back the shuffle we started with.
15165 // FIXME: This is a hack to make up for some splat handling code in
15166 // getVectorShuffle.
15167 if (isa<ShuffleVectorSDNode>(NewV2) &&
15168 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15169 return SDValue();
15170
15171 for (int i = 0; i != NumElts; ++i) {
15172 if (Mask[i] < 0) {
15173 NewMask[i] = -1;
15174 continue;
15175 }
15176 NewMask[i] = RepeatMask[i % NumLaneElts];
15177 if (NewMask[i] < 0)
15178 continue;
15179
15180 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15181 }
15182 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15183}
15184
15185/// If the input shuffle mask results in a vector that is undefined in all upper
15186/// or lower half elements and that mask accesses only 2 halves of the
15187/// shuffle's operands, return true. A mask of half the width with mask indexes
15188/// adjusted to access the extracted halves of the original shuffle operands is
15189/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15190/// lower half of each input operand is accessed.
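/// For example, the v8i32 mask <u,u,u,u,2,3,8,9> produces HalfMask <2,3,4,5>
/// with HalfIdx1 = 0 (lower half of V1) and HalfIdx2 = 2 (lower half of V2).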
15191static bool
15192 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15193 int &HalfIdx1, int &HalfIdx2) {
15194 assert((Mask.size() == HalfMask.size() * 2) &&
15195 "Expected input mask to be twice as long as output");
15196
15197 // Exactly one half of the result must be undef to allow narrowing.
15198 bool UndefLower = isUndefLowerHalf(Mask);
15199 bool UndefUpper = isUndefUpperHalf(Mask);
15200 if (UndefLower == UndefUpper)
15201 return false;
15202
15203 unsigned HalfNumElts = HalfMask.size();
15204 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15205 HalfIdx1 = -1;
15206 HalfIdx2 = -1;
15207 for (unsigned i = 0; i != HalfNumElts; ++i) {
15208 int M = Mask[i + MaskIndexOffset];
15209 if (M < 0) {
15210 HalfMask[i] = M;
15211 continue;
15212 }
15213
15214 // Determine which of the 4 half vectors this element is from.
15215 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15216 int HalfIdx = M / HalfNumElts;
15217
15218 // Determine the element index into its half vector source.
15219 int HalfElt = M % HalfNumElts;
15220
15221 // We can shuffle with up to 2 half vectors, set the new 'half'
15222 // shuffle mask accordingly.
15223 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15224 HalfMask[i] = HalfElt;
15225 HalfIdx1 = HalfIdx;
15226 continue;
15227 }
15228 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15229 HalfMask[i] = HalfElt + HalfNumElts;
15230 HalfIdx2 = HalfIdx;
15231 continue;
15232 }
15233
15234 // Too many half vectors referenced.
15235 return false;
15236 }
15237
15238 return true;
15239}
15240
15241/// Given the output values from getHalfShuffleMask(), create a half width
15242/// shuffle of extracted vectors followed by an insert back to full width.
15243 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15244 ArrayRef<int> HalfMask, int HalfIdx1,
15245 int HalfIdx2, bool UndefLower,
15246 SelectionDAG &DAG, bool UseConcat = false) {
15247 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15248 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15249
15250 MVT VT = V1.getSimpleValueType();
15251 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15252 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15253
15254 auto getHalfVector = [&](int HalfIdx) {
15255 if (HalfIdx < 0)
15256 return DAG.getUNDEF(HalfVT);
15257 SDValue V = (HalfIdx < 2 ? V1 : V2);
15258 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15259 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15260 DAG.getIntPtrConstant(HalfIdx, DL));
15261 };
15262
15263 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15264 SDValue Half1 = getHalfVector(HalfIdx1);
15265 SDValue Half2 = getHalfVector(HalfIdx2);
15266 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15267 if (UseConcat) {
15268 SDValue Op0 = V;
15269 SDValue Op1 = DAG.getUNDEF(HalfVT);
15270 if (UndefLower)
15271 std::swap(Op0, Op1);
15272 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15273 }
15274
15275 unsigned Offset = UndefLower ? HalfNumElts : 0;
15276 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15277 DAG.getIntPtrConstant(Offset, DL));
15278}
15279
15280/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15281/// This allows for fast cases such as subvector extraction/insertion
15282/// or shuffling smaller vector types which can lower more efficiently.
15283 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15284 SDValue V2, ArrayRef<int> Mask,
15285 const X86Subtarget &Subtarget,
15286 SelectionDAG &DAG) {
15287 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15288 "Expected 256-bit or 512-bit vector");
15289
15290 bool UndefLower = isUndefLowerHalf(Mask);
15291 if (!UndefLower && !isUndefUpperHalf(Mask))
15292 return SDValue();
15293
15294 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15295 "Completely undef shuffle mask should have been simplified already");
15296
15297 // Upper half is undef and lower half is whole upper subvector.
15298 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15299 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15300 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15301 if (!UndefLower &&
15302 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15303 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15304 DAG.getIntPtrConstant(HalfNumElts, DL));
15305 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15306 DAG.getIntPtrConstant(0, DL));
15307 }
15308
15309 // Lower half is undef and upper half is whole lower subvector.
15310 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15311 if (UndefLower &&
15312 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15313 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15314 DAG.getIntPtrConstant(0, DL));
15315 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15316 DAG.getIntPtrConstant(HalfNumElts, DL));
15317 }
15318
15319 int HalfIdx1, HalfIdx2;
15320 SmallVector<int, 8> HalfMask(HalfNumElts);
15321 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15322 return SDValue();
15323
15324 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15325
15326 // Only shuffle the halves of the inputs when useful.
15327 unsigned NumLowerHalves =
15328 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15329 unsigned NumUpperHalves =
15330 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15331 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15332
15333 // Determine the larger pattern of undef/halves, then decide if it's worth
15334 // splitting the shuffle based on subtarget capabilities and types.
15335 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15336 if (!UndefLower) {
15337 // XXXXuuuu: no insert is needed.
15338 // Always extract lowers when setting lower - these are all free subreg ops.
15339 if (NumUpperHalves == 0)
15340 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15341 UndefLower, DAG);
15342
15343 if (NumUpperHalves == 1) {
15344 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15345 if (Subtarget.hasAVX2()) {
15346 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
15347 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15348 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15349 (!isSingleSHUFPSMask(HalfMask) ||
15350 Subtarget.hasFastVariableCrossLaneShuffle()))
15351 return SDValue();
15352 // If this is a unary shuffle (assume that the 2nd operand is
15353 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15354 // are better off extracting the upper half of 1 operand and using a
15355 // narrow shuffle.
15356 if (EltWidth == 64 && V2.isUndef())
15357 return SDValue();
15358 }
15359 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15360 if (Subtarget.hasAVX512() && VT.is512BitVector())
15361 return SDValue();
15362 // Extract + narrow shuffle is better than the wide alternative.
15363 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15364 UndefLower, DAG);
15365 }
15366
15367 // Don't extract both uppers, instead shuffle and then extract.
15368 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15369 return SDValue();
15370 }
15371
15372 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15373 if (NumUpperHalves == 0) {
15374 // AVX2 has efficient 64-bit element cross-lane shuffles.
15375 // TODO: Refine to account for unary shuffle, splat, and other masks?
15376 if (Subtarget.hasAVX2() && EltWidth == 64)
15377 return SDValue();
15378 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15379 if (Subtarget.hasAVX512() && VT.is512BitVector())
15380 return SDValue();
15381 // Narrow shuffle + insert is better than the wide alternative.
15382 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15383 UndefLower, DAG);
15384 }
15385
15386 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
15387 return SDValue();
15388}
15389
15390/// Handle case where shuffle sources are coming from the same 128-bit lane and
15391/// every lane can be represented as the same repeating mask - allowing us to
15392/// shuffle the sources with the repeating shuffle and then permute the result
15393/// to the destination lanes.
15394 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
15395 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15396 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15397 int NumElts = VT.getVectorNumElements();
15398 int NumLanes = VT.getSizeInBits() / 128;
15399 int NumLaneElts = NumElts / NumLanes;
15400
15401 // On AVX2 we may be able to just shuffle the lowest elements and then
15402 // broadcast the result.
15403 if (Subtarget.hasAVX2()) {
15404 for (unsigned BroadcastSize : {16, 32, 64}) {
15405 if (BroadcastSize <= VT.getScalarSizeInBits())
15406 continue;
15407 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
15408
15409 // Attempt to match a repeating pattern every NumBroadcastElts,
15410 // accounting for UNDEFs but only referencing the lowest 128-bit
15411 // lane of the inputs.
15412 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
15413 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15414 for (int j = 0; j != NumBroadcastElts; ++j) {
15415 int M = Mask[i + j];
15416 if (M < 0)
15417 continue;
15418 int &R = RepeatMask[j];
15419 if (0 != ((M % NumElts) / NumLaneElts))
15420 return false;
15421 if (0 <= R && R != M)
15422 return false;
15423 R = M;
15424 }
15425 return true;
15426 };
15427
15428 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15429 if (!FindRepeatingBroadcastMask(RepeatMask))
15430 continue;
15431
15432 // Shuffle the (lowest) repeated elements in place for broadcast.
15433 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
15434
15435 // Shuffle the actual broadcast.
15436 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15437 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15438 for (int j = 0; j != NumBroadcastElts; ++j)
15439 BroadcastMask[i + j] = j;
15440
15441 // Avoid returning the same shuffle operation. For example,
15442 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
15443 if (BroadcastMask == Mask)
15444 return SDValue();
15445
15446 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
15447 BroadcastMask);
15448 }
15449 }
15450
15451 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15452 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
15453 return SDValue();
15454
15455 // Bail if we already have a repeated lane shuffle mask.
15456 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15457 return SDValue();
15458
15459 // Helper to look for a repeated mask in each split sublane, and to check
15460 // that those sublanes can then be permuted into place.
15461 auto ShuffleSubLanes = [&](int SubLaneScale) {
15462 int NumSubLanes = NumLanes * SubLaneScale;
15463 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15464
15465 // Check that all the sources are coming from the same lane and see if we
15466 // can form a repeating shuffle mask (local to each sub-lane). At the same
15467 // time, determine the source sub-lane for each destination sub-lane.
15468 int TopSrcSubLane = -1;
15469 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15470 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
15471 SubLaneScale,
15472 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
15473
15474 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15475 // Extract the sub-lane mask, check that it all comes from the same lane
15476 // and normalize the mask entries to come from the first lane.
15477 int SrcLane = -1;
15478 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15479 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15480 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15481 if (M < 0)
15482 continue;
15483 int Lane = (M % NumElts) / NumLaneElts;
15484 if ((0 <= SrcLane) && (SrcLane != Lane))
15485 return SDValue();
15486 SrcLane = Lane;
15487 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15488 SubLaneMask[Elt] = LocalM;
15489 }
15490
15491 // Whole sub-lane is UNDEF.
15492 if (SrcLane < 0)
15493 continue;
15494
15495 // Attempt to match against the candidate repeated sub-lane masks.
15496 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15497 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
15498 for (int i = 0; i != NumSubLaneElts; ++i) {
15499 if (M1[i] < 0 || M2[i] < 0)
15500 continue;
15501 if (M1[i] != M2[i])
15502 return false;
15503 }
15504 return true;
15505 };
15506
15507 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15508 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15509 continue;
15510
15511 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15512 for (int i = 0; i != NumSubLaneElts; ++i) {
15513 int M = SubLaneMask[i];
15514 if (M < 0)
15515 continue;
15516 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15517 "Unexpected mask element");
15518 RepeatedSubLaneMask[i] = M;
15519 }
15520
15521 // Track the topmost source sub-lane - by setting the remaining to
15522 // UNDEF we can greatly simplify shuffle matching.
15523 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15524 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15525 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15526 break;
15527 }
15528
15529 // Bail if we failed to find a matching repeated sub-lane mask.
15530 if (Dst2SrcSubLanes[DstSubLane] < 0)
15531 return SDValue();
15532 }
15533 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15534 "Unexpected source lane");
15535
15536 // Create a repeating shuffle mask for the entire vector.
15537 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15538 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15539 int Lane = SubLane / SubLaneScale;
15540 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15541 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15542 int M = RepeatedSubLaneMask[Elt];
15543 if (M < 0)
15544 continue;
15545 int Idx = (SubLane * NumSubLaneElts) + Elt;
15546 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15547 }
15548 }
15549
15550 // Shuffle each source sub-lane to its destination.
15551 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15552 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15553 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15554 if (SrcSubLane < 0)
15555 continue;
15556 for (int j = 0; j != NumSubLaneElts; ++j)
15557 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15558 }
15559
15560 // Avoid returning the same shuffle operation.
15561 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
15562 if (RepeatedMask == Mask || SubLaneMask == Mask)
15563 return SDValue();
15564
15565 SDValue RepeatedShuffle =
15566 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
15567
15568 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
15569 SubLaneMask);
15570 };
15571
15572 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15573 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
15574 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
15575 // Otherwise we can only permute whole 128-bit lanes.
15576 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
15577 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
15578 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
15579 MinSubLaneScale = 2;
15580 MaxSubLaneScale =
15581 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
15582 }
15583 if (Subtarget.hasBWI() && VT == MVT::v64i8)
15584 MinSubLaneScale = MaxSubLaneScale = 4;
15585
15586 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
15587 if (SDValue Shuffle = ShuffleSubLanes(Scale))
15588 return Shuffle;
15589
15590 return SDValue();
15591}
15592
15593 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
15594 bool &ForceV1Zero, bool &ForceV2Zero,
15595 unsigned &ShuffleImm, ArrayRef<int> Mask,
15596 const APInt &Zeroable) {
15597 int NumElts = VT.getVectorNumElements();
15598 assert(VT.getScalarSizeInBits() == 64 &&
15599 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15600 "Unexpected data type for VSHUFPD");
15601 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
15602 "Illegal shuffle mask");
15603
15604 bool ZeroLane[2] = { true, true };
15605 for (int i = 0; i < NumElts; ++i)
15606 ZeroLane[i & 1] &= Zeroable[i];
15607
15608 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
15609 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
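// For example, the v4f64 mask <1,5,2,7> matches directly and produces
// ShuffleImm 0b1011.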
15610 ShuffleImm = 0;
15611 bool ShufpdMask = true;
15612 bool CommutableMask = true;
15613 for (int i = 0; i < NumElts; ++i) {
15614 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
15615 continue;
15616 if (Mask[i] < 0)
15617 return false;
15618 int Val = (i & 6) + NumElts * (i & 1);
15619 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15620 if (Mask[i] < Val || Mask[i] > Val + 1)
15621 ShufpdMask = false;
15622 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15623 CommutableMask = false;
15624 ShuffleImm |= (Mask[i] % 2) << i;
15625 }
15626
15627 if (!ShufpdMask && !CommutableMask)
15628 return false;
15629
15630 if (!ShufpdMask && CommutableMask)
15631 std::swap(V1, V2);
15632
15633 ForceV1Zero = ZeroLane[0];
15634 ForceV2Zero = ZeroLane[1];
15635 return true;
15636}
15637
15638 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
15639 SDValue V2, ArrayRef<int> Mask,
15640 const APInt &Zeroable,
15641 const X86Subtarget &Subtarget,
15642 SelectionDAG &DAG) {
15643 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15644 "Unexpected data type for VSHUFPD");
15645
15646 unsigned Immediate = 0;
15647 bool ForceV1Zero = false, ForceV2Zero = false;
15648 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
15649 Mask, Zeroable))
15650 return SDValue();
15651
15652 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15653 if (ForceV1Zero)
15654 V1 = getZeroVector(VT, Subtarget, DAG, DL);
15655 if (ForceV2Zero)
15656 V2 = getZeroVector(VT, Subtarget, DAG, DL);
15657
15658 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15659 DAG.getTargetConstant(Immediate, DL, MVT::i8));
15660}
15661
15662 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
15663 // by zeroable elements in the remaining 24 elements. Turn this into two
15664 // vpmovqb instructions shuffled together.
15665 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
15666 SDValue V1, SDValue V2,
15667 ArrayRef<int> Mask,
15668 const APInt &Zeroable,
15669 SelectionDAG &DAG) {
15670 assert(VT == MVT::v32i8 && "Unexpected type!");
15671
15672 // The first 8 indices should be every 8th element.
15673 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
15674 return SDValue();
15675
15676 // Remaining elements need to be zeroable.
15677 if (Zeroable.countl_one() < (Mask.size() - 8))
15678 return SDValue();
15679
15680 V1 = DAG.getBitcast(MVT::v4i64, V1);
15681 V2 = DAG.getBitcast(MVT::v4i64, V2);
15682
15683 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
15684 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
15685
15686 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
15687 // the upper bits of the result using an unpckldq.
15688 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
15689 { 0, 1, 2, 3, 16, 17, 18, 19,
15690 4, 5, 6, 7, 20, 21, 22, 23 });
15691 // Insert the unpckldq into a zero vector to widen to v32i8.
15692 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
15693 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
15694 DAG.getIntPtrConstant(0, DL));
15695}
15696
15697// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
15698// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
15699// =>
15700// ul = unpckl v1, v2
15701// uh = unpckh v1, v2
15702// a = vperm ul, uh
15703// b = vperm ul, uh
15704//
15705// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
15706// and permute. We cannot directly match v3 because it is split into two
15707// 256-bit vectors in earlier isel stages. Therefore, this function matches a
15708// pair of 256-bit shuffles and makes sure the masks are consecutive.
15709//
15710// Once unpck and permute nodes are created, the permute corresponding to this
15711// shuffle is returned, while the other permute replaces the other half of the
15712// shuffle in the selection dag.
15713 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
15714 SDValue V1, SDValue V2,
15715 ArrayRef<int> Mask,
15716 SelectionDAG &DAG) {
15717 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
15718 VT != MVT::v32i8)
15719 return SDValue();
15720 // <B0, B1, B0+1, B1+1, ..., >
15721 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
15722 unsigned Begin1) {
15723 size_t Size = Mask.size();
15724 assert(Size % 2 == 0 && "Expected even mask size");
15725 for (unsigned I = 0; I < Size; I += 2) {
15726 if (Mask[I] != (int)(Begin0 + I / 2) ||
15727 Mask[I + 1] != (int)(Begin1 + I / 2))
15728 return false;
15729 }
15730 return true;
15731 };
15732 // Check which half this shuffle node is.
15733 int NumElts = VT.getVectorNumElements();
15734 size_t FirstQtr = NumElts / 2;
15735 size_t ThirdQtr = NumElts + NumElts / 2;
15736 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
15737 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
15738 if (!IsFirstHalf && !IsSecondHalf)
15739 return SDValue();
15740
15741 // Find the intersection between shuffle users of V1 and V2.
15742 SmallVector<SDNode *, 2> Shuffles;
15743 for (SDNode *User : V1->uses())
15744 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
15745 User->getOperand(1) == V2)
15746 Shuffles.push_back(User);
15747 // Limit user size to two for now.
15748 if (Shuffles.size() != 2)
15749 return SDValue();
15750 // Find out which half of the 512-bit shuffle each smaller shuffle is.
15751 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
15752 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
15753 SDNode *FirstHalf;
15754 SDNode *SecondHalf;
15755 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
15756 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
15757 FirstHalf = Shuffles[0];
15758 SecondHalf = Shuffles[1];
15759 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
15760 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
15761 FirstHalf = Shuffles[1];
15762 SecondHalf = Shuffles[0];
15763 } else {
15764 return SDValue();
15765 }
15766 // Lower into unpck and perm. Return the perm of this shuffle and replace
15767 // the other.
15768 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
15769 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
15770 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15771 DAG.getTargetConstant(0x20, DL, MVT::i8));
15772 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15773 DAG.getTargetConstant(0x31, DL, MVT::i8));
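  // Note (sketch, not in the original source): for VPERM2X128 the 0x20 immediate
  // selects the low 128-bit lane of each source and 0x31 the high lane of each,
  // so Perm1 and Perm2 form the lower and upper halves of the interleave.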
15774 if (IsFirstHalf) {
15775 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
15776 return Perm1;
15777 }
15778 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
15779 return Perm2;
15780}
15781
15782/// Handle lowering of 4-lane 64-bit floating point shuffles.
15783///
15784/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
15785/// isn't available.
15786static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15787 const APInt &Zeroable, SDValue V1, SDValue V2,
15788 const X86Subtarget &Subtarget,
15789 SelectionDAG &DAG) {
15790 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15791 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15792 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15793
15794 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
15795 Subtarget, DAG))
15796 return V;
15797
15798 if (V2.isUndef()) {
15799 // Check for being able to broadcast a single element.
15800 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
15801 Mask, Subtarget, DAG))
15802 return Broadcast;
15803
15804 // Use low duplicate instructions for masks that match their pattern.
15805 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15806 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
15807
15808 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
15809 // Non-half-crossing single input shuffles can be lowered with an
15810 // interleaved permutation.
15811 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
15812 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
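      // Illustrative (not in the original source): a mask of <1, 0, 3, 2> sets
      // bits 0 and 2 above, giving a VPERMILPD immediate of 0b0101.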
15813 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
15814 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
15815 }
15816
15817 // With AVX2 we have direct support for this permutation.
15818 if (Subtarget.hasAVX2())
15819 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
15820 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15821
15822 // Try to create an in-lane repeating shuffle mask and then shuffle the
15823 // results into the target lanes.
15824 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15825 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15826 return V;
15827
15828 // Try to permute the lanes and then use a per-lane permute.
15829 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
15830 Mask, DAG, Subtarget))
15831 return V;
15832
15833 // Otherwise, fall back.
15834 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
15835 DAG, Subtarget);
15836 }
15837
15838 // Use dedicated unpack instructions for masks that match their pattern.
15839 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
15840 return V;
15841
15842 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
15843 Zeroable, Subtarget, DAG))
15844 return Blend;
15845
15846 // Check if the blend happens to exactly fit that of SHUFPD.
15847 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
15848 Zeroable, Subtarget, DAG))
15849 return Op;
15850
15851 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
15852 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
15853
15854 // If we have lane crossing shuffles AND they don't all come from the lower
15855 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15856 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
15858 // canonicalizes to a blend of splats, which isn't necessary for this combine.
15858 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
15859 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
15860 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
15861 (V2.getOpcode() != ISD::BUILD_VECTOR))
15862 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
15863
15864 // If we have one input in place, then we can permute the other input and
15865 // blend the result.
15866 if (V1IsInPlace || V2IsInPlace)
15867 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15868 Subtarget, DAG);
15869
15870 // Try to create an in-lane repeating shuffle mask and then shuffle the
15871 // results into the target lanes.
15872 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15873 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15874 return V;
15875
15876 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15877 // shuffle. However, if we have AVX2 and either input is already in place,
15878 // we will be able to shuffle the other input even across lanes in a single
15879 // instruction, so skip this pattern.
15880 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
15881 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
15882 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15883 return V;
15884
15885 // If we have VLX support, we can use VEXPAND.
15886 if (Subtarget.hasVLX())
15887 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
15888 DAG, Subtarget))
15889 return V;
15890
15891 // If we have AVX2 then we always want to lower with a blend because at v4 we
15892 // can fully permute the elements.
15893 if (Subtarget.hasAVX2())
15894 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15895 Subtarget, DAG);
15896
15897 // Otherwise fall back on generic lowering.
15898 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
15899 Subtarget, DAG);
15900}
15901
15902/// Handle lowering of 4-lane 64-bit integer shuffles.
15903///
15904/// This routine is only called when we have AVX2 and thus a reasonable
15905/// instruction set for v4i64 shuffling.
15906static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15907 const APInt &Zeroable, SDValue V1, SDValue V2,
15908 const X86Subtarget &Subtarget,
15909 SelectionDAG &DAG) {
15910 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15911 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15912 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15913 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
15914
15915 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15916 Subtarget, DAG))
15917 return V;
15918
15919 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
15920 Zeroable, Subtarget, DAG))
15921 return Blend;
15922
15923 // Check for being able to broadcast a single element.
15924 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
15925 Subtarget, DAG))
15926 return Broadcast;
15927
15928 // Try to use shift instructions if fast.
15929 if (Subtarget.preferLowerShuffleAsShift())
15930 if (SDValue Shift =
15931 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15932 Subtarget, DAG, /*BitwiseOnly*/ true))
15933 return Shift;
15934
15935 if (V2.isUndef()) {
15936 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
15937 // can use lower latency instructions that will operate on both lanes.
15938 SmallVector<int, 2> RepeatedMask;
15939 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
15940 SmallVector<int, 4> PSHUFDMask;
15941 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
15942 return DAG.getBitcast(
15943 MVT::v4i64,
15944 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
15945 DAG.getBitcast(MVT::v8i32, V1),
15946 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15947 }
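    // Illustrative (not in the original source): a v4i64 mask <1,0,3,2> repeats
    // as <1,0> per 128-bit lane and widens to the v8i32 PSHUFD mask <2,3,0,1>,
    // which the block above emits as a single PSHUFD.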
15948
15949 // AVX2 provides a direct instruction for permuting a single input across
15950 // lanes.
15951 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
15952 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15953 }
15954
15955 // Try to use shift instructions.
15956 if (SDValue Shift =
15957 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
15958 DAG, /*BitwiseOnly*/ false))
15959 return Shift;
15960
15961 // If we have VLX support, we can use VALIGN or VEXPAND.
15962 if (Subtarget.hasVLX()) {
15963 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
15964 Zeroable, Subtarget, DAG))
15965 return Rotate;
15966
15967 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
15968 DAG, Subtarget))
15969 return V;
15970 }
15971
15972 // Try to use PALIGNR.
15973 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
15974 Subtarget, DAG))
15975 return Rotate;
15976
15977 // Use dedicated unpack instructions for masks that match their pattern.
15978 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
15979 return V;
15980
15981 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
15982 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
15983
15984 // If we have one input in place, then we can permute the other input and
15985 // blend the result.
15986 if (V1IsInPlace || V2IsInPlace)
15987 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
15988 Subtarget, DAG);
15989
15990 // Try to create an in-lane repeating shuffle mask and then shuffle the
15991 // results into the target lanes.
15992 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15993 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
15994 return V;
15995
15996 // Try to lower to PERMQ(BLENDD(V1,V2)).
15997 if (SDValue V =
15998 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
15999 return V;
16000
16001 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16002 // shuffle. However, if we have AVX2 and either input is already in place,
16003 // we will be able to shuffle the other input even across lanes in a single
16004 // instruction, so skip this pattern.
16005 if (!V1IsInPlace && !V2IsInPlace)
16006 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16007 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16008 return Result;
16009
16010 // Otherwise fall back on generic blend lowering.
16011 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16012 Subtarget, DAG);
16013}
16014
16015/// Handle lowering of 8-lane 32-bit floating point shuffles.
16016///
16017/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16018/// isn't available.
16019static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16020 const APInt &Zeroable, SDValue V1, SDValue V2,
16021 const X86Subtarget &Subtarget,
16022 SelectionDAG &DAG) {
16023 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16024 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16025 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16026
16027 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16028 Zeroable, Subtarget, DAG))
16029 return Blend;
16030
16031 // Check for being able to broadcast a single element.
16032 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16033 Subtarget, DAG))
16034 return Broadcast;
16035
16036 if (!Subtarget.hasAVX2()) {
16037 SmallVector<int> InLaneMask;
16038 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16039
16040 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16041 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16042 /*SimpleOnly*/ true))
16043 return R;
16044 }
16045 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16046 Zeroable, Subtarget, DAG))
16047 return DAG.getBitcast(MVT::v8f32, ZExt);
16048
16049 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16050 // options to efficiently lower the shuffle.
16051 SmallVector<int, 4> RepeatedMask;
16052 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16053 assert(RepeatedMask.size() == 4 &&
16054 "Repeated masks must be half the mask width!");
16055
16056 // Use even/odd duplicate instructions for masks that match their pattern.
16057 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16058 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16059 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16060 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16061
16062 if (V2.isUndef())
16063 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16064 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16065
16066 // Use dedicated unpack instructions for masks that match their pattern.
16067 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
16068 return V;
16069
16070 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16071 // have already handled any direct blends.
16072 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16073 }
16074
16075 // Try to create an in-lane repeating shuffle mask and then shuffle the
16076 // results into the target lanes.
16077 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16078 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16079 return V;
16080
16081 // If we have a single input shuffle with different shuffle patterns in the
16082 // two 128-bit lanes use the variable mask to VPERMILPS.
16083 if (V2.isUndef()) {
16084 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16085 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16086 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16087 }
16088 if (Subtarget.hasAVX2()) {
16089 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16090 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16091 }
16092 // Otherwise, fall back.
16093 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16094 DAG, Subtarget);
16095 }
16096
16097 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16098 // shuffle.
16099 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16100 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16101 return Result;
16102
16103 // If we have VLX support, we can use VEXPAND.
16104 if (Subtarget.hasVLX())
16105 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
16106 DAG, Subtarget))
16107 return V;
16108
16109 // Try to match an interleave of two v8f32s and lower them as unpck and
16110 // permutes using ymms. This needs to go before we try to split the vectors.
16111 //
16112 // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
16113 // this path inadvertently.
16114 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
16115 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16116 Mask, DAG))
16117 return V;
16118
16119 // For non-AVX512, if the mask is of 16-bit elements within each lane, then try
16120 // to split, since after the split we get more efficient code using vpunpcklwd
16121 // and vpunpckhwd instructions rather than vblend.
16122 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16123 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
16124 DAG);
16125
16126 // If we have AVX2 then we always want to lower with a blend because at v8 we
16127 // can fully permute the elements.
16128 if (Subtarget.hasAVX2())
16129 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16130 Subtarget, DAG);
16131
16132 // Otherwise fall back on generic lowering.
16133 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
16134 Subtarget, DAG);
16135}
16136
16137/// Handle lowering of 8-lane 32-bit integer shuffles.
16138///
16139/// This routine is only called when we have AVX2 and thus a reasonable
16140/// instruction set for v8i32 shuffling.
16141static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16142 const APInt &Zeroable, SDValue V1, SDValue V2,
16143 const X86Subtarget &Subtarget,
16144 SelectionDAG &DAG) {
16145 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16146 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16147 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16148 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16149
16150 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16151
16152 // Whenever we can lower this as a zext, that instruction is strictly faster
16153 // than any alternative. It also allows us to fold memory operands into the
16154 // shuffle in many cases.
16155 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16156 Zeroable, Subtarget, DAG))
16157 return ZExt;
16158
16159 // Try to match an interleave of two v8i32s and lower them as unpck and
16160 // permutes using ymms. This needs to go before we try to split the vectors.
16161 if (!Subtarget.hasAVX512())
16162 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16163 Mask, DAG))
16164 return V;
16165
16166 // For non-AVX512, if the mask is of 16-bit elements within each lane, then try
16167 // to split, since after the split we get more efficient code than vblend by
16168 // using vpunpcklwd and vpunpckhwd instructions.
16169 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16170 !Subtarget.hasAVX512())
16171 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
16172 DAG);
16173
16174 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16175 Zeroable, Subtarget, DAG))
16176 return Blend;
16177
16178 // Check for being able to broadcast a single element.
16179 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16180 Subtarget, DAG))
16181 return Broadcast;
16182
16183 // Try to use shift instructions if fast.
16184 if (Subtarget.preferLowerShuffleAsShift()) {
16185 if (SDValue Shift =
16186 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16187 Subtarget, DAG, /*BitwiseOnly*/ true))
16188 return Shift;
16189 if (NumV2Elements == 0)
16190 if (SDValue Rotate =
16191 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16192 return Rotate;
16193 }
16194
16195 // If the shuffle mask is repeated in each 128-bit lane we can use more
16196 // efficient instructions that mirror the shuffles across the two 128-bit
16197 // lanes.
16198 SmallVector<int, 4> RepeatedMask;
16199 bool Is128BitLaneRepeatedShuffle =
16200 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16201 if (Is128BitLaneRepeatedShuffle) {
16202 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16203 if (V2.isUndef())
16204 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16205 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16206
16207 // Use dedicated unpack instructions for masks that match their pattern.
16208 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
16209 return V;
16210 }
16211
16212 // Try to use shift instructions.
16213 if (SDValue Shift =
16214 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16215 DAG, /*BitwiseOnly*/ false))
16216 return Shift;
16217
16218 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16219 if (SDValue Rotate =
16220 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16221 return Rotate;
16222
16223 // If we have VLX support, we can use VALIGN or EXPAND.
16224 if (Subtarget.hasVLX()) {
16225 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16226 Zeroable, Subtarget, DAG))
16227 return Rotate;
16228
16229 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
16230 DAG, Subtarget))
16231 return V;
16232 }
16233
16234 // Try to use byte rotation instructions.
16235 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16236 Subtarget, DAG))
16237 return Rotate;
16238
16239 // Try to create an in-lane repeating shuffle mask and then shuffle the
16240 // results into the target lanes.
16241 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16242 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16243 return V;
16244
16245 if (V2.isUndef()) {
16246 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16247 // because that should be faster than the variable permute alternatives.
16248 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
16249 return V;
16250
16251 // If the shuffle patterns aren't repeated but it's a single input, directly
16252 // generate a cross-lane VPERMD instruction.
16253 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16254 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16255 }
16256
16257 // Assume that a single SHUFPS is faster than an alternative sequence of
16258 // multiple instructions (even if the CPU has a domain penalty).
16259 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16260 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16261 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16262 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16263 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16264 CastV1, CastV2, DAG);
16265 return DAG.getBitcast(MVT::v8i32, ShufPS);
16266 }
16267
16268 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16269 // shuffle.
16270 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16271 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16272 return Result;
16273
16274 // Otherwise fall back on generic blend lowering.
16275 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16276 Subtarget, DAG);
16277}
16278
16279/// Handle lowering of 16-lane 16-bit integer shuffles.
16280///
16281/// This routine is only called when we have AVX2 and thus a reasonable
16282/// instruction set for v16i16 shuffling.
16283static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16284 const APInt &Zeroable, SDValue V1, SDValue V2,
16285 const X86Subtarget &Subtarget,
16286 SelectionDAG &DAG) {
16287 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16288 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16289 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16290 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16291
16292 // Whenever we can lower this as a zext, that instruction is strictly faster
16293 // than any alternative. It also allows us to fold memory operands into the
16294 // shuffle in many cases.
16295 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16296 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16297 return ZExt;
16298
16299 // Check for being able to broadcast a single element.
16300 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16301 Subtarget, DAG))
16302 return Broadcast;
16303
16304 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16305 Zeroable, Subtarget, DAG))
16306 return Blend;
16307
16308 // Use dedicated unpack instructions for masks that match their pattern.
16309 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
16310 return V;
16311
16312 // Use dedicated pack instructions for masks that match their pattern.
16313 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
16314 Subtarget))
16315 return V;
16316
16317 // Try to lower using a truncation.
16318 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16319 Subtarget, DAG))
16320 return V;
16321
16322 // Try to use shift instructions.
16323 if (SDValue Shift =
16324 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16325 Subtarget, DAG, /*BitwiseOnly*/ false))
16326 return Shift;
16327
16328 // Try to use byte rotation instructions.
16329 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16330 Subtarget, DAG))
16331 return Rotate;
16332
16333 // Try to create an in-lane repeating shuffle mask and then shuffle the
16334 // results into the target lanes.
16335 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16336 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16337 return V;
16338
16339 if (V2.isUndef()) {
16340 // Try to use bit rotation instructions.
16341 if (SDValue Rotate =
16342 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16343 return Rotate;
16344
16345 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16346 // because that should be faster than the variable permute alternatives.
16347 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
16348 return V;
16349
16350 // There are no generalized cross-lane shuffle operations available on i16
16351 // element types.
16352 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16353 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16354 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16355 return V;
16356
16357 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16358 DAG, Subtarget);
16359 }
16360
16361 SmallVector<int, 8> RepeatedMask;
16362 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16363 // As this is a single-input shuffle, the repeated mask should be
16364 // a strictly valid v8i16 mask that we can pass through to the v8i16
16365 // lowering to handle even the v16 case.
16366 return lowerV8I16GeneralSingleInputShuffle(
16367 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16368 }
16369 }
16370
16371 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16372 Zeroable, Subtarget, DAG))
16373 return PSHUFB;
16374
16375 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16376 if (Subtarget.hasBWI())
16377 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
16378
16379 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16380 // shuffle.
16381 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16382 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16383 return Result;
16384
16385 // Try to permute the lanes and then use a per-lane permute.
16386 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16387 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16388 return V;
16389
16390 // Try to match an interleave of two v16i16s and lower them as unpck and
16391 // permutes using ymms.
16392 if (!Subtarget.hasAVX512())
16393 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
16394 Mask, DAG))
16395 return V;
16396
16397 // Otherwise fall back on generic lowering.
16398 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
16399 Subtarget, DAG);
16400}
16401
16402/// Handle lowering of 32-lane 8-bit integer shuffles.
16403///
16404/// This routine is only called when we have AVX2 and thus a reasonable
16405/// instruction set for v32i8 shuffling.
16406static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16407 const APInt &Zeroable, SDValue V1, SDValue V2,
16408 const X86Subtarget &Subtarget,
16409 SelectionDAG &DAG) {
16410 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16411 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16412 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16413 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16414
16415 // Whenever we can lower this as a zext, that instruction is strictly faster
16416 // than any alternative. It also allows us to fold memory operands into the
16417 // shuffle in many cases.
16418 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16419 Zeroable, Subtarget, DAG))
16420 return ZExt;
16421
16422 // Check for being able to broadcast a single element.
16423 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16424 Subtarget, DAG))
16425 return Broadcast;
16426
16427 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16428 Zeroable, Subtarget, DAG))
16429 return Blend;
16430
16431 // Use dedicated unpack instructions for masks that match their pattern.
16432 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
16433 return V;
16434
16435 // Use dedicated pack instructions for masks that match their pattern.
16436 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
16437 Subtarget))
16438 return V;
16439
16440 // Try to lower using a truncation.
16441 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16442 Subtarget, DAG))
16443 return V;
16444
16445 // Try to use shift instructions.
16446 if (SDValue Shift =
16447 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
16448 DAG, /*BitwiseOnly*/ false))
16449 return Shift;
16450
16451 // Try to use byte rotation instructions.
16452 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16453 Subtarget, DAG))
16454 return Rotate;
16455
16456 // Try to use bit rotation instructions.
16457 if (V2.isUndef())
16458 if (SDValue Rotate =
16459 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16460 return Rotate;
16461
16462 // Try to create an in-lane repeating shuffle mask and then shuffle the
16463 // results into the target lanes.
16464 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16465 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16466 return V;
16467
16468 // There are no generalized cross-lane shuffle operations available on i8
16469 // element types.
16470 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16471 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16472 // because that should be faster than the variable permute alternatives.
16473 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
16474 return V;
16475
16476 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16477 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16478 return V;
16479
16480 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16481 DAG, Subtarget);
16482 }
16483
16484 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16485 Zeroable, Subtarget, DAG))
16486 return PSHUFB;
16487
16488 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16489 if (Subtarget.hasVBMI())
16490 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
16491
16492 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16493 // shuffle.
16494 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16495 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16496 return Result;
16497
16498 // Try to permute the lanes and then use a per-lane permute.
16499 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16500 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16501 return V;
16502
16503 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16504 // by zeroable elements in the remaining 24 elements. Turn this into two
16505 // vmovqb instructions shuffled together.
16506 if (Subtarget.hasVLX())
16507 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16508 Mask, Zeroable, DAG))
16509 return V;
16510
16511 // Try to match an interleave of two v32i8s and lower them as unpck and
16512 // permutes using ymms.
16513 if (!Subtarget.hasAVX512())
16514 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
16515 Mask, DAG))
16516 return V;
16517
16518 // Otherwise fall back on generic lowering.
16519 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
16520 Subtarget, DAG);
16521}
16522
16523/// High-level routine to lower various 256-bit x86 vector shuffles.
16524///
16525/// This routine either breaks down the specific type of a 256-bit x86 vector
16526/// shuffle or splits it into two 128-bit shuffles and fuses the results back
16527/// together based on the available instructions.
16528static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16529 SDValue V1, SDValue V2, const APInt &Zeroable,
16530 const X86Subtarget &Subtarget,
16531 SelectionDAG &DAG) {
16532 // If we have a single input to the zero element, insert that into V1 if we
16533 // can do so cheaply.
16534 int NumElts = VT.getVectorNumElements();
16535 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16536
16537 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16538 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16539 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16540 return Insertion;
16541
16542 // Handle special cases where the lower or upper half is UNDEF.
16543 if (SDValue V =
16544 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16545 return V;
16546
16547 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16548 // can check for those subtargets here and avoid much of the subtarget
16549 // querying in the per-vector-type lowering routines. With AVX1 we have
16550 // essentially *zero* ability to manipulate a 256-bit vector with integer
16551 // types. Since we'll use floating point types there eventually, just
16552 // immediately cast everything to a float and operate entirely in that domain.
16553 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16554 int ElementBits = VT.getScalarSizeInBits();
16555 if (ElementBits < 32) {
16556 // No floating point type available, if we can't use the bit operations
16557 // for masking/blending then decompose into 128-bit vectors.
16558 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16559 Subtarget, DAG))
16560 return V;
16561 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16562 return V;
16563 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16564 }
16565
16566 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16567 VT.getVectorNumElements());
16568 V1 = DAG.getBitcast(FpVT, V1);
16569 V2 = DAG.getBitcast(FpVT, V2);
16570 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16571 }
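  // Illustrative note (assumption, not in the original source): on an AVX1-only
  // target a v8i32 shuffle is therefore re-issued in the block above as a v8f32
  // shuffle and the result is bitcast back to the integer type.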
16572
16573 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
16574 V1 = DAG.getBitcast(MVT::v16i16, V1);
16575 V2 = DAG.getBitcast(MVT::v16i16, V2);
16576 return DAG.getBitcast(VT,
16577 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
16578 }
16579
16580 switch (VT.SimpleTy) {
16581 case MVT::v4f64:
16582 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16583 case MVT::v4i64:
16584 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16585 case MVT::v8f32:
16586 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16587 case MVT::v8i32:
16588 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16589 case MVT::v16i16:
16590 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16591 case MVT::v32i8:
16592 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16593
16594 default:
16595 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16596 }
16597}
16598
16599/// Try to lower a vector shuffle as a 128-bit shuffles.
16600static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
16601 const APInt &Zeroable, SDValue V1, SDValue V2,
16602 const X86Subtarget &Subtarget,
16603 SelectionDAG &DAG) {
16604 assert(VT.getScalarSizeInBits() == 64 &&
16605 "Unexpected element type size for 128bit shuffle.");
16606
16607 // Handling a 256-bit vector requires VLX; the function
16608 // lowerV2X128VectorShuffle() is most probably a better solution there.
16609 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16610
16611 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
16612 SmallVector<int, 4> Widened128Mask;
16613 if (!canWidenShuffleElements(Mask, Widened128Mask))
16614 return SDValue();
16615 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
16616
16617 // Try to use an insert into a zero vector.
16618 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16619 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16620 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16621 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16622 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16623 DAG.getIntPtrConstant(0, DL));
16624 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16625 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16626 DAG.getIntPtrConstant(0, DL));
16627 }
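  // Illustrative (not in the original source): for v8i64, a widened mask of
  // <0, Z, Z, Z> keeps only the low 128 bits of V1 and <0, 1, Z, Z> keeps the
  // low 256 bits; either way the kept subvector is inserted into a zero vector.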
16628
16629 // Check for patterns which can be matched with a single insert of a 256-bit
16630 // subvector.
16631 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
16632 if (OnlyUsesV1 ||
16633 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
16634 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
16635 SDValue SubVec =
16636 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
16637 DAG.getIntPtrConstant(0, DL));
16638 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16639 DAG.getIntPtrConstant(4, DL));
16640 }
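  // Illustrative (not in the original source): element masks such as
  // <0,1,2,3,0,1,2,3> (OnlyUsesV1) or <0,1,2,3,8,9,10,11> are handled above as a
  // single insertion of a 256-bit subvector into the upper half of V1.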
16641
16642 // See if this is an insertion of the lower 128-bits of V2 into V1.
16643 bool IsInsert = true;
16644 int V2Index = -1;
16645 for (int i = 0; i < 4; ++i) {
16646 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16647 if (Widened128Mask[i] < 0)
16648 continue;
16649
16650 // Make sure all V1 subvectors are in place.
16651 if (Widened128Mask[i] < 4) {
16652 if (Widened128Mask[i] != i) {
16653 IsInsert = false;
16654 break;
16655 }
16656 } else {
16657 // Make sure we only have a single V2 index and it's the lowest 128 bits.
16658 if (V2Index >= 0 || Widened128Mask[i] != 4) {
16659 IsInsert = false;
16660 break;
16661 }
16662 V2Index = i;
16663 }
16664 }
16665 if (IsInsert && V2Index >= 0) {
16666 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16667 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
16668 DAG.getIntPtrConstant(0, DL));
16669 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
16670 }
16671
16672 // See if we can widen to a 256-bit lane shuffle; we're going to lose the
16673 // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening
16674 // where possible we at least ensure the lanes stay sequential to help later
16675 // combines.
16676 SmallVector<int, 2> Widened256Mask;
16677 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
16678 Widened128Mask.clear();
16679 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
16680 }
16681
16682 // Try to lower to vshuf64x2/vshuf32x4.
16683 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
16684 int PermMask[4] = {-1, -1, -1, -1};
16685 // Ensure elements came from the same Op.
16686 for (int i = 0; i < 4; ++i) {
16687 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16688 if (Widened128Mask[i] < 0)
16689 continue;
16690
16691 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
16692 unsigned OpIndex = i / 2;
16693 if (Ops[OpIndex].isUndef())
16694 Ops[OpIndex] = Op;
16695 else if (Ops[OpIndex] != Op)
16696 return SDValue();
16697
16698 PermMask[i] = Widened128Mask[i] % 4;
16699 }
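  // Illustrative (not in the original source): a widened mask of <0, 1, 4, 5>
  // takes both 128-bit lanes of V1 for the low half and both lanes of V2 for the
  // high half, giving PermMask <0, 1, 0, 1> for the SHUF128 node below.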
16700
16701 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
16702 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
16703}
16704
16705/// Handle lowering of 8-lane 64-bit floating point shuffles.
16706static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16707 const APInt &Zeroable, SDValue V1, SDValue V2,
16708 const X86Subtarget &Subtarget,
16709 SelectionDAG &DAG) {
16710 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16711 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16712 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16713
16714 if (V2.isUndef()) {
16715 // Use low duplicate instructions for masks that match their pattern.
16716 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
16717 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
16718
16719 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
16720 // Non-half-crossing single input shuffles can be lowered with an
16721 // interleaved permutation.
16722 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16723 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
16724 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
16725 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
16726 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
16727 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16728 }
16729
16730 SmallVector<int, 4> RepeatedMask;
16731 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
16732 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
16733 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16734 }
16735
16736 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
16737 V2, Subtarget, DAG))
16738 return Shuf128;
16739
16740 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
16741 return Unpck;
16742
16743 // Check if the blend happens to exactly fit that of SHUFPD.
16744 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
16745 Zeroable, Subtarget, DAG))
16746 return Op;
16747
16748 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
16749 DAG, Subtarget))
16750 return V;
16751
16752 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
16753 Zeroable, Subtarget, DAG))
16754 return Blend;
16755
16756 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
16757}
16758
16759/// Handle lowering of 16-lane 32-bit floating point shuffles.
16760static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16761 const APInt &Zeroable, SDValue V1, SDValue V2,
16762 const X86Subtarget &Subtarget,
16763 SelectionDAG &DAG) {
16764 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16765 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16766 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16767
16768 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16769 // options to efficiently lower the shuffle.
16770 SmallVector<int, 4> RepeatedMask;
16771 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
16772 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16773
16774 // Use even/odd duplicate instructions for masks that match their pattern.
16775 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16776 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
16777 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16778 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
16779
16780 if (V2.isUndef())
16781 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
16782 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16783
16784 // Use dedicated unpack instructions for masks that match their pattern.
16785 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
16786 return V;
16787
16788 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16789 Zeroable, Subtarget, DAG))
16790 return Blend;
16791
16792 // Otherwise, fall back to a SHUFPS sequence.
16793 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
16794 }
16795
16796 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16797 Zeroable, Subtarget, DAG))
16798 return Blend;
16799
16800 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16801 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16802 return DAG.getBitcast(MVT::v16f32, ZExt);
16803
16804 // Try to create an in-lane repeating shuffle mask and then shuffle the
16805 // results into the target lanes.
16807 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
16808 return V;
16809
16810 // If we have a single input shuffle with different shuffle patterns in the
16811 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
16812 if (V2.isUndef() &&
16813 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
16814 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
16815 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
16816 }
16817
16818 // If we have AVX512F support, we can use VEXPAND.
16819 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
16820 V1, V2, DAG, Subtarget))
16821 return V;
16822
16823 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
16824}
16825
16826/// Handle lowering of 8-lane 64-bit integer shuffles.
16827static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16828 const APInt &Zeroable, SDValue V1, SDValue V2,
16829 const X86Subtarget &Subtarget,
16830 SelectionDAG &DAG) {
16831 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16832 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16833 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16834
16835 // Try to use shift instructions if fast.
16836 if (Subtarget.preferLowerShuffleAsShift())
16837 if (SDValue Shift =
16838 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
16839 Subtarget, DAG, /*BitwiseOnly*/ true))
16840 return Shift;
16841
16842 if (V2.isUndef()) {
16843 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16844 // can use lower latency instructions that will operate on all four
16845 // 128-bit lanes.
16846 SmallVector<int, 2> Repeated128Mask;
16847 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
16848 SmallVector<int, 4> PSHUFDMask;
16849 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
16850 return DAG.getBitcast(
16851 MVT::v8i64,
16852 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
16853 DAG.getBitcast(MVT::v16i32, V1),
16854 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16855 }
16856
16857 SmallVector<int, 4> Repeated256Mask;
16858 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
16859 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
16860 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
16861 }
16862
16863 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
16864 V2, Subtarget, DAG))
16865 return Shuf128;
16866
16867 // Try to use shift instructions.
16868 if (SDValue Shift =
16869 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
16870 DAG, /*BitwiseOnly*/ false))
16871 return Shift;
16872
16873 // Try to use VALIGN.
16874 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
16875 Zeroable, Subtarget, DAG))
16876 return Rotate;
16877
16878 // Try to use PALIGNR.
16879 if (Subtarget.hasBWI())
16880 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
16881 Subtarget, DAG))
16882 return Rotate;
16883
16884 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
16885 return Unpck;
16886
16887 // If we have AVX512F support, we can use VEXPAND.
16888 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
16889 DAG, Subtarget))
16890 return V;
16891
16892 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
16893 Zeroable, Subtarget, DAG))
16894 return Blend;
16895
16896 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
16897}
16898
16899/// Handle lowering of 16-lane 32-bit integer shuffles.
16900static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16901 const APInt &Zeroable, SDValue V1, SDValue V2,
16902 const X86Subtarget &Subtarget,
16903 SelectionDAG &DAG) {
16904 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16905 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16906 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16907
16908 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16909
16910 // Whenever we can lower this as a zext, that instruction is strictly faster
16911 // than any alternative. It also allows us to fold memory operands into the
16912 // shuffle in many cases.
16913 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16914 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16915 return ZExt;
16916
16917 // Try to use shift instructions if fast.
16918 if (Subtarget.preferLowerShuffleAsShift()) {
16919 if (SDValue Shift =
16920 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
16921 Subtarget, DAG, /*BitwiseOnly*/ true))
16922 return Shift;
16923 if (NumV2Elements == 0)
16924 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
16925 Subtarget, DAG))
16926 return Rotate;
16927 }
16928
16929 // If the shuffle mask is repeated in each 128-bit lane we can use more
16930 // efficient instructions that mirror the shuffles across the four 128-bit
16931 // lanes.
16932 SmallVector<int, 4> RepeatedMask;
16933 bool Is128BitLaneRepeatedShuffle =
16934 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
16935 if (Is128BitLaneRepeatedShuffle) {
16936 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16937 if (V2.isUndef())
16938 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
16939 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16940
16941 // Use dedicated unpack instructions for masks that match their pattern.
16942 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
16943 return V;
16944 }
16945
16946 // Try to use shift instructions.
16947 if (SDValue Shift =
16948 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
16949 Subtarget, DAG, /*BitwiseOnly*/ false))
16950 return Shift;
16951
16952 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
16953 if (SDValue Rotate =
16954 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
16955 return Rotate;
16956
16957 // Try to use VALIGN.
16958 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
16959 Zeroable, Subtarget, DAG))
16960 return Rotate;
16961
16962 // Try to use byte rotation instructions.
16963 if (Subtarget.hasBWI())
16964 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
16965 Subtarget, DAG))
16966 return Rotate;
16967
16968 // Assume that a single SHUFPS is faster than using a permv shuffle.
16969 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16970 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16971 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
16972 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
16973 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
16974 CastV1, CastV2, DAG);
16975 return DAG.getBitcast(MVT::v16i32, ShufPS);
16976 }
16977
16978 // Try to create an in-lane repeating shuffle mask and then shuffle the
16979 // results into the target lanes.
16981 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
16982 return V;
16983
16984 // If we have AVX512F support, we can use VEXPAND.
16985 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
16986 DAG, Subtarget))
16987 return V;
16988
16989 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
16990 Zeroable, Subtarget, DAG))
16991 return Blend;
16992
16993 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
16994}
16995
16996/// Handle lowering of 32-lane 16-bit integer shuffles.
16997static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16998 const APInt &Zeroable, SDValue V1, SDValue V2,
16999 const X86Subtarget &Subtarget,
17000 SelectionDAG &DAG) {
17001 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17002 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17003 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17004 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17005
17006 // Whenever we can lower this as a zext, that instruction is strictly faster
17007 // than any alternative. It also allows us to fold memory operands into the
17008 // shuffle in many cases.
17009 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17010 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17011 return ZExt;
17012
17013 // Use dedicated unpack instructions for masks that match their pattern.
17014 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17015 return V;
17016
17017 // Use dedicated pack instructions for masks that match their pattern.
17018 if (SDValue V =
17019 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
17020 return V;
17021
17022 // Try to use shift instructions.
17023 if (SDValue Shift =
17024 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17025 Subtarget, DAG, /*BitwiseOnly*/ false))
17026 return Shift;
17027
17028 // Try to use byte rotation instructions.
17029 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17030 Subtarget, DAG))
17031 return Rotate;
17032
17033 if (V2.isUndef()) {
17034 // Try to use bit rotation instructions.
17035 if (SDValue Rotate =
17036 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17037 return Rotate;
17038
17039 SmallVector<int, 8> RepeatedMask;
17040 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17041 // As this is a single-input shuffle, the repeated mask should be
17042 // a strictly valid v8i16 mask that we can pass through to the v8i16
17043 // lowering to handle even the v32 case.
17044 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17045 RepeatedMask, Subtarget, DAG);
17046 }
17047 }
17048
17049 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17050 Zeroable, Subtarget, DAG))
17051 return Blend;
17052
17053 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17054 Zeroable, Subtarget, DAG))
17055 return PSHUFB;
17056
17057 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17058}
17059
17060/// Handle lowering of 64-lane 8-bit integer shuffles.
17061static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17062 const APInt &Zeroable, SDValue V1, SDValue V2,
17063 const X86Subtarget &Subtarget,
17064 SelectionDAG &DAG) {
17065 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17066 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17067 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17068 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17069
17070 // Whenever we can lower this as a zext, that instruction is strictly faster
17071 // than any alternative. It also allows us to fold memory operands into the
17072 // shuffle in many cases.
17073 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17074 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17075 return ZExt;
17076
17077 // Use dedicated unpack instructions for masks that match their pattern.
17078 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
17079 return V;
17080
17081 // Use dedicated pack instructions for masks that match their pattern.
17082 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
17083 Subtarget))
17084 return V;
17085
17086 // Try to use shift instructions.
17087 if (SDValue Shift =
17088 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17089 DAG, /*BitwiseOnly*/ false))
17090 return Shift;
17091
17092 // Try to use byte rotation instructions.
17093 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17094 Subtarget, DAG))
17095 return Rotate;
17096
17097 // Try to use bit rotation instructions.
17098 if (V2.isUndef())
17099 if (SDValue Rotate =
17100 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17101 return Rotate;
17102
17103 // Lower as AND if possible.
17104 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17105 Zeroable, Subtarget, DAG))
17106 return Masked;
17107
17108 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17109 Zeroable, Subtarget, DAG))
17110 return PSHUFB;
17111
17112 // Try to create an in-lane repeating shuffle mask and then shuffle the
17113 // results into the target lanes.
17114 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17115 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17116 return V;
17117
17118 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17119 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17120 return Result;
17121
17122 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17123 Zeroable, Subtarget, DAG))
17124 return Blend;
17125
17126 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17127 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17128 // PALIGNR will be cheaper than the second PSHUFB+OR.
17129 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17130 Mask, Subtarget, DAG))
17131 return V;
17132
17133 // If we can't directly blend but can use PSHUFB, that will be better as it
17134 // can both shuffle and set up the inefficient blend.
17135 bool V1InUse, V2InUse;
17136 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17137 DAG, V1InUse, V2InUse);
17138 }
17139
17140 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17141 // shuffle.
17142 if (!V2.isUndef())
17143 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17144 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17145 return Result;
17146
17147 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17148 if (Subtarget.hasVBMI())
17149 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17150
17151 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17152}
17153
17154/// High-level routine to lower various 512-bit x86 vector shuffles.
17155///
17156/// This routine either breaks down the specific type of a 512-bit x86 vector
17157/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17158/// together based on the available instructions.
17159static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17160 MVT VT, SDValue V1, SDValue V2,
17161 const APInt &Zeroable,
17162 const X86Subtarget &Subtarget,
17163 SelectionDAG &DAG) {
17164 assert(Subtarget.hasAVX512() &&
17165 "Cannot lower 512-bit vectors w/o basic ISA!");
17166
17167 // If we have a single input to the zero element, insert that into V1 if we
17168 // can do so cheaply.
17169 int NumElts = Mask.size();
17170 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17171
17172 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17173 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17174 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17175 return Insertion;
17176
17177 // Handle special cases where the lower or upper half is UNDEF.
17178 if (SDValue V =
17179 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17180 return V;
17181
17182 // Check for being able to broadcast a single element.
17183 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17184 Subtarget, DAG))
17185 return Broadcast;
17186
17187 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17188 // Try using bit ops for masking and blending before falling back to
17189 // splitting.
17190 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17191 Subtarget, DAG))
17192 return V;
17193 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17194 return V;
17195
17196 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17197 }
17198
17199 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17200 if (!Subtarget.hasBWI())
17201 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17202 /*SimpleOnly*/ false);
17203
17204 V1 = DAG.getBitcast(MVT::v32i16, V1);
17205 V2 = DAG.getBitcast(MVT::v32i16, V2);
17206 return DAG.getBitcast(VT,
17207 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17208 }
17209
17210 // Dispatch to each element type for lowering. If we don't have support for
17211 // specific element type shuffles at 512 bits, immediately split them and
17212 // lower them. Each lowering routine of a given type is allowed to assume that
17213 // the requisite ISA extensions for that element type are available.
17214 switch (VT.SimpleTy) {
17215 case MVT::v8f64:
17216 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17217 case MVT::v16f32:
17218 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17219 case MVT::v8i64:
17220 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17221 case MVT::v16i32:
17222 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17223 case MVT::v32i16:
17224 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17225 case MVT::v64i8:
17226 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17227
17228 default:
17229 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17230 }
17231}
17232
17233 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17234 MVT VT, SDValue V1, SDValue V2,
17235 const X86Subtarget &Subtarget,
17236 SelectionDAG &DAG) {
17237 // Shuffle should be unary.
17238 if (!V2.isUndef())
17239 return SDValue();
17240
17241 int ShiftAmt = -1;
17242 int NumElts = Mask.size();
17243 for (int i = 0; i != NumElts; ++i) {
17244 int M = Mask[i];
17245 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17246 "Unexpected mask index.");
17247 if (M < 0)
17248 continue;
17249
17250 // The first non-undef element determines our shift amount.
17251 if (ShiftAmt < 0) {
17252 ShiftAmt = M - i;
17253 // Need to be shifting right.
17254 if (ShiftAmt <= 0)
17255 return SDValue();
17256 }
17257 // All non-undef elements must shift by the same amount.
17258 if (ShiftAmt != M - i)
17259 return SDValue();
17260 }
17261 assert(ShiftAmt >= 0 && "All undef?");
17262
17263 // Great we found a shift right.
17264 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17265 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17266 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17267 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17268 DAG.getIntPtrConstant(0, DL));
17269}
17270
17271// Determine if this shuffle can be implemented with a KSHIFT instruction.
17272// Returns the shift amount if possible or -1 if not. This is a simplified
17273// version of matchShuffleAsShift.
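// For example, for a v8i1 shuffle the mask <2,3,4,5,6,7,Z,Z> (Z = zeroable)
// matches a KSHIFTR by 2, and <Z,Z,0,1,2,3,4,5> matches a KSHIFTL by 2.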
17274static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17275 int MaskOffset, const APInt &Zeroable) {
17276 int Size = Mask.size();
17277
17278 auto CheckZeros = [&](int Shift, bool Left) {
17279 for (int j = 0; j < Shift; ++j)
17280 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17281 return false;
17282
17283 return true;
17284 };
17285
17286 auto MatchShift = [&](int Shift, bool Left) {
17287 unsigned Pos = Left ? Shift : 0;
17288 unsigned Low = Left ? 0 : Shift;
17289 unsigned Len = Size - Shift;
17290 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17291 };
17292
17293 for (int Shift = 1; Shift != Size; ++Shift)
17294 for (bool Left : {true, false})
17295 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17296 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17297 return Shift;
17298 }
17299
17300 return -1;
17301}
17302
17303
17304 // Lower vXi1 vector shuffles.
17305 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17306 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17307 // vector, shuffle, and then truncate it back.
17308 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17309 MVT VT, SDValue V1, SDValue V2,
17310 const APInt &Zeroable,
17311 const X86Subtarget &Subtarget,
17312 SelectionDAG &DAG) {
17313 assert(Subtarget.hasAVX512() &&
17314 "Cannot lower 512-bit vectors w/o basic ISA!");
17315
17316 int NumElts = Mask.size();
17317 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17318
17319 // Try to recognize shuffles that are just padding a subvector with zeros.
17320 int SubvecElts = 0;
17321 int Src = -1;
17322 for (int i = 0; i != NumElts; ++i) {
17323 if (Mask[i] >= 0) {
17324 // Grab the source from the first valid mask. All subsequent elements need
17325 // to use this same source.
17326 if (Src < 0)
17327 Src = Mask[i] / NumElts;
17328 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17329 break;
17330 }
17331
17332 ++SubvecElts;
17333 }
17334 assert(SubvecElts != NumElts && "Identity shuffle?");
17335
17336 // Clip to a power of 2.
17337 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17338
17339 // Make sure the number of zeroable bits in the top at least covers the bits
17340 // not covered by the subvector.
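// For example, a v8i1 shuffle <0,1,2,3,Z,Z,Z,Z> (Z = zeroable) becomes an
// insert of the low v4i1 subvector of the source into an all-zeros vector.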
17341 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17342 assert(Src >= 0 && "Expected a source!");
17343 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17344 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
17345 Src == 0 ? V1 : V2,
17346 DAG.getIntPtrConstant(0, DL));
17347 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17348 DAG.getConstant(0, DL, VT),
17349 Extract, DAG.getIntPtrConstant(0, DL));
17350 }
17351
17352 // Try a simple shift right with undef elements. Later we'll try with zeros.
17353 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
17354 DAG))
17355 return Shift;
17356
17357 // Try to match KSHIFTs.
17358 unsigned Offset = 0;
17359 for (SDValue V : { V1, V2 }) {
17360 unsigned Opcode;
17361 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
17362 if (ShiftAmt >= 0) {
17363 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
17364 MVT WideVT = Res.getSimpleValueType();
17365 // Widened right shifts need two shifts to ensure we shift in zeroes.
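// For example, a v8i1 source widened to v16i1 is first shifted left by 8 so
// the original bits sit in the MSBs; the following right shift then pulls in
// zeroes rather than whatever occupied the widened upper bits.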
17366 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
17367 int WideElts = WideVT.getVectorNumElements();
17368 // Shift left to put the original vector in the MSBs of the new size.
17369 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
17370 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17371 // Increase the shift amount to account for the left shift.
17372 ShiftAmt += WideElts - NumElts;
17373 }
17374
17375 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17376 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17377 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17378 DAG.getIntPtrConstant(0, DL));
17379 }
17380 Offset += NumElts; // Increment for next iteration.
17381 }
17382
17383 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
17384 // ops instead.
17385 // TODO: What other unary shuffles would benefit from this?
17386 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
17387 SDValue Op0 = V1.getOperand(0);
17388 SDValue Op1 = V1.getOperand(1);
17389 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17390 EVT OpVT = Op0.getValueType();
17391 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
17392 return DAG.getSetCC(
17393 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
17394 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
17395 }
17396
17397 MVT ExtVT;
17398 switch (VT.SimpleTy) {
17399 default:
17400 llvm_unreachable("Expected a vector of i1 elements");
17401 case MVT::v2i1:
17402 ExtVT = MVT::v2i64;
17403 break;
17404 case MVT::v4i1:
17405 ExtVT = MVT::v4i32;
17406 break;
17407 case MVT::v8i1:
17408 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
17409 // shuffle.
17410 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17411 break;
17412 case MVT::v16i1:
17413 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17414 // 256-bit operation available.
17415 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
17416 break;
17417 case MVT::v32i1:
17418 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17419 // 256-bit operation available.
17420 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17421 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
17422 break;
17423 case MVT::v64i1:
17424 // Fall back to scalarization. FIXME: We can do better if the shuffle
17425 // can be partitioned cleanly.
17426 if (!Subtarget.useBWIRegs())
17427 return SDValue();
17428 ExtVT = MVT::v64i8;
17429 break;
17430 }
17431
17432 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
17433 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
17434
17435 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
17436 // Since i1 was sign extended, we can recover the mask by comparing against zero.
17437 int NumElems = VT.getVectorNumElements();
17438 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17439 (Subtarget.hasDQI() && (NumElems < 32)))
17440 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
17441 Shuffle, ISD::SETGT);
17442
17443 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
17444}
17445
17446/// Helper function that returns true if the shuffle mask should be
17447/// commuted to improve canonicalization.
17448 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
17449 int NumElements = Mask.size();
17450
17451 int NumV1Elements = 0, NumV2Elements = 0;
17452 for (int M : Mask)
17453 if (M < 0)
17454 continue;
17455 else if (M < NumElements)
17456 ++NumV1Elements;
17457 else
17458 ++NumV2Elements;
17459
17460 // Commute the shuffle as needed such that more elements come from V1 than
17461 // V2. This allows us to match the shuffle pattern strictly on how many
17462 // elements come from V1 without handling the symmetric cases.
17463 if (NumV2Elements > NumV1Elements)
17464 return true;
17465
17466 assert(NumV1Elements > 0 && "No V1 indices");
17467
17468 if (NumV2Elements == 0)
17469 return false;
17470
17471 // When the number of V1 and V2 elements are the same, try to minimize the
17472 // number of uses of V2 in the low half of the vector. When that is tied,
17473 // ensure that the sum of indices for V1 is equal to or lower than the sum of
17474 // indices for V2. When those are equal, try to ensure that the number of odd
17475 // indices for V1 is lower than the number of odd indices for V2.
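// For example, for a 4-element shuffle the mask <4,5,0,1> takes its low half
// entirely from V2, so it is commuted to <0,1,4,5>.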
17476 if (NumV1Elements == NumV2Elements) {
17477 int LowV1Elements = 0, LowV2Elements = 0;
17478 for (int M : Mask.slice(0, NumElements / 2))
17479 if (M >= NumElements)
17480 ++LowV2Elements;
17481 else if (M >= 0)
17482 ++LowV1Elements;
17483 if (LowV2Elements > LowV1Elements)
17484 return true;
17485 if (LowV2Elements == LowV1Elements) {
17486 int SumV1Indices = 0, SumV2Indices = 0;
17487 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17488 if (Mask[i] >= NumElements)
17489 SumV2Indices += i;
17490 else if (Mask[i] >= 0)
17491 SumV1Indices += i;
17492 if (SumV2Indices < SumV1Indices)
17493 return true;
17494 if (SumV2Indices == SumV1Indices) {
17495 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17496 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17497 if (Mask[i] >= NumElements)
17498 NumV2OddIndices += i % 2;
17499 else if (Mask[i] >= 0)
17500 NumV1OddIndices += i % 2;
17501 if (NumV2OddIndices < NumV1OddIndices)
17502 return true;
17503 }
17504 }
17505 }
17506
17507 return false;
17508}
17509
17510 static bool canCombineAsMaskOperation(SDValue V,
17511 const X86Subtarget &Subtarget) {
17512 if (!Subtarget.hasAVX512())
17513 return false;
17514
17515 if (!V.getValueType().isSimple())
17516 return false;
17517
17518 MVT VT = V.getSimpleValueType().getScalarType();
17519 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
17520 return false;
17521
17522 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
17523 // are preferable to blendw/blendvb/masked-mov.
17524 if ((VT == MVT::i16 || VT == MVT::i8) &&
17525 V.getSimpleValueType().getSizeInBits() < 512)
17526 return false;
17527
17528 auto HasMaskOperation = [&](SDValue V) {
17529 // TODO: Currently we only check a limited set of opcodes. We could probably
17530 // extend this to all binary operations by checking TLI.isBinOp().
17531 switch (V->getOpcode()) {
17532 default:
17533 return false;
17534 case ISD::ADD:
17535 case ISD::SUB:
17536 case ISD::AND:
17537 case ISD::XOR:
17538 case ISD::OR:
17539 case ISD::SMAX:
17540 case ISD::SMIN:
17541 case ISD::UMAX:
17542 case ISD::UMIN:
17543 case ISD::ABS:
17544 case ISD::SHL:
17545 case ISD::SRL:
17546 case ISD::SRA:
17547 case ISD::MUL:
17548 break;
17549 }
17550 if (!V->hasOneUse())
17551 return false;
17552
17553 return true;
17554 };
17555
17556 if (HasMaskOperation(V))
17557 return true;
17558
17559 return false;
17560}
17561
17562// Forward declaration.
17563 static SDValue canonicalizeShuffleMaskWithHorizOp(
17564 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
17565 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
17566 const X86Subtarget &Subtarget);
17567
17568 /// Top-level lowering for x86 vector shuffles.
17569///
17570/// This handles decomposition, canonicalization, and lowering of all x86
17571/// vector shuffles. Most of the specific lowering strategies are encapsulated
17572/// above in helper routines. The canonicalization attempts to widen shuffles
17573/// to involve fewer lanes of wider elements, consolidate symmetric patterns
17574/// s.t. only one of the two inputs needs to be tested, etc.
17575 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
17576 SelectionDAG &DAG) {
17577 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17578 ArrayRef<int> OrigMask = SVOp->getMask();
17579 SDValue V1 = Op.getOperand(0);
17580 SDValue V2 = Op.getOperand(1);
17581 MVT VT = Op.getSimpleValueType();
17582 int NumElements = VT.getVectorNumElements();
17583 SDLoc DL(Op);
17584 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17585
17586 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17587 "Can't lower MMX shuffles");
17588
17589 bool V1IsUndef = V1.isUndef();
17590 bool V2IsUndef = V2.isUndef();
17591 if (V1IsUndef && V2IsUndef)
17592 return DAG.getUNDEF(VT);
17593
17594 // When we create a shuffle node we put the UNDEF node as the second operand,
17595 // but in some cases the first operand may be transformed to UNDEF.
17596 // In this case we should just commute the node.
17597 if (V1IsUndef)
17598 return DAG.getCommutedVectorShuffle(*SVOp);
17599
17600 // Check for non-undef masks pointing at an undef vector and make the masks
17601 // undef as well. This makes it easier to match the shuffle based solely on
17602 // the mask.
17603 if (V2IsUndef &&
17604 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17605 SmallVector<int, 8> NewMask(OrigMask);
17606 for (int &M : NewMask)
17607 if (M >= NumElements)
17608 M = -1;
17609 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17610 }
17611
17612 // Check for illegal shuffle mask element index values.
17613 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17614 (void)MaskUpperLimit;
17615 assert(llvm::all_of(OrigMask,
17616 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17617 "Out of bounds shuffle index");
17618
17619 // We actually see shuffles that are entirely re-arrangements of a set of
17620 // zero inputs. This mostly happens while decomposing complex shuffles into
17621 // simple ones. Directly lower these as a buildvector of zeros.
17622 APInt KnownUndef, KnownZero;
17623 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17624
17625 APInt Zeroable = KnownUndef | KnownZero;
17626 if (Zeroable.isAllOnes())
17627 return getZeroVector(VT, Subtarget, DAG, DL);
17628
17629 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
17630
17631 // Try to collapse shuffles into using a vector type with fewer elements but
17632 // wider element types. We cap this to not form integers or floating point
17633 // elements wider than 64 bits. It does not seem beneficial to form i128
17634 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
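// For example, a v4i32 shuffle with mask <0,1,4,5> can instead be performed as
// a v2i64 shuffle with mask <0,2>.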
17635 SmallVector<int, 16> WidenedMask;
17636 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17637 !canCombineAsMaskOperation(V1, Subtarget) &&
17638 !canCombineAsMaskOperation(V2, Subtarget) &&
17639 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
17640 // Shuffle mask widening should not interfere with a broadcast opportunity
17641 // by obfuscating the operands with bitcasts.
17642 // TODO: Avoid lowering directly from this top-level function: make this
17643 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17644 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
17645 Subtarget, DAG))
17646 return Broadcast;
17647
17648 MVT NewEltVT = VT.isFloatingPoint()
17649 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
17650 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
17651 int NewNumElts = NumElements / 2;
17652 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
17653 // Make sure that the new vector type is legal. For example, v2f64 isn't
17654 // legal on SSE1.
17655 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
17656 if (V2IsZero) {
17657 // Modify the new Mask to take all zeros from the all-zero vector.
17658 // Choose indices that are blend-friendly.
17659 bool UsedZeroVector = false;
17660 assert(is_contained(WidenedMask, SM_SentinelZero) &&
17661 "V2's non-undef elements are used?!");
17662 for (int i = 0; i != NewNumElts; ++i)
17663 if (WidenedMask[i] == SM_SentinelZero) {
17664 WidenedMask[i] = i + NewNumElts;
17665 UsedZeroVector = true;
17666 }
17667 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
17668 // some elements to be undef.
17669 if (UsedZeroVector)
17670 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
17671 }
17672 V1 = DAG.getBitcast(NewVT, V1);
17673 V2 = DAG.getBitcast(NewVT, V2);
17674 return DAG.getBitcast(
17675 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
17676 }
17677 }
17678
17679 SmallVector<SDValue> Ops = {V1, V2};
17680 SmallVector<int> Mask(OrigMask);
17681
17682 // Canonicalize the shuffle with any horizontal ops inputs.
17683 // NOTE: This may update Ops and Mask.
17684 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
17685 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
17686 return DAG.getBitcast(VT, HOp);
17687
17688 V1 = DAG.getBitcast(VT, Ops[0]);
17689 V2 = DAG.getBitcast(VT, Ops[1]);
17690 assert(NumElements == (int)Mask.size() &&
17691 "canonicalizeShuffleMaskWithHorizOp "
17692 "shouldn't alter the shuffle mask size");
17693
17694 // Commute the shuffle if it will improve canonicalization.
17695 if (canonicalizeShuffleMaskWithCommute(Mask)) {
17696 ShuffleVectorSDNode::commuteMask(Mask);
17697 std::swap(V1, V2);
17698 }
17699
17700 // For each vector width, delegate to a specialized lowering routine.
17701 if (VT.is128BitVector())
17702 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17703
17704 if (VT.is256BitVector())
17705 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17706
17707 if (VT.is512BitVector())
17708 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17709
17710 if (Is1BitVector)
17711 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17712
17713 llvm_unreachable("Unimplemented!");
17714}
17715
17716/// Try to lower a VSELECT instruction to a vector shuffle.
17717 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
17718 const X86Subtarget &Subtarget,
17719 SelectionDAG &DAG) {
17720 SDValue Cond = Op.getOperand(0);
17721 SDValue LHS = Op.getOperand(1);
17722 SDValue RHS = Op.getOperand(2);
17723 MVT VT = Op.getSimpleValueType();
17724
17725 // Only non-legal VSELECTs reach this lowering, convert those into generic
17726 // shuffles and re-use the shuffle lowering path for blends.
17727 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
17728 SmallVector<int, 32> Mask;
17729 if (createShuffleMaskFromVSELECT(Mask, Cond))
17730 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
17731 }
17732
17733 return SDValue();
17734}
17735
17736SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
17737 SDValue Cond = Op.getOperand(0);
17738 SDValue LHS = Op.getOperand(1);
17739 SDValue RHS = Op.getOperand(2);
17740
17741 SDLoc dl(Op);
17742 MVT VT = Op.getSimpleValueType();
17743 if (isSoftF16(VT, Subtarget)) {
17744 MVT NVT = VT.changeVectorElementTypeToInteger();
17745 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
17746 DAG.getBitcast(NVT, LHS),
17747 DAG.getBitcast(NVT, RHS)));
17748 }
17749
17750 // A vselect where all conditions and data are constants can be optimized into
17751 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
17752 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
17753 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
17754 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
17755 return SDValue();
17756
17757 // Try to lower this to a blend-style vector shuffle. This can handle all
17758 // constant condition cases.
17759 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
17760 return BlendOp;
17761
17762 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
17763 // with patterns on the mask registers on AVX-512.
17764 MVT CondVT = Cond.getSimpleValueType();
17765 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
17766 if (CondEltSize == 1)
17767 return Op;
17768
17769 // Variable blends are only legal from SSE4.1 onward.
17770 if (!Subtarget.hasSSE41())
17771 return SDValue();
17772
17773 unsigned EltSize = VT.getScalarSizeInBits();
17774 unsigned NumElts = VT.getVectorNumElements();
17775
17776 // Expand v32i16/v64i8 without BWI.
17777 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
17778 return SDValue();
17779
17780 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
17781 // into an i1 condition so that we can use the mask-based 512-bit blend
17782 // instructions.
17783 if (VT.getSizeInBits() == 512) {
17784 // Build a mask by testing the condition against zero.
17785 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
17786 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
17787 DAG.getConstant(0, dl, CondVT),
17788 ISD::SETNE);
17789 // Now return a new VSELECT using the mask.
17790 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
17791 }
17792
17793 // SEXT/TRUNC cases where the mask doesn't match the destination size.
17794 if (CondEltSize != EltSize) {
17795 // If we don't have a sign splat, rely on the expansion.
17796 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
17797 return SDValue();
17798
17799 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
17800 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
17801 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
17802 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
17803 }
17804
17805 // Only some types will be legal on some subtargets. If we can emit a legal
17806 // VSELECT-matching blend, return Op, but if we need to expand, return
17807 // a null value.
17808 switch (VT.SimpleTy) {
17809 default:
17810 // Most of the vector types have blends past SSE4.1.
17811 return Op;
17812
17813 case MVT::v32i8:
17814 // The byte blends for AVX vectors were introduced only in AVX2.
17815 if (Subtarget.hasAVX2())
17816 return Op;
17817
17818 return SDValue();
17819
17820 case MVT::v8i16:
17821 case MVT::v16i16: {
17822 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
17823 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
17824 Cond = DAG.getBitcast(CastVT, Cond);
17825 LHS = DAG.getBitcast(CastVT, LHS);
17826 RHS = DAG.getBitcast(CastVT, RHS);
17827 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
17828 return DAG.getBitcast(VT, Select);
17829 }
17830 }
17831}
17832
17833 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
17834 MVT VT = Op.getSimpleValueType();
17835 SDValue Vec = Op.getOperand(0);
17836 SDValue Idx = Op.getOperand(1);
17837 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
17838 SDLoc dl(Op);
17839
17840 if (!Vec.getSimpleValueType().is128BitVector())
17841 return SDValue();
17842
17843 if (VT.getSizeInBits() == 8) {
17844 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
17845 // we're going to zero extend the register or fold the store.
17846 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
17847 !X86::mayFoldIntoStore(Op))
17848 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
17849 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17850 DAG.getBitcast(MVT::v4i32, Vec), Idx));
17851
17852 unsigned IdxVal = Idx->getAsZExtVal();
17853 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
17854 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17855 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17856 }
17857
17858 if (VT == MVT::f32) {
17859 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
17860 // the result back to FR32 register. It's only worth matching if the
17861 // result has a single use which is a store or a bitcast to i32. And in
17862 // the case of a store, it's not worth it if the index is a constant 0,
17863 // because a MOVSSmr can be used instead, which is smaller and faster.
17864 if (!Op.hasOneUse())
17865 return SDValue();
17866 SDNode *User = *Op.getNode()->use_begin();
17867 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
17868 (User->getOpcode() != ISD::BITCAST ||
17869 User->getValueType(0) != MVT::i32))
17870 return SDValue();
17871 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17872 DAG.getBitcast(MVT::v4i32, Vec), Idx);
17873 return DAG.getBitcast(MVT::f32, Extract);
17874 }
17875
17876 if (VT == MVT::i32 || VT == MVT::i64)
17877 return Op;
17878
17879 return SDValue();
17880}
17881
17882/// Extract one bit from mask vector, like v16i1 or v8i1.
17883/// AVX-512 feature.
17884 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
17885 const X86Subtarget &Subtarget) {
17886 SDValue Vec = Op.getOperand(0);
17887 SDLoc dl(Vec);
17888 MVT VecVT = Vec.getSimpleValueType();
17889 SDValue Idx = Op.getOperand(1);
17890 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
17891 MVT EltVT = Op.getSimpleValueType();
17892
17893 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
17894 "Unexpected vector type in ExtractBitFromMaskVector");
17895
17896 // A variable index can't be handled in mask registers;
17897 // extend the vector to VR512/VR128.
17898 if (!IdxC) {
17899 unsigned NumElts = VecVT.getVectorNumElements();
17900 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
17901 // than extending to 128/256-bit.
17902 if (NumElts == 1) {
17903 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
17904 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
17905 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
17906 }
17907 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
17908 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
17909 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
17910 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
17911 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
17912 }
17913
17914 unsigned IdxVal = IdxC->getZExtValue();
17915 if (IdxVal == 0) // the operation is legal
17916 return Op;
17917
17918 // Extend to natively supported kshift.
17919 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
17920
17921 // Use kshiftr instruction to move to the lower element.
17922 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
17923 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17924
17925 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
17926 DAG.getIntPtrConstant(0, dl));
17927}
17928
17929// Helper to find all the extracted elements from a vector.
17930 static APInt getExtractedDemandedElts(SDNode *N) {
17931 MVT VT = N->getSimpleValueType(0);
17932 unsigned NumElts = VT.getVectorNumElements();
17933 APInt DemandedElts = APInt::getZero(NumElts);
17934 for (SDNode *User : N->uses()) {
17935 switch (User->getOpcode()) {
17936 case X86ISD::PEXTRB:
17937 case X86ISD::PEXTRW:
17938 case ISD::EXTRACT_VECTOR_ELT:
17939 if (!isa<ConstantSDNode>(User->getOperand(1))) {
17940 DemandedElts.setAllBits();
17941 return DemandedElts;
17942 }
17943 DemandedElts.setBit(User->getConstantOperandVal(1));
17944 break;
17945 case ISD::BITCAST: {
17946 if (!User->getValueType(0).isSimple() ||
17947 !User->getValueType(0).isVector()) {
17948 DemandedElts.setAllBits();
17949 return DemandedElts;
17950 }
17951 APInt DemandedSrcElts = getExtractedDemandedElts(User);
17952 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
17953 break;
17954 }
17955 default:
17956 DemandedElts.setAllBits();
17957 return DemandedElts;
17958 }
17959 }
17960 return DemandedElts;
17961}
17962
17963SDValue
17964X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
17965 SelectionDAG &DAG) const {
17966 SDLoc dl(Op);
17967 SDValue Vec = Op.getOperand(0);
17968 MVT VecVT = Vec.getSimpleValueType();
17969 SDValue Idx = Op.getOperand(1);
17970 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
17971
17972 if (VecVT.getVectorElementType() == MVT::i1)
17973 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
17974
17975 if (!IdxC) {
17976 // It's more profitable to go through memory (1 cycle throughput)
17977 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
17978 // The IACA tool was used to get the performance estimate
17979 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
17980 //
17981 // example : extractelement <16 x i8> %a, i32 %i
17982 //
17983 // Block Throughput: 3.00 Cycles
17984 // Throughput Bottleneck: Port5
17985 //
17986 // | Num Of | Ports pressure in cycles | |
17987 // | Uops | 0 - DV | 5 | 6 | 7 | |
17988 // ---------------------------------------------
17989 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
17990 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
17991 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
17992 // Total Num Of Uops: 4
17993 //
17994 //
17995 // Block Throughput: 1.00 Cycles
17996 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
17997 //
17998 // | | Ports pressure in cycles | |
17999 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18000 // ---------------------------------------------------------
18001 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18002 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18003 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18004 // Total Num Of Uops: 4
18005
18006 return SDValue();
18007 }
18008
18009 unsigned IdxVal = IdxC->getZExtValue();
18010
18011 // If this is a 256-bit vector result, first extract the 128-bit vector and
18012 // then extract the element from the 128-bit vector.
18013 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18014 // Get the 128-bit vector.
18015 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18016 MVT EltVT = VecVT.getVectorElementType();
18017
18018 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18019 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18020
18021 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18022 // this can be done with a mask.
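// For example, extracting element 6 of a v8f32 extracts element 2 of the
// upper 128-bit chunk (ElemsPerChunk == 4).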
18023 IdxVal &= ElemsPerChunk - 1;
18024 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18025 DAG.getIntPtrConstant(IdxVal, dl));
18026 }
18027
18028 assert(VecVT.is128BitVector() && "Unexpected vector length");
18029
18030 MVT VT = Op.getSimpleValueType();
18031
18032 if (VT == MVT::i16) {
18033 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18034 // we're going to zero extend the register or fold the store (SSE41 only).
18035 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18036 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18037 if (Subtarget.hasFP16())
18038 return Op;
18039
18040 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18041 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18042 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18043 }
18044
18045 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18046 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18047 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18048 }
18049
18050 if (Subtarget.hasSSE41())
18051 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18052 return Res;
18053
18054 // Only extract a single element from a v16i8 source - determine the common
18055 // DWORD/WORD that all extractions share, and extract the sub-byte.
18056 // TODO: Add QWORD MOVQ extraction?
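// For example, if only byte 5 is demanded, we extract word 2 (bytes 4-5) and
// shift right by 8 to isolate the upper byte.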
18057 if (VT == MVT::i8) {
18058 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18059 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18060
18061 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18062 int DWordIdx = IdxVal / 4;
18063 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18064 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18065 DAG.getBitcast(MVT::v4i32, Vec),
18066 DAG.getIntPtrConstant(DWordIdx, dl));
18067 int ShiftVal = (IdxVal % 4) * 8;
18068 if (ShiftVal != 0)
18069 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18070 DAG.getConstant(ShiftVal, dl, MVT::i8));
18071 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18072 }
18073
18074 int WordIdx = IdxVal / 2;
18075 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18076 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18077 DAG.getBitcast(MVT::v8i16, Vec),
18078 DAG.getIntPtrConstant(WordIdx, dl));
18079 int ShiftVal = (IdxVal % 2) * 8;
18080 if (ShiftVal != 0)
18081 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18082 DAG.getConstant(ShiftVal, dl, MVT::i8));
18083 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18084 }
18085 }
18086
18087 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18088 if (IdxVal == 0)
18089 return Op;
18090
18091 // Shuffle the element to the lowest element, then movss or movsh.
18092 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18093 Mask[0] = static_cast<int>(IdxVal);
18094 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18095 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18096 DAG.getIntPtrConstant(0, dl));
18097 }
18098
18099 if (VT.getSizeInBits() == 64) {
18100 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18101 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18102 // to match extract_elt for f64.
18103 if (IdxVal == 0)
18104 return Op;
18105
18106 // UNPCKHPD the element to the lowest double word, then movsd.
18107 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18108 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18109 int Mask[2] = { 1, -1 };
18110 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18111 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18112 DAG.getIntPtrConstant(0, dl));
18113 }
18114
18115 return SDValue();
18116}
18117
18118/// Insert one bit to mask vector, like v16i1 or v8i1.
18119/// AVX-512 feature.
18120 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18121 const X86Subtarget &Subtarget) {
18122 SDLoc dl(Op);
18123 SDValue Vec = Op.getOperand(0);
18124 SDValue Elt = Op.getOperand(1);
18125 SDValue Idx = Op.getOperand(2);
18126 MVT VecVT = Vec.getSimpleValueType();
18127
18128 if (!isa<ConstantSDNode>(Idx)) {
18129 // Non-constant index. Extend the source and destination,
18130 // insert the element and then truncate the result.
18131 unsigned NumElts = VecVT.getVectorNumElements();
18132 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18133 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18134 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18135 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18136 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18137 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18138 }
18139
18140 // Copy into a k-register, extract to v1i1 and insert_subvector.
18141 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18142 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18143}
18144
18145SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18146 SelectionDAG &DAG) const {
18147 MVT VT = Op.getSimpleValueType();
18148 MVT EltVT = VT.getVectorElementType();
18149 unsigned NumElts = VT.getVectorNumElements();
18150 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18151
18152 if (EltVT == MVT::i1)
18153 return InsertBitToMaskVector(Op, DAG, Subtarget);
18154
18155 SDLoc dl(Op);
18156 SDValue N0 = Op.getOperand(0);
18157 SDValue N1 = Op.getOperand(1);
18158 SDValue N2 = Op.getOperand(2);
18159 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18160
18161 if (EltVT == MVT::bf16) {
18162 MVT IVT = VT.changeVectorElementTypeToInteger();
18163 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18164 DAG.getBitcast(IVT, N0),
18165 DAG.getBitcast(MVT::i16, N1), N2);
18166 return DAG.getBitcast(VT, Res);
18167 }
18168
18169 if (!N2C) {
18170 // Variable insertion indices, usually we're better off spilling to stack,
18171 // but AVX512 can use a variable compare+select by comparing against all
18172 // possible vector indices, and FP insertion has less gpr->simd traffic.
18173 if (!(Subtarget.hasBWI() ||
18174 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18175 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18176 return SDValue();
18177
18178 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18179 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18180 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18181 return SDValue();
18182
18183 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18184 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18185 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18186
18187 SmallVector<SDValue, 16> RawIndices;
18188 for (unsigned I = 0; I != NumElts; ++I)
18189 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18190 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18191
18192 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18193 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18194 ISD::CondCode::SETEQ);
18195 }
18196
18197 if (N2C->getAPIntValue().uge(NumElts))
18198 return SDValue();
18199 uint64_t IdxVal = N2C->getZExtValue();
18200
18201 bool IsZeroElt = X86::isZeroNode(N1);
18202 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18203
18204 if (IsZeroElt || IsAllOnesElt) {
18205 // Lower insertion of v16i8/v32i8/v64i16 -1 elts as an 'OR' blend.
18206 // We don't deal with i8 0 since it appears to be handled elsewhere.
18207 if (IsAllOnesElt &&
18208 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18209 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18210 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18211 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18212 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18213 CstVectorElts[IdxVal] = OnesCst;
18214 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18215 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18216 }
18217 // See if we can do this more efficiently with a blend shuffle with a
18218 // rematerializable vector.
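// For example, inserting 0 into element 2 of a v8i32 becomes a shuffle of N0
// with an all-zeros vector using the blend mask <0,1,10,3,4,5,6,7>.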
18219 if (Subtarget.hasSSE41() &&
18220 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18221 SmallVector<int, 8> BlendMask;
18222 for (unsigned i = 0; i != NumElts; ++i)
18223 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18224 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18225 : getOnesVector(VT, DAG, dl);
18226 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18227 }
18228 }
18229
18230 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18231 // into that, and then insert the subvector back into the result.
18232 if (VT.is256BitVector() || VT.is512BitVector()) {
18233 // With a 256-bit vector, we can insert into the zero element efficiently
18234 // using a blend if we have AVX or AVX2 and the right data type.
18235 if (VT.is256BitVector() && IdxVal == 0) {
18236 // TODO: It is worthwhile to cast integer to floating point and back
18237 // and incur a domain crossing penalty if that's what we'll end up
18238 // doing anyway after extracting to a 128-bit vector.
18239 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18240 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18241 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18242 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18243 DAG.getTargetConstant(1, dl, MVT::i8));
18244 }
18245 }
18246
18247 unsigned NumEltsIn128 = 128 / EltSizeInBits;
18248 assert(isPowerOf2_32(NumEltsIn128) &&
18249 "Vectors will always have power-of-two number of elements.");
18250
18251 // If we are not inserting into the low 128-bit vector chunk,
18252 // then prefer the broadcast+blend sequence.
18253 // FIXME: relax the profitability check iff all N1 uses are insertions.
18254 if (IdxVal >= NumEltsIn128 &&
18255 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
18256 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
18257 X86::mayFoldLoad(N1, Subtarget)))) {
18258 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
18259 SmallVector<int, 8> BlendMask;
18260 for (unsigned i = 0; i != NumElts; ++i)
18261 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18262 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
18263 }
18264
18265 // Get the desired 128-bit vector chunk.
18266 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18267
18268 // Insert the element into the desired chunk.
18269 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18270 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18271
18272 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18273 DAG.getIntPtrConstant(IdxIn128, dl));
18274
18275 // Insert the changed part back into the bigger vector
18276 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18277 }
18278 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18279
18280 // This will be just movw/movd/movq/movsh/movss/movsd.
18281 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18282 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18283 EltVT == MVT::f16 || EltVT == MVT::i64) {
18284 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18285 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18286 }
18287
18288 // We can't directly insert an i8 or i16 into a vector, so zero extend
18289 // it to i32 first.
18290 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18291 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18292 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
18293 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18294 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18295 return DAG.getBitcast(VT, N1);
18296 }
18297 }
18298
18299 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
18300 // argument. SSE41 required for pinsrb.
18301 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18302 unsigned Opc;
18303 if (VT == MVT::v8i16) {
18304 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18305 Opc = X86ISD::PINSRW;
18306 } else {
18307 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18308 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18309 Opc = X86ISD::PINSRB;
18310 }
18311
18312 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18313 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18314 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
18315 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18316 }
18317
18318 if (Subtarget.hasSSE41()) {
18319 if (EltVT == MVT::f32) {
18320 // Bits [7:6] of the constant are the source select. This will always be
18321 // zero here. The DAG Combiner may combine an extract_elt index into
18322 // these bits. For example (insert (extract, 3), 2) could be matched by
18323 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18324 // Bits [5:4] of the constant are the destination select. This is the
18325 // value of the incoming immediate.
18326 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18327 // combine either bitwise AND or insert of float 0.0 to set these bits.
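// For example, inserting into element 2 of a v4f32 uses the immediate 0x20:
// IdxVal << 4 places the destination select in bits [5:4].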
18328
18329 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18330 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
18331 // If this is an insertion of 32-bits into the low 32-bits of
18332 // a vector, we prefer to generate a blend with immediate rather
18333 // than an insertps. Blends are simpler operations in hardware and so
18334 // will always have equal or better performance than insertps.
18335 // But if optimizing for size and there's a load folding opportunity,
18336 // generate insertps because blendps does not have a 32-bit memory
18337 // operand form.
18338 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18339 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18340 DAG.getTargetConstant(1, dl, MVT::i8));
18341 }
18342 // Create this as a scalar to vector.
18343 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18344 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18345 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18346 }
18347
18348 // PINSR* works with constant index.
18349 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18350 return Op;
18351 }
18352
18353 return SDValue();
18354}
18355
18356 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
18357 SelectionDAG &DAG) {
18358 SDLoc dl(Op);
18359 MVT OpVT = Op.getSimpleValueType();
18360
18361 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
18362 // further combines.
18363 if (X86::isZeroNode(Op.getOperand(0)))
18364 return getZeroVector(OpVT, Subtarget, DAG, dl);
18365
18366 // If this is a 256-bit vector result, first insert into a 128-bit
18367 // vector and then insert into the 256-bit vector.
18368 if (!OpVT.is128BitVector()) {
18369 // Insert into a 128-bit vector.
18370 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18371 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
18372 OpVT.getVectorNumElements() / SizeFactor);
18373
18374 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18375
18376 // Insert the 128-bit vector.
18377 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18378 }
18379 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18380 "Expected an SSE type!");
18381
18382 // Pass through a v4i32 or V8i16 SCALAR_TO_VECTOR as that's what we use in
18383 // tblgen.
18384 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
18385 return Op;
18386
18387 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18388 return DAG.getBitcast(
18389 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18390}
18391
18392// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18393// simple superregister reference or explicit instructions to insert
18394// the upper bits of a vector.
18395 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18396 SelectionDAG &DAG) {
18397 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18398
18399 return insert1BitVector(Op, DAG, Subtarget);
18400}
18401
18402 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18403 SelectionDAG &DAG) {
18404 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18405 "Only vXi1 extract_subvectors need custom lowering");
18406
18407 SDLoc dl(Op);
18408 SDValue Vec = Op.getOperand(0);
18409 uint64_t IdxVal = Op.getConstantOperandVal(1);
18410
18411 if (IdxVal == 0) // the operation is legal
18412 return Op;
18413
18414 // Extend to natively supported kshift.
18415 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18416
18417 // Shift to the LSB.
18418 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18419 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18420
18421 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18422 DAG.getIntPtrConstant(0, dl));
18423}
18424
18425// Returns the appropriate wrapper opcode for a global reference.
18426unsigned X86TargetLowering::getGlobalWrapperKind(
18427 const GlobalValue *GV, const unsigned char OpFlags) const {
18428 // References to absolute symbols are never PC-relative.
18429 if (GV && GV->isAbsoluteSymbolRef())
18430 return X86ISD::Wrapper;
18431
18432 // The following OpFlags under RIP-rel PIC use RIP.
18433 if (Subtarget.isPICStyleRIPRel() &&
18434 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
18435 OpFlags == X86II::MO_DLLIMPORT))
18436 return X86ISD::WrapperRIP;
18437
18438 // GOTPCREL references must always use RIP.
18439 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
18440 return X86ISD::WrapperRIP;
18441
18442 return X86ISD::Wrapper;
18443}
18444
18445// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18446// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
18447 // one of the above-mentioned nodes. It has to be wrapped because otherwise
18448 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18449 // be used to form an addressing mode. These wrapped nodes will be selected
18450// into MOV32ri.
18451SDValue
18452X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18453 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18454
18455 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18456 // global base reg.
18457 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18458
18459 auto PtrVT = getPointerTy(DAG.getDataLayout());
18460 SDValue Result = DAG.getTargetConstantPool(
18461 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18462 SDLoc DL(CP);
18463 Result =
18464 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18465 // With PIC, the address is actually $g + Offset.
18466 if (OpFlag) {
18467 Result =
18468 DAG.getNode(ISD::ADD, DL, PtrVT,
18469 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18470 }
18471
18472 return Result;
18473}
18474
18475SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18476 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18477
18478 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18479 // global base reg.
18480 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18481
18482 auto PtrVT = getPointerTy(DAG.getDataLayout());
18483 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18484 SDLoc DL(JT);
18485 Result =
18486 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18487
18488 // With PIC, the address is actually $g + Offset.
18489 if (OpFlag)
18490 Result =
18491 DAG.getNode(ISD::ADD, DL, PtrVT,
18492 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18493
18494 return Result;
18495}
18496
18497SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
18498 SelectionDAG &DAG) const {
18499 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18500}
18501
18502SDValue
18503X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
18504 // Create the TargetBlockAddressAddress node.
18505 unsigned char OpFlags =
18506 Subtarget.classifyBlockAddressReference();
18507 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18508 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18509 SDLoc dl(Op);
18510 auto PtrVT = getPointerTy(DAG.getDataLayout());
18511 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
18512 Result =
18513 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
18514
18515 // With PIC, the address is actually $g + Offset.
18516 if (isGlobalRelativeToPICBase(OpFlags)) {
18517 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18518 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18519 }
18520
18521 return Result;
18522}
18523
18524/// Creates target global address or external symbol nodes for calls or
18525/// other uses.
18526SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
18527 bool ForCall) const {
18528 // Unpack the global address or external symbol.
18529 const SDLoc &dl = SDLoc(Op);
18530 const GlobalValue *GV = nullptr;
18531 int64_t Offset = 0;
18532 const char *ExternalSym = nullptr;
18533 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18534 GV = G->getGlobal();
18535 Offset = G->getOffset();
18536 } else {
18537 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18538 ExternalSym = ES->getSymbol();
18539 }
18540
18541 // Calculate some flags for address lowering.
18542 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
18543 unsigned char OpFlags;
18544 if (ForCall)
18545 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
18546 else
18547 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
18548 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
18549 bool NeedsLoad = isGlobalStubReference(OpFlags);
18550
18551 CodeModel::Model M = DAG.getTarget().getCodeModel();
18552 auto PtrVT = getPointerTy(DAG.getDataLayout());
18553 SDValue Result;
18554
18555 if (GV) {
18556 // Create a target global address if this is a global. If possible, fold the
18557 // offset into the global address reference. Otherwise, ADD it on later.
18558 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
18559 // allowed because if the address of foo is 0, the ELF R_X86_64_32
18560 // relocation will compute to a negative value, which is invalid.
18561 int64_t GlobalOffset = 0;
18562 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
18563 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
18564 std::swap(GlobalOffset, Offset);
18565 }
18566 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
18567 } else {
18568 // If this is not a global address, this must be an external symbol.
18569 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
18570 }
18571
18572 // If this is a direct call, avoid the wrapper if we don't need to do any
18573 // loads or adds. This allows SDAG ISel to match direct calls.
18574 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
18575 return Result;
18576
18577 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
18578
18579 // With PIC, the address is actually $g + Offset.
18580 if (HasPICReg) {
18581 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18582 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18583 }
18584
18585 // For globals that require a load from a stub to get the address, emit the
18586 // load.
18587 if (NeedsLoad)
18588 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
18589 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18590
18591 // If there was a non-zero offset that we didn't fold, create an explicit
18592 // addition for it.
18593 if (Offset != 0)
18594 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
18595 DAG.getConstant(Offset, dl, PtrVT));
18596
18597 return Result;
18598}
18599
18600SDValue
18601X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
18602 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18603}
18604
18605static SDValue
18606 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
18607 SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
18608 unsigned char OperandFlags, bool LocalDynamic = false) {
18609 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18610 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18611 SDLoc dl(GA);
18612 SDValue TGA;
18613 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
18614 if (LocalDynamic && UseTLSDESC) {
18615 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
18616 auto UI = TGA->use_begin();
18617 // Reuse existing GetTLSADDR node if we can find it.
18618 if (UI != TGA->use_end())
18619 return SDValue(*UI->use_begin()->use_begin(), 0);
18620 } else {
18621 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18622 GA->getOffset(), OperandFlags);
18623 }
18624
18625 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
18626 : LocalDynamic ? X86ISD::TLSBASEADDR
18627 : X86ISD::TLSADDR;
18628
18629 if (InGlue) {
18630 SDValue Ops[] = { Chain, TGA, *InGlue };
18631 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18632 } else {
18633 SDValue Ops[] = { Chain, TGA };
18634 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18635 }
18636
18637 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
18638 MFI.setAdjustsStack(true);
18639 MFI.setHasCalls(true);
18640
18641 SDValue Glue = Chain.getValue(1);
18642 SDValue Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
18643
18644 if (!UseTLSDESC)
18645 return Ret;
18646
18647 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
18648 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
18649
18650 Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
18651 SDValue Offset =
18652 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18653 MachinePointerInfo(Ptr));
18654 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
18655}
18656
18657// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
18658static SDValue
18659LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18660 const EVT PtrVT) {
18661 SDValue InGlue;
18662 SDLoc dl(GA); // ? function entry point might be better
18663 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18664 DAG.getNode(X86ISD::GlobalBaseReg,
18665 SDLoc(), PtrVT), InGlue);
18666 InGlue = Chain.getValue(1);
18667
18668 return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
18669}
18670
18671// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
18672static SDValue
18673LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18674 const EVT PtrVT) {
18675 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18676 X86::RAX, X86II::MO_TLSGD);
18677}
18678
18679// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
18680static SDValue
18681LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18682 const EVT PtrVT) {
18683 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18684 X86::EAX, X86II::MO_TLSGD);
18685}
18686
18687static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
18688 SelectionDAG &DAG, const EVT PtrVT,
18689 bool Is64Bit, bool Is64BitLP64) {
18690 SDLoc dl(GA);
18691
18692 // Get the start address of the TLS block for this module.
18693 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
18694 .getInfo<X86MachineFunctionInfo>();
18695 MFI->incNumLocalDynamicTLSAccesses();
18696
18697 SDValue Base;
18698 if (Is64Bit) {
18699 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
18700 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
18701 X86II::MO_TLSLD, /*LocalDynamic=*/true);
18702 } else {
18703 SDValue InGlue;
18704 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18705 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
18706 InGlue = Chain.getValue(1);
18707 Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
18708 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
18709 }
18710
18711 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
18712 // of Base.
18713
18714 // Build x@dtpoff.
18715 unsigned char OperandFlags = X86II::MO_DTPOFF;
18716 unsigned WrapperKind = X86ISD::Wrapper;
18717 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18718 GA->getValueType(0),
18719 GA->getOffset(), OperandFlags);
18720 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18721
18722 // Add x@dtpoff with the base.
18723 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
18724}
18725
18726// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
18727static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18728 const EVT PtrVT, TLSModel::Model model,
18729 bool is64Bit, bool isPIC) {
18730 SDLoc dl(GA);
18731
18732 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
18733 Value *Ptr = Constant::getNullValue(
18734 PointerType::get(*DAG.getContext(), is64Bit ? 257 : 256));
18735
18736 SDValue ThreadPointer =
18737 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18738 MachinePointerInfo(Ptr));
18739
18740 unsigned char OperandFlags = 0;
18741 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
18742 // initialexec.
18743 unsigned WrapperKind = X86ISD::Wrapper;
18744 if (model == TLSModel::LocalExec) {
18745 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
18746 } else if (model == TLSModel::InitialExec) {
18747 if (is64Bit) {
18748 OperandFlags = X86II::MO_GOTTPOFF;
18749 WrapperKind = X86ISD::WrapperRIP;
18750 } else {
18751 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
18752 }
18753 } else {
18754 llvm_unreachable("Unexpected model");
18755 }
18756
18757 // emit "addl x@ntpoff,%eax" (local exec)
18758 // or "addl x@indntpoff,%eax" (initial exec)
18759 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
18760 SDValue TGA =
18761 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18762 GA->getOffset(), OperandFlags);
18763 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18764
18765 if (model == TLSModel::InitialExec) {
18766 if (isPIC && !is64Bit) {
18767 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
18768 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18769 Offset);
18770 }
18771
18772 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
18773 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18774 }
18775
18776 // The address of the thread local variable is the add of the thread
18777 // pointer with the offset of the variable.
18778 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
18779}
18780
18781SDValue
18782X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
18783
18784 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
18785
18786 if (DAG.getTarget().useEmulatedTLS())
18787 return LowerToTLSEmulatedModel(GA, DAG);
18788
18789 const GlobalValue *GV = GA->getGlobal();
18790 auto PtrVT = getPointerTy(DAG.getDataLayout());
18791 bool PositionIndependent = isPositionIndependent();
18792
18793 if (Subtarget.isTargetELF()) {
18794 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
18795 switch (model) {
18796 case TLSModel::GeneralDynamic:
18797 if (Subtarget.is64Bit()) {
18798 if (Subtarget.isTarget64BitLP64())
18799 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
18800 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
18801 }
18802 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
18803 case TLSModel::LocalDynamic:
18804 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
18805 Subtarget.isTarget64BitLP64());
18806 case TLSModel::InitialExec:
18807 case TLSModel::LocalExec:
18808 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
18809 PositionIndependent);
18810 }
18811 llvm_unreachable("Unknown TLS model.");
18812 }
18813
18814 if (Subtarget.isTargetDarwin()) {
18815 // Darwin only has one model of TLS. Lower to that.
18816 unsigned char OpFlag = 0;
18817 unsigned WrapperKind = 0;
18818
18819 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18820 // global base reg.
18821 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
18822 if (PIC32) {
18823 OpFlag = X86II::MO_TLVP_PIC_BASE;
18824 WrapperKind = X86ISD::Wrapper;
18825 } else {
18826 OpFlag = X86II::MO_TLVP;
18827 WrapperKind = X86ISD::WrapperRIP;
18828 }
18829 SDLoc DL(Op);
18830 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
18831 GA->getValueType(0),
18832 GA->getOffset(), OpFlag);
18833 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
18834
18835 // With PIC32, the address is actually $g + Offset.
18836 if (PIC32)
18837 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
18838 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18839 Offset);
18840
18841 // Lowering the machine isd will make sure everything is in the right
18842 // location.
18843 SDValue Chain = DAG.getEntryNode();
18844 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18845 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
18846 SDValue Args[] = { Chain, Offset };
18847 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
18848 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
18849
18850 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
18851 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18852 MFI.setAdjustsStack(true);
18853
18854 // And our return value (tls address) is in the standard call return value
18855 // location.
18856 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
18857 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
18858 }
18859
18860 if (Subtarget.isOSWindows()) {
18861 // Just use the implicit TLS architecture
18862 // Need to generate something similar to:
18863 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
18864 // ; from TEB
18865 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
18866 // mov rcx, qword [rdx+rcx*8]
18867 // mov eax, .tls$:tlsvar
18868 // [rax+rcx] contains the address
18869 // Windows 64bit: gs:0x58
18870 // Windows 32bit: fs:__tls_array
18871
18872 SDLoc dl(GA);
18873 SDValue Chain = DAG.getEntryNode();
18874
18875 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
18876 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
18877 // use its literal value of 0x2C.
18878 Value *Ptr = Constant::getNullValue(
18879 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), 256)
18880 : PointerType::get(*DAG.getContext(), 257));
18881
18882 SDValue TlsArray = Subtarget.is64Bit()
18883 ? DAG.getIntPtrConstant(0x58, dl)
18884 : (Subtarget.isTargetWindowsGNU()
18885 ? DAG.getIntPtrConstant(0x2C, dl)
18886 : DAG.getExternalSymbol("_tls_array", PtrVT));
18887
18888 SDValue ThreadPointer =
18889 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
18890
18891 SDValue res;
18892 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
18893 res = ThreadPointer;
18894 } else {
18895 // Load the _tls_index variable
18896 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
18897 if (Subtarget.is64Bit())
18898 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
18899 MachinePointerInfo(), MVT::i32);
18900 else
18901 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
18902
18903 const DataLayout &DL = DAG.getDataLayout();
18904 SDValue Scale =
18905 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
18906 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
18907
18908 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
18909 }
18910
18911 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
18912
18913 // Get the offset of start of .tls section
18914 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18915 GA->getValueType(0),
18916 X86II::MO_SECREL);
18917 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
18918
18919 // The address of the thread local variable is the add of the thread
18920 // pointer with the offset of the variable.
18921 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
18922 }
18923
18924 llvm_unreachable("TLS not implemented for this target.");
18925}
18926
18927bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
18928 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
18929 const TargetMachine &TM = getTargetMachine();
18930 TLSModel::Model Model = TM.getTLSModel(&GV);
18931 switch (Model) {
18932 case TLSModel::LocalExec:
18933 case TLSModel::InitialExec:
18934 // We can include the %fs segment register in addressing modes.
18935 return true;
18936 case TLSModel::LocalDynamic:
18937 case TLSModel::GeneralDynamic:
18938 // These models do not result in %fs relative addresses unless
18939 // TLS descriptors are used.
18940 //
18941 // Even in the case of TLS descriptors we currently have no way to model
18942 // the difference between the %fs access and the computation needed for
18943 // the offset, and returning `true` for TLS-desc currently duplicates
18944 // both, which is detrimental :-/
18945 return false;
18946 }
18947 }
18948 return false;
18949}
18950
18951/// Lower SRA_PARTS and friends, which return two i32 values
18952/// and take a 2 x i32 value to shift plus a shift amount.
18953/// TODO: Can this be moved to general expansion code?
18954static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
18955 SDValue Lo, Hi;
18956 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
18957 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
18958}
18959
18960// Try to use a packed vector operation to handle i64 on 32-bit targets when
18961// AVX512DQ is enabled.
18962static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
18963 SelectionDAG &DAG,
18964 const X86Subtarget &Subtarget) {
18965 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
18966 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
18967 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
18968 Op.getOpcode() == ISD::UINT_TO_FP) &&
18969 "Unexpected opcode!");
18970 bool IsStrict = Op->isStrictFPOpcode();
18971 unsigned OpNo = IsStrict ? 1 : 0;
18972 SDValue Src = Op.getOperand(OpNo);
18973 MVT SrcVT = Src.getSimpleValueType();
18974 MVT VT = Op.getSimpleValueType();
18975
18976 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
18977 (VT != MVT::f32 && VT != MVT::f64))
18978 return SDValue();
18979
18980 // Pack the i64 into a vector, do the operation and extract.
18981
18982 // Using 256-bit to ensure result is 128-bits for f32 case.
18983 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
18984 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
18985 MVT VecVT = MVT::getVectorVT(VT, NumElts);
18986
18987 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
18988 if (IsStrict) {
18989 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
18990 {Op.getOperand(0), InVec});
18991 SDValue Chain = CvtVec.getValue(1);
18992 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
18993 DAG.getIntPtrConstant(0, dl));
18994 return DAG.getMergeValues({Value, Chain}, dl);
18995 }
18996
18997 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
18998
18999 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19000 DAG.getIntPtrConstant(0, dl));
19001}
19002
19003// Try to use a packed vector operation to handle i64 on 32-bit targets.
19004static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19005 const X86Subtarget &Subtarget) {
19006 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19007 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19008 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19009 Op.getOpcode() == ISD::UINT_TO_FP) &&
19010 "Unexpected opcode!");
19011 bool IsStrict = Op->isStrictFPOpcode();
19012 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19013 MVT SrcVT = Src.getSimpleValueType();
19014 MVT VT = Op.getSimpleValueType();
19015
19016 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19017 return SDValue();
19018
19019 // Pack the i64 into a vector, do the operation and extract.
19020
19021 assert(Subtarget.hasFP16() && "Expected FP16");
19022
19023 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19024 if (IsStrict) {
19025 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19026 {Op.getOperand(0), InVec});
19027 SDValue Chain = CvtVec.getValue(1);
19028 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19029 DAG.getIntPtrConstant(0, dl));
19030 return DAG.getMergeValues({Value, Chain}, dl);
19031 }
19032
19033 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19034
19035 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19036 DAG.getIntPtrConstant(0, dl));
19037}
19038
19039static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19040 const X86Subtarget &Subtarget) {
19041 switch (Opcode) {
19042 case ISD::SINT_TO_FP:
19043 // TODO: Handle wider types with AVX/AVX512.
19044 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19045 return false;
19046 // CVTDQ2PS or (V)CVTDQ2PD
19047 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19048
19049 case ISD::UINT_TO_FP:
19050 // TODO: Handle wider types and i64 elements.
19051 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19052 return false;
19053 // VCVTUDQ2PS or VCVTUDQ2PD
19054 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19055
19056 default:
19057 return false;
19058 }
19059}
19060
19061/// Given a scalar cast operation that is extracted from a vector, try to
19062/// vectorize the cast op followed by extraction. This will avoid an expensive
19063/// round-trip between XMM and GPR.
19064static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19065 SelectionDAG &DAG,
19066 const X86Subtarget &Subtarget) {
19067 // TODO: This could be enhanced to handle smaller integer types by peeking
19068 // through an extend.
19069 SDValue Extract = Cast.getOperand(0);
19070 MVT DestVT = Cast.getSimpleValueType();
19071 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19072 !isa<ConstantSDNode>(Extract.getOperand(1)))
19073 return SDValue();
19074
19075 // See if we have a 128-bit vector cast op for this type of cast.
19076 SDValue VecOp = Extract.getOperand(0);
19077 MVT FromVT = VecOp.getSimpleValueType();
19078 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19079 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19080 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19081 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19082 return SDValue();
19083
19084 // If we are extracting from a non-zero element, first shuffle the source
19085 // vector to allow extracting from element zero.
19086 if (!isNullConstant(Extract.getOperand(1))) {
19087 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19088 Mask[0] = Extract.getConstantOperandVal(1);
19089 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19090 }
19091 // If the source vector is wider than 128-bits, extract the low part. Do not
19092 // create an unnecessarily wide vector cast op.
19093 if (FromVT != Vec128VT)
19094 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19095
19096 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19097 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19098 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19099 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19100 DAG.getIntPtrConstant(0, DL));
19101}
19102
19103/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19104/// try to vectorize the cast ops. This will avoid an expensive round-trip
19105/// between XMM and GPR.
19106static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19107 SelectionDAG &DAG,
19108 const X86Subtarget &Subtarget) {
19109 // TODO: Allow FP_TO_UINT.
19110 SDValue CastToInt = CastToFP.getOperand(0);
19111 MVT VT = CastToFP.getSimpleValueType();
19112 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19113 return SDValue();
19114
19115 MVT IntVT = CastToInt.getSimpleValueType();
19116 SDValue X = CastToInt.getOperand(0);
19117 MVT SrcVT = X.getSimpleValueType();
19118 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19119 return SDValue();
19120
19121 // See if we have 128-bit vector cast instructions for this type of cast.
19122 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19123 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19124 IntVT != MVT::i32)
19125 return SDValue();
19126
19127 unsigned SrcSize = SrcVT.getSizeInBits();
19128 unsigned IntSize = IntVT.getSizeInBits();
19129 unsigned VTSize = VT.getSizeInBits();
19130 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19131 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19132 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19133
19134 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19135 unsigned ToIntOpcode =
19136 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19137 unsigned ToFPOpcode =
19138 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19139
19140 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19141 //
19142 // We are not defining the high elements (by, for example, zeroing them) because
19143 // that could nullify any performance advantage that we hoped to gain from
19144 // this vector op hack. We do not expect any adverse effects (like denorm
19145 // penalties) with cast ops.
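// For example, scalar source like "(double)(int)x" becomes cvttpd2dq +
// cvtdq2pd (or cvttps2dq + cvtdq2ps for float), staying entirely in an XMM
// register instead of doing cvttsd2si to a GPR and cvtsi2sd back again.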
19146 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
19147 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19148 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19149 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19150 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19151}
19152
19153static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19154 SelectionDAG &DAG,
19155 const X86Subtarget &Subtarget) {
19156 bool IsStrict = Op->isStrictFPOpcode();
19157 MVT VT = Op->getSimpleValueType(0);
19158 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19159
19160 if (Subtarget.hasDQI()) {
19161 assert(!Subtarget.hasVLX() && "Unexpected features");
19162
19163 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19164 Src.getSimpleValueType() == MVT::v4i64) &&
19165 "Unsupported custom type");
19166
19167 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19168 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19169 "Unexpected VT!");
19170 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19171
19172 // Need to concat with zero vector for strict fp to avoid spurious
19173 // exceptions.
19174 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19175 : DAG.getUNDEF(MVT::v8i64);
19176 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19177 DAG.getIntPtrConstant(0, DL));
19178 SDValue Res, Chain;
19179 if (IsStrict) {
19180 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19181 {Op->getOperand(0), Src});
19182 Chain = Res.getValue(1);
19183 } else {
19184 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19185 }
19186
19187 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19188 DAG.getIntPtrConstant(0, DL));
19189
19190 if (IsStrict)
19191 return DAG.getMergeValues({Res, Chain}, DL);
19192 return Res;
19193 }
19194
19195 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19196 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19197 if (VT != MVT::v4f32 || IsSigned)
19198 return SDValue();
19199
19200 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19201 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19202 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19203 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19204 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19205 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19206 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19207 SmallVector<SDValue, 4> SignCvts(4);
19208 SmallVector<SDValue, 4> Chains(4);
19209 for (int i = 0; i != 4; ++i) {
19210 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19211 DAG.getIntPtrConstant(i, DL));
19212 if (IsStrict) {
19213 SignCvts[i] =
19214 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19215 {Op.getOperand(0), Elt});
19216 Chains[i] = SignCvts[i].getValue(1);
19217 } else {
19218 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19219 }
19220 }
19221 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19222
19223 SDValue Slow, Chain;
19224 if (IsStrict) {
19225 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
19226 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
19227 {Chain, SignCvt, SignCvt});
19228 Chain = Slow.getValue(1);
19229 } else {
19230 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
19231 }
19232
19233 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
19234 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
19235
19236 if (IsStrict)
19237 return DAG.getMergeValues({Cvt, Chain}, DL);
19238
19239 return Cvt;
19240}
19241
19242static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
19243 SelectionDAG &DAG) {
19244 bool IsStrict = Op->isStrictFPOpcode();
19245 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19246 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19247 MVT VT = Op.getSimpleValueType();
19248 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
19249
19250 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
19251 if (IsStrict)
19252 return DAG.getNode(
19253 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
19254 {Chain,
19255 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
19256 Rnd});
19257 return DAG.getNode(ISD::FP_ROUND, dl, VT,
19258 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
19259}
19260
19261static bool isLegalConversion(MVT VT, bool IsSigned,
19262 const X86Subtarget &Subtarget) {
19263 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
19264 return true;
19265 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
19266 return true;
19267 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
19268 return true;
19269 if (Subtarget.useAVX512Regs()) {
19270 if (VT == MVT::v16i32)
19271 return true;
19272 if (VT == MVT::v8i64 && Subtarget.hasDQI())
19273 return true;
19274 }
19275 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
19276 (VT == MVT::v2i64 || VT == MVT::v4i64))
19277 return true;
19278 return false;
19279}
19280
19281SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19282 SelectionDAG &DAG) const {
19283 bool IsStrict = Op->isStrictFPOpcode();
19284 unsigned OpNo = IsStrict ? 1 : 0;
19285 SDValue Src = Op.getOperand(OpNo);
19286 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19287 MVT SrcVT = Src.getSimpleValueType();
19288 MVT VT = Op.getSimpleValueType();
19289 SDLoc dl(Op);
19290
19291 if (isSoftF16(VT, Subtarget))
19292 return promoteXINT_TO_FP(Op, dl, DAG);
19293 else if (isLegalConversion(SrcVT, true, Subtarget))
19294 return Op;
19295
19296 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19297 return LowerWin64_INT128_TO_FP(Op, DAG);
19298
19299 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19300 return Extract;
19301
19302 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
19303 return R;
19304
19305 if (SrcVT.isVector()) {
19306 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19307 // Note: Since v2f64 is a legal type, we don't need to zero extend the
19308 // source for strict FP.
19309 if (IsStrict)
19310 return DAG.getNode(
19311 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19312 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19313 DAG.getUNDEF(SrcVT))});
19314 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19315 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19316 DAG.getUNDEF(SrcVT)));
19317 }
19318 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19319 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19320
19321 return SDValue();
19322 }
19323
19324 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19325 "Unknown SINT_TO_FP to lower!");
19326
19327 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19328
19329 // These are really Legal; return the operand so the caller accepts it as
19330 // Legal.
19331 if (SrcVT == MVT::i32 && UseSSEReg)
19332 return Op;
19333 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19334 return Op;
19335
19336 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19337 return V;
19338 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19339 return V;
19340
19341 // SSE doesn't have an i16 conversion so we need to promote.
19342 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19343 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19344 if (IsStrict)
19345 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19346 {Chain, Ext});
19347
19348 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19349 }
19350
19351 if (VT == MVT::f128 || !Subtarget.hasX87())
19352 return SDValue();
19353
19354 SDValue ValueToStore = Src;
19355 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19356 // Bitcasting to f64 here allows us to do a single 64-bit store from
19357 // an SSE register, avoiding the store forwarding penalty that would come
19358 // with two 32-bit stores.
19359 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19360
19361 unsigned Size = SrcVT.getStoreSize();
19362 Align Alignment(Size);
19363 MachineFunction &MF = DAG.getMachineFunction();
19364 auto PtrVT = getPointerTy(MF.getDataLayout());
19365 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
19366 MachinePointerInfo MPI =
19367 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19368 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19369 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
19370 std::pair<SDValue, SDValue> Tmp =
19371 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
19372
19373 if (IsStrict)
19374 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19375
19376 return Tmp.first;
19377}
19378
19379std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
19380 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
19381 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
19382 // Build the FILD
19383 SDVTList Tys;
19384 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
19385 if (useSSE)
19386 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19387 else
19388 Tys = DAG.getVTList(DstVT, MVT::Other);
19389
19390 SDValue FILDOps[] = {Chain, Pointer};
19391 SDValue Result =
19392 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
19393 Alignment, MachineMemOperand::MOLoad);
19394 Chain = Result.getValue(1);
19395
19396 if (useSSE) {
19397 MachineFunction &MF = DAG.getMachineFunction();
19398 unsigned SSFISize = DstVT.getStoreSize();
19399 int SSFI =
19400 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
19401 auto PtrVT = getPointerTy(MF.getDataLayout());
19402 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19403 Tys = DAG.getVTList(MVT::Other);
19404 SDValue FSTOps[] = {Chain, Result, StackSlot};
19405 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
19406 MachinePointerInfo::getFixedStack(MF, SSFI),
19407 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
19408
19409 Chain =
19410 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
19411 Result = DAG.getLoad(
19412 DstVT, DL, Chain, StackSlot,
19413 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
19414 Chain = Result.getValue(1);
19415 }
19416
19417 return { Result, Chain };
19418}
19419
19420/// Horizontal vector math instructions may be slower than normal math with
19421/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19422/// implementation, and likely shuffle complexity of the alternate sequence.
19423static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19424 const X86Subtarget &Subtarget) {
19425 bool IsOptimizingSize = DAG.shouldOptForSize();
19426 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19427 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19428}
19429
19430/// 64-bit unsigned integer to double expansion.
19431static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
19432 SelectionDAG &DAG,
19433 const X86Subtarget &Subtarget) {
19434 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
19435 // when converting 0 while rounding toward negative infinity. The caller will
19436 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
19437 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19438 // This algorithm is not obvious. Here is what we're trying to output:
19439 /*
19440 movq %rax, %xmm0
19441 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19442 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19443 #ifdef __SSE3__
19444 haddpd %xmm0, %xmm0
19445 #else
19446 pshufd $0x4e, %xmm0, %xmm1
19447 addpd %xmm1, %xmm0
19448 #endif
19449 */
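// Why the constants work: 0x43300000 and 0x45300000 are the high words of
// the doubles 0x1.0p52 and 0x1.0p84 (= 0x1.0p52 * 0x1.0p32). The punpckldq
// pairs the low and high 32-bit halves of the input with them, producing the
// doubles (2^52 + lo32) and (2^84 + hi32 * 2^32), both exact. Subtracting c1
// leaves lo32 and hi32 * 2^32, and the final add (haddpd, or pshufd + addpd)
// reassembles the unsigned 64-bit value, rounding only in that last add.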
19450
19451 LLVMContext *Context = DAG.getContext();
19452
19453 // Build some magic constants.
19454 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19455 Constant *C0 = ConstantDataVector::get(*Context, CV0);
19456 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19457 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
19458
19459 SmallVector<Constant*,2> CV1;
19460 CV1.push_back(
19461 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19462 APInt(64, 0x4330000000000000ULL))));
19463 CV1.push_back(
19464 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19465 APInt(64, 0x4530000000000000ULL))));
19466 Constant *C1 = ConstantVector::get(CV1);
19467 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
19468
19469 // Load the 64-bit value into an XMM register.
19470 SDValue XR1 =
19471 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
19472 SDValue CLod0 = DAG.getLoad(
19473 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
19475 SDValue Unpck1 =
19476 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19477
19478 SDValue CLod1 = DAG.getLoad(
19479 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
19481 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19482 // TODO: Are there any fast-math-flags to propagate here?
19483 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19484 SDValue Result;
19485
19486 if (Subtarget.hasSSE3() &&
19487 shouldUseHorizontalOp(true, DAG, Subtarget)) {
19488 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19489 } else {
19490 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19491 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19492 }
19493 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19494 DAG.getIntPtrConstant(0, dl));
19495 return Result;
19496}
19497
19498/// 32-bit unsigned integer to float expansion.
19499static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
19500 SelectionDAG &DAG,
19501 const X86Subtarget &Subtarget) {
19502 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19503 // FP constant to bias correct the final result.
19504 SDValue Bias = DAG.getConstantFP(
19505 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
19506
19507 // Load the 32-bit value into an XMM register.
19508 SDValue Load =
19509 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
19510
19511 // Zero out the upper parts of the register.
19512 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
19513
19514 // Or the load with the bias.
19515 SDValue Or = DAG.getNode(
19516 ISD::OR, dl, MVT::v2i64,
19517 DAG.getBitcast(MVT::v2i64, Load),
19518 DAG.getBitcast(MVT::v2i64,
19519 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
19520 Or =
19521 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
19522 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
19523
19524 if (Op.getNode()->isStrictFPOpcode()) {
19525 // Subtract the bias.
19526 // TODO: Are there any fast-math-flags to propagate here?
19527 SDValue Chain = Op.getOperand(0);
19528 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
19529 {Chain, Or, Bias});
19530
19531 if (Op.getValueType() == Sub.getValueType())
19532 return Sub;
19533
19534 // Handle final rounding.
19535 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
19536 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
19537
19538 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
19539 }
19540
19541 // Subtract the bias.
19542 // TODO: Are there any fast-math-flags to propagate here?
19543 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
19544
19545 // Handle final rounding.
19546 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
19547}
19548
19549static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
19550 SelectionDAG &DAG,
19551 const X86Subtarget &Subtarget) {
19552 if (Op.getSimpleValueType() != MVT::v2f64)
19553 return SDValue();
19554
19555 bool IsStrict = Op->isStrictFPOpcode();
19556
19557 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
19558 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
19559
19560 if (Subtarget.hasAVX512()) {
19561 if (!Subtarget.hasVLX()) {
19562 // Let generic type legalization widen this.
19563 if (!IsStrict)
19564 return SDValue();
19565 // Otherwise pad the integer input with 0s and widen the operation.
19566 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19567 DAG.getConstant(0, DL, MVT::v2i32));
19568 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
19569 {Op.getOperand(0), N0});
19570 SDValue Chain = Res.getValue(1);
19571 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
19572 DAG.getIntPtrConstant(0, DL));
19573 return DAG.getMergeValues({Res, Chain}, DL);
19574 }
19575
19576 // Legalize to v4i32 type.
19577 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19578 DAG.getUNDEF(MVT::v2i32));
19579 if (IsStrict)
19580 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
19581 {Op.getOperand(0), N0});
19582 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
19583 }
19584
19585 // Zero extend to 2i64, OR with the floating point representation of 2^52.
19586 // This gives us the floating point equivalent of 2^52 + the i32 integer
19587 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
19588 // point leaving just our i32 integers in double format.
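// For example, an input lane of 7 zero-extends to 0x0000000000000007; OR-ing
// in the bit pattern of 2^52 (0x4330000000000000) gives the double 2^52 + 7
// exactly, and the FSUB of the 2^52 bias leaves 7.0.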
19589 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
19590 SDValue VBias = DAG.getConstantFP(
19591 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
19592 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
19593 DAG.getBitcast(MVT::v2i64, VBias));
19594 Or = DAG.getBitcast(MVT::v2f64, Or);
19595
19596 if (IsStrict)
19597 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
19598 {Op.getOperand(0), Or, VBias});
19599 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
19600}
19601
19602static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
19603 SelectionDAG &DAG,
19604 const X86Subtarget &Subtarget) {
19605 bool IsStrict = Op->isStrictFPOpcode();
19606 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
19607 MVT VecIntVT = V.getSimpleValueType();
19608 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
19609 "Unsupported custom type");
19610
19611 if (Subtarget.hasAVX512()) {
19612 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
19613 assert(!Subtarget.hasVLX() && "Unexpected features");
19614 MVT VT = Op->getSimpleValueType(0);
19615
19616 // v8i32->v8f64 is legal with AVX512 so just return it.
19617 if (VT == MVT::v8f64)
19618 return Op;
19619
19620 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
19621 "Unexpected VT!");
19622 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
19623 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
19624 // Need to concat with zero vector for strict fp to avoid spurious
19625 // exceptions.
19626 SDValue Tmp =
19627 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
19628 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
19629 DAG.getIntPtrConstant(0, DL));
19630 SDValue Res, Chain;
19631 if (IsStrict) {
19632 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
19633 {Op->getOperand(0), V});
19634 Chain = Res.getValue(1);
19635 } else {
19636 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
19637 }
19638
19639 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19640 DAG.getIntPtrConstant(0, DL));
19641
19642 if (IsStrict)
19643 return DAG.getMergeValues({Res, Chain}, DL);
19644 return Res;
19645 }
19646
19647 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
19648 Op->getSimpleValueType(0) == MVT::v4f64) {
19649 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
19650 Constant *Bias = ConstantFP::get(
19651 *DAG.getContext(),
19652 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
19653 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19654 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
19655 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
19656 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
19657 SDValue VBias = DAG.getMemIntrinsicNode(
19658 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
19659 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
19660 /*Alignment*/ 8, MachineMemOperand::MOLoad);
19661
19662 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
19663 DAG.getBitcast(MVT::v4i64, VBias));
19664 Or = DAG.getBitcast(MVT::v4f64, Or);
19665
19666 if (IsStrict)
19667 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
19668 {Op.getOperand(0), Or, VBias});
19669 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
19670 }
19671
19672 // The algorithm is the following:
19673 // #ifdef __SSE4_1__
19674 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19675 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19676 // (uint4) 0x53000000, 0xaa);
19677 // #else
19678 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19679 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19680 // #endif
19681 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19682 // return (float4) lo + fhi;
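// Why the constants work: 0x4b000000 is 2^23 and 0x53000000 is 2^39 as
// floats, so per lane lo reinterprets as 2^23 + (v & 0xffff) and hi as
// 2^39 + (v & 0xffff0000), both exact. 0x53000080 is 2^39 + 2^23, so
// fhi = (v & 0xffff0000) - 2^23 and lo + fhi = v, with rounding happening
// only in the final add.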
19683
19684 bool Is128 = VecIntVT == MVT::v4i32;
19685 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
19686 // If we convert to something else than the supported type, e.g., to v4f64,
19687 // abort early.
19688 if (VecFloatVT != Op->getSimpleValueType(0))
19689 return SDValue();
19690
19691 // In the #idef/#else code, we have in common:
19692 // - The vector of constants:
19693 // -- 0x4b000000
19694 // -- 0x53000000
19695 // - A shift:
19696 // -- v >> 16
19697
19698 // Create the splat vector for 0x4b000000.
19699 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
19700 // Create the splat vector for 0x53000000.
19701 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
19702
19703 // Create the right shift.
19704 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
19705 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
19706
19707 SDValue Low, High;
19708 if (Subtarget.hasSSE41()) {
19709 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
19710 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19711 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
19712 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
19713 // Low will be bitcasted right away, so do not bother bitcasting back to its
19714 // original type.
19715 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
19716 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19717 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19718 // (uint4) 0x53000000, 0xaa);
19719 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
19720 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
19721 // High will be bitcasted right away, so do not bother bitcasting back to
19722 // its original type.
19723 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
19724 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19725 } else {
19726 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
19727 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19728 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
19729 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
19730
19731 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19732 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
19733 }
19734
19735 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
19736 SDValue VecCstFSub = DAG.getConstantFP(
19737 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
19738
19739 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19740 // NOTE: By using fsub of a positive constant instead of fadd of a negative
19741 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
19742 // enabled. See PR24512.
19743 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
19744 // TODO: Are there any fast-math-flags to propagate here?
19745 // (float4) lo;
19746 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
19747 // return (float4) lo + fhi;
19748 if (IsStrict) {
19749 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
19750 {Op.getOperand(0), HighBitcast, VecCstFSub});
19751 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
19752 {FHigh.getValue(1), LowBitcast, FHigh});
19753 }
19754
19755 SDValue FHigh =
19756 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
19757 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
19758}
19759
19760static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19761 const X86Subtarget &Subtarget) {
19762 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19763 SDValue N0 = Op.getOperand(OpNo);
19764 MVT SrcVT = N0.getSimpleValueType();
19765
19766 switch (SrcVT.SimpleTy) {
19767 default:
19768 llvm_unreachable("Custom UINT_TO_FP is not supported!");
19769 case MVT::v2i32:
19770 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
19771 case MVT::v4i32:
19772 case MVT::v8i32:
19773 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
19774 case MVT::v2i64:
19775 case MVT::v4i64:
19776 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19777 }
19778}
19779
19780SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
19781 SelectionDAG &DAG) const {
19782 bool IsStrict = Op->isStrictFPOpcode();
19783 unsigned OpNo = IsStrict ? 1 : 0;
19784 SDValue Src = Op.getOperand(OpNo);
19785 SDLoc dl(Op);
19786 auto PtrVT = getPointerTy(DAG.getDataLayout());
19787 MVT SrcVT = Src.getSimpleValueType();
19788 MVT DstVT = Op->getSimpleValueType(0);
19789 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
19790
19791 // Bail out when we don't have native conversion instructions.
19792 if (DstVT == MVT::f128)
19793 return SDValue();
19794
19795 if (isSoftF16(DstVT, Subtarget))
19796 return promoteXINT_TO_FP(Op, dl, DAG);
19797 else if (isLegalConversion(SrcVT, false, Subtarget))
19798 return Op;
19799
19800 if (DstVT.isVector())
19801 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
19802
19803 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19804 return LowerWin64_INT128_TO_FP(Op, DAG);
19805
19806 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19807 return Extract;
19808
19809 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
19810 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
19811 // Conversions from unsigned i32 to f32/f64 are legal,
19812 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
19813 return Op;
19814 }
19815
19816 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
19817 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
19818 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
19819 if (IsStrict)
19820 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
19821 {Chain, Src});
19822 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
19823 }
19824
19825 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19826 return V;
19827 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19828 return V;
19829
19830 // The transform for i64->f64 isn't correct for 0 when rounding to negative
19831 // infinity. It produces -0.0, so disable under strictfp.
19832 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
19833 !IsStrict)
19834 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
19835 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
19836 // negative infinity, so disable it under strictfp and use FILD instead.
19837 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
19838 !IsStrict)
19839 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
19840 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
19841 (DstVT == MVT::f32 || DstVT == MVT::f64))
19842 return SDValue();
19843
19844 // Make a 64-bit buffer, and use it to build an FILD.
19845 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
19846 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
19847 Align SlotAlign(8);
19848 MachinePointerInfo MPI =
19849 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19850 if (SrcVT == MVT::i32) {
19851 SDValue OffsetSlot =
19852 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
19853 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
19854 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
19855 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
19856 std::pair<SDValue, SDValue> Tmp =
19857 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
19858 if (IsStrict)
19859 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19860
19861 return Tmp.first;
19862 }
19863
19864 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
19865 SDValue ValueToStore = Src;
19866 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
19867 // Bitcasting to f64 here allows us to do a single 64-bit store from
19868 // an SSE register, avoiding the store forwarding penalty that would come
19869 // with two 32-bit stores.
19870 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19871 }
19872 SDValue Store =
19873 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
19874 // For i64 source, we need to add the appropriate power of 2 if the input
19875 // was negative. We must be careful to do the computation in x87 extended
19876 // precision, not in SSE.
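// The fudge constant below is 0x5F800000, which is 2^64 as an f32: FILD
// reinterprets the stored bits as a signed i64, so when the sign bit was set
// it reads Src - 2^64, and adding the 2^64 entry (selected via the offset-4
// slot of the little-endian constant pair) restores the unsigned value.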
19877 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
19878 SDValue Ops[] = {Store, StackSlot};
19879 SDValue Fild =
19880 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
19881 SlotAlign, MachineMemOperand::MOLoad);
19882 Chain = Fild.getValue(1);
19883
19884 // Check whether the sign bit is set.
19885 SDValue SignSet = DAG.getSetCC(
19886 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
19887 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
19888
19889 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
19890 APInt FF(64, 0x5F80000000000000ULL);
19891 SDValue FudgePtr =
19892 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
19893 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
19894
19895 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
19896 SDValue Zero = DAG.getIntPtrConstant(0, dl);
19897 SDValue Four = DAG.getIntPtrConstant(4, dl);
19898 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
19899 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
19900
19901 // Load the value out, extending it from f32 to f80.
19902 SDValue Fudge = DAG.getExtLoad(
19903 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
19904 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
19905 CPAlignment);
19906 Chain = Fudge.getValue(1);
19907 // Extend everything to 80 bits to force it to be done on x87.
19908 // TODO: Are there any fast-math-flags to propagate here?
19909 if (IsStrict) {
19910 unsigned Opc = ISD::STRICT_FADD;
19911 // Windows needs the precision control changed to 80bits around this add.
19912 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
19913 Opc = X86ISD::STRICT_FP80_ADD;
19914
19915 SDValue Add =
19916 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
19917 // STRICT_FP_ROUND can't handle equal types.
19918 if (DstVT == MVT::f80)
19919 return Add;
19920 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
19921 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
19922 }
19923 unsigned Opc = ISD::FADD;
19924 // Windows needs the precision control changed to 80bits around this add.
19925 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
19926 Opc = X86ISD::FP80_ADD;
19927
19928 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
19929 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
19930 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
19931}
19932
19933// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
19934// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
19935// just return an SDValue().
19936// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
19937// to i16, i32 or i64, and we lower it to a legal sequence and return the
19938// result.
19939SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
19940 bool IsSigned,
19941 SDValue &Chain) const {
19942 bool IsStrict = Op->isStrictFPOpcode();
19943 SDLoc DL(Op);
19944
19945 EVT DstTy = Op.getValueType();
19946 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
19947 EVT TheVT = Value.getValueType();
19948 auto PtrVT = getPointerTy(DAG.getDataLayout());
19949
19950 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
19951 // f16 must be promoted before using the lowering in this routine.
19952 // fp128 does not use this lowering.
19953 return SDValue();
19954 }
19955
19956 // If using FIST to compute an unsigned i64, we'll need some fixup
19957 // to handle values above the maximum signed i64. A FIST is always
19958 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
19959 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
19960
19961 // FIXME: This does not generate an invalid exception if the input does not
19962 // fit in i32. PR44019
19963 if (!IsSigned && DstTy != MVT::i64) {
19964 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
19965 // The low 32 bits of the fist result will have the correct uint32 result.
19966 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
19967 DstTy = MVT::i64;
19968 }
19969
19970 assert(DstTy.getSimpleVT() <= MVT::i64 &&
19971 DstTy.getSimpleVT() >= MVT::i16 &&
19972 "Unknown FP_TO_INT to lower!");
19973
19974 // We lower FP->int64 into FISTP64 followed by a load from a temporary
19975 // stack slot.
19976 MachineFunction &MF = DAG.getMachineFunction();
19977 unsigned MemSize = DstTy.getStoreSize();
19978 int SSFI =
19979 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
19980 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19981
19982 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
19983
19984 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
19985
19986 if (UnsignedFixup) {
19987 //
19988 // Conversion to unsigned i64 is implemented with a select,
19989 // depending on whether the source value fits in the range
19990 // of a signed i64. Let Thresh be the FP equivalent of
19991 // 0x8000000000000000ULL.
19992 //
19993 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
19994 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
19995 // FistSrc = (Value - FltOfs);
19996 // Fist-to-mem64 FistSrc
19997 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
19998 // to XOR'ing the high 32 bits with Adjust.
19999 //
20000 // Being a power of 2, Thresh is exactly representable in all FP formats.
20001 // For X87 we'd like to use the smallest FP type for this constant, but
20002 // for DAG type consistency we have to match the FP operand type.
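// For example, for an x87 Value equal to 2^63 + 5 (out of signed i64 range):
// Value >= Thresh, so FltOfs = 2^63 and Adjust = 0x8000000000000000. The
// FIST then converts Value - 2^63 = 5, which is in range, and XOR-ing the
// loaded result with Adjust yields 0x8000000000000005, the correct bits.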
20003
20004 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20005 APFloat::opStatus Status = APFloat::opOK;
20006 bool LosesInfo = false;
20007 if (TheVT == MVT::f64)
20008 // The rounding mode is irrelevant as the conversion should be exact.
20009 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20010 &LosesInfo);
20011 else if (TheVT == MVT::f80)
20012 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20013 APFloat::rmNearestTiesToEven, &LosesInfo);
20014
20015 assert(Status == APFloat::opOK && !LosesInfo &&
20016 "FP conversion should have been exact");
20017
20018 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20019
20020 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20021 *DAG.getContext(), TheVT);
20022 SDValue Cmp;
20023 if (IsStrict) {
20024 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20025 /*IsSignaling*/ true);
20026 Chain = Cmp.getValue(1);
20027 } else {
20028 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20029 }
20030
20031 // Our preferred lowering of
20032 //
20033 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20034 //
20035 // is
20036 //
20037 // (Value >= Thresh) << 63
20038 //
20039 // but since we can get here after LegalOperations, DAGCombine might do the
20040 // wrong thing if we create a select. So, directly create the preferred
20041 // version.
20042 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20043 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20044 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20045
20046 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20047 DAG.getConstantFP(0.0, DL, TheVT));
20048
20049 if (IsStrict) {
20050 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20051 { Chain, Value, FltOfs });
20052 Chain = Value.getValue(1);
20053 } else
20054 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20055 }
20056
20056
20057 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20058
20059 // FIXME This causes a redundant load/store if the SSE-class value is already
20060 // in memory, such as if it is on the callstack.
20061 if (isScalarFPTypeInSSEReg(TheVT)) {
20062 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20063 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20064 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20065 SDValue Ops[] = { Chain, StackSlot };
20066
20067 unsigned FLDSize = TheVT.getStoreSize();
20068 assert(FLDSize <= MemSize && "Stack slot not big enough");
20069 MachineMemOperand *MMO = MF.getMachineMemOperand(
20070 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20071 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20072 Chain = Value.getValue(1);
20073 }
20074
20075 // Build the FP_TO_INT*_IN_MEM
20076 MachineMemOperand *MMO = MF.getMachineMemOperand(
20077 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20078 SDValue Ops[] = { Chain, Value, StackSlot };
20079 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20080 DAG.getVTList(MVT::Other),
20081 Ops, DstTy, MMO);
20082
20083 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
20084 Chain = Res.getValue(1);
20085
20086 // If we need an unsigned fixup, XOR the result with adjust.
20087 if (UnsignedFixup)
20088 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20089
20090 return Res;
20091}
20092
20093static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
20094 const X86Subtarget &Subtarget) {
20095 MVT VT = Op.getSimpleValueType();
20096 SDValue In = Op.getOperand(0);
20097 MVT InVT = In.getSimpleValueType();
20098 SDLoc dl(Op);
20099 unsigned Opc = Op.getOpcode();
20100
20101 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20102 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20103 "Unexpected extension opcode");
20105 "Expected same number of elements");
20106 assert((VT.getVectorElementType() == MVT::i16 ||
20107 VT.getVectorElementType() == MVT::i32 ||
20108 VT.getVectorElementType() == MVT::i64) &&
20109 "Unexpected element type");
20110 assert((InVT.getVectorElementType() == MVT::i8 ||
20111 InVT.getVectorElementType() == MVT::i16 ||
20112 InVT.getVectorElementType() == MVT::i32) &&
20113 "Unexpected element type");
20114
20115 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20116
20117 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20118 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20119 return splitVectorIntUnary(Op, DAG, dl);
20120 }
20121
20122 if (Subtarget.hasInt256())
20123 return Op;
20124
20125 // Optimize vectors in AVX mode:
20126 //
20127 // v8i16 -> v8i32
20128   //   Use vpmovzxwd for 4 lower elements v8i16 -> v4i32.
20129 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20130 // Concat upper and lower parts.
20131 //
20132 // v4i32 -> v4i64
20133   //   Use vpmovzxdq for 2 lower elements v4i32 -> v2i64.
20134   //   Use vpunpckhdq for 2 upper elements v4i32 -> v2i64.
20135 // Concat upper and lower parts.
20136 //
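  // Illustrative example: zero-extending <8 x i16> {0,1,2,3,4,5,6,7} to
  // <8 x i32>. The in-reg extend produces {0,1,2,3}; unpacking the high words
  // against a zero vector yields the i16 lanes {4,0,5,0,6,0,7,0}, which read
  // back as the v4i32 value {4,5,6,7}; the concat then forms the result.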
20137 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20138 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20139
20140 // Short-circuit if we can determine that each 128-bit half is the same value.
20141 // Otherwise, this is difficult to match and optimize.
20142 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20143 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20144 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20145
20146 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20147 SDValue Undef = DAG.getUNDEF(InVT);
20148 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20149 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20150 OpHi = DAG.getBitcast(HalfVT, OpHi);
20151
20152 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20153}
20154
20155// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20156static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20157 const SDLoc &dl, SelectionDAG &DAG) {
20158 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20159 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20160 DAG.getIntPtrConstant(0, dl));
20161 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20162 DAG.getIntPtrConstant(8, dl));
20163 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20164 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20165 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20166 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20167}
20168
20169 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
20170                                      const X86Subtarget &Subtarget,
20171 SelectionDAG &DAG) {
20172 MVT VT = Op->getSimpleValueType(0);
20173 SDValue In = Op->getOperand(0);
20174 MVT InVT = In.getSimpleValueType();
20175 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20176 SDLoc DL(Op);
20177 unsigned NumElts = VT.getVectorNumElements();
20178
20179   // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20180 // avoids a constant pool load.
20181 if (VT.getVectorElementType() != MVT::i8) {
20182 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20183 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20184 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20185 }
20186
20187 // Extend VT if BWI is not supported.
20188 MVT ExtVT = VT;
20189 if (!Subtarget.hasBWI()) {
20190 // If v16i32 is to be avoided, we'll need to split and concatenate.
20191 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20192 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20193
20194 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20195 }
20196
20197 // Widen to 512-bits if VLX is not supported.
20198 MVT WideVT = ExtVT;
20199 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20200 NumElts *= 512 / ExtVT.getSizeInBits();
20201 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20202 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
20203 In, DAG.getIntPtrConstant(0, DL));
20204 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
20205 NumElts);
20206 }
20207
20208 SDValue One = DAG.getConstant(1, DL, WideVT);
20209 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20210
20211 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20212
20213 // Truncate if we had to extend above.
20214 if (VT != ExtVT) {
20215 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20216 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20217 }
20218
20219 // Extract back to 128/256-bit if we widened.
20220 if (WideVT != VT)
20221 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20222 DAG.getIntPtrConstant(0, DL));
20223
20224 return SelectedVal;
20225}
20226
20227 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20228                                 SelectionDAG &DAG) {
20229 SDValue In = Op.getOperand(0);
20230 MVT SVT = In.getSimpleValueType();
20231
20232 if (SVT.getVectorElementType() == MVT::i1)
20233 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
20234
20235 assert(Subtarget.hasAVX() && "Expected AVX support");
20236 return LowerAVXExtend(Op, DAG, Subtarget);
20237}
20238
20239/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20240/// It makes use of the fact that vectors with enough leading sign/zero bits
20241/// prevent the PACKSS/PACKUS from saturating the results.
20242/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20243/// within each 128-bit lane.
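/// For example, a v8i32 -> v8i8 truncation is performed in two halving stages
/// (PACK*SDW to v8i16, then PACK*SWB to v8i8), assuming the caller has ensured
/// enough leading sign/zero bits that the packs never actually saturate.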
20244static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20245 const SDLoc &DL, SelectionDAG &DAG,
20246 const X86Subtarget &Subtarget) {
20247 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20248 "Unexpected PACK opcode");
20249 assert(DstVT.isVector() && "VT not a vector?");
20250
20251 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20252 if (!Subtarget.hasSSE2())
20253 return SDValue();
20254
20255 EVT SrcVT = In.getValueType();
20256
20257 // No truncation required, we might get here due to recursive calls.
20258 if (SrcVT == DstVT)
20259 return In;
20260
20261 unsigned NumElems = SrcVT.getVectorNumElements();
20262   if (NumElems < 2 || !isPowerOf2_32(NumElems))
20263 return SDValue();
20264
20265 unsigned DstSizeInBits = DstVT.getSizeInBits();
20266 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20267 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20268 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20269
20270 LLVMContext &Ctx = *DAG.getContext();
20271 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20272 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20273
20274 // Pack to the largest type possible:
20275 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20276 EVT InVT = MVT::i16, OutVT = MVT::i8;
20277 if (SrcVT.getScalarSizeInBits() > 16 &&
20278 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20279 InVT = MVT::i32;
20280 OutVT = MVT::i16;
20281 }
20282
20283 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20284 // On pre-AVX512, pack the src in both halves to help value tracking.
20285 if (SrcSizeInBits <= 128) {
20286 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20287 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20288 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
20289 SDValue LHS = DAG.getBitcast(InVT, In);
20290 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
20291 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
20292 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
20293 Res = DAG.getBitcast(PackedVT, Res);
20294 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20295 }
20296
20297 // Split lower/upper subvectors.
20298 SDValue Lo, Hi;
20299 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
20300
20301 // If Hi is undef, then don't bother packing it and widen the result instead.
20302 if (Hi.isUndef()) {
20303 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
20304 if (SDValue Res =
20305 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
20306 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
20307 }
20308
20309 unsigned SubSizeInBits = SrcSizeInBits / 2;
20310 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20311 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20312
20313 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20314 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20315 Lo = DAG.getBitcast(InVT, Lo);
20316 Hi = DAG.getBitcast(InVT, Hi);
20317 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20318 return DAG.getBitcast(DstVT, Res);
20319 }
20320
20321 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20322 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20323 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20324 Lo = DAG.getBitcast(InVT, Lo);
20325 Hi = DAG.getBitcast(InVT, Hi);
20326 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20327
20328 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20329 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20330 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
20331     SmallVector<int, 64> Mask;
20332     int Scale = 64 / OutVT.getScalarSizeInBits();
20333 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
20334 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20335
20336 if (DstVT.is256BitVector())
20337 return DAG.getBitcast(DstVT, Res);
20338
20339 // If 512bit -> 128bit truncate another stage.
20340 Res = DAG.getBitcast(PackedVT, Res);
20341 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20342 }
20343
20344 // Recursively pack lower/upper subvectors, concat result and pack again.
20345 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20346
20347 if (PackedVT.is128BitVector()) {
20348 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
20349 // type legalization.
20350 SDValue Res =
20351 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
20352 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20353 }
20354
20355 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
20356 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
20357 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
20358 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20359 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20360}
20361
20362/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
20363/// e.g. trunc <8 x i32> X to <8 x i16> -->
20364/// MaskX = X & 0xffff (clear high bits to prevent saturation)
20365/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
20366 static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
20367                                         const X86Subtarget &Subtarget,
20368 SelectionDAG &DAG) {
20369 In = DAG.getZeroExtendInReg(In, DL, DstVT);
20370 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
20371}
20372
20373/// Truncate using inreg sign extension and X86ISD::PACKSS.
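/// e.g. trunc <8 x i32> X to <8 x i16> -->
/// SextX = sign_extend_inreg X, i16 (replicate bit 15 into the upper half)
/// packss (extract_subv SextX, 0), (extract_subv SextX, 1)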
20374 static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
20375                                         const X86Subtarget &Subtarget,
20376 SelectionDAG &DAG) {
20377 EVT SrcVT = In.getValueType();
20378 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
20379 DAG.getValueType(DstVT));
20380 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
20381}
20382
20383/// Helper to determine if \p In truncated to \p DstVT has the necessary
20384/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
20385/// possibly by converting a SRL node to SRA for sign extension.
20386static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
20387 SDValue In, const SDLoc &DL,
20388 SelectionDAG &DAG,
20389 const X86Subtarget &Subtarget) {
20390 // Requires SSE2.
20391 if (!Subtarget.hasSSE2())
20392 return SDValue();
20393
20394 EVT SrcVT = In.getValueType();
20395 EVT DstSVT = DstVT.getVectorElementType();
20396 EVT SrcSVT = SrcVT.getVectorElementType();
20397
20398 // Check we have a truncation suited for PACKSS/PACKUS.
20399 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20400 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20401 return SDValue();
20402
20403 assert(SrcSVT.getSizeInBits() > DstSVT.getSizeInBits() && "Bad truncation");
20404 unsigned NumStages = Log2_32(SrcSVT.getSizeInBits() / DstSVT.getSizeInBits());
20405
20406 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
20407 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
20408 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
20409 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
20410 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
20411 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
20412 return SDValue();
20413
20414 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
20415 // split this for packing.
20416 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
20417 !isFreeToSplitVector(In.getNode(), DAG) &&
20418 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
20419 return SDValue();
20420
20421   // Don't truncate AVX512 targets as multiple PACK node stages.
20422 if (Subtarget.hasAVX512() && NumStages > 1)
20423 return SDValue();
20424
20425 unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
20426 unsigned NumPackedSignBits = std::min<unsigned>(DstSVT.getSizeInBits(), 16);
20427 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20428
20429 // Truncate with PACKUS if we are truncating a vector with leading zero
20430 // bits that extend all the way to the packed/truncated value.
20431 // e.g. Masks, zext_in_reg, etc.
20432 // Pre-SSE41 we can only use PACKUSWB.
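  // For example, a v8i16 value known to be of the form X & 0xFF (top 8 bits
  // zero in every lane) can be truncated to v8i8 with a single PACKUSWB, since
  // no lane exceeds 255 and the saturation never triggers.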
20433 KnownBits Known = DAG.computeKnownBits(In);
20434 if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20435 PackOpcode = X86ISD::PACKUS;
20436 return In;
20437 }
20438
20439 // Truncate with PACKSS if we are truncating a vector with sign-bits
20440 // that extend all the way to the packed/truncated value.
20441 // e.g. Comparison result, sext_in_reg, etc.
20442 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
20443
20444 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
20445 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
20446 // see through BITCASTs later on and combines/simplifications can't then use
20447 // it.
20448 if (DstSVT == MVT::i32 && NumSignBits != SrcSVT.getSizeInBits() &&
20449 !Subtarget.hasAVX512())
20450 return SDValue();
20451
20452 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20453 if (MinSignBits < NumSignBits) {
20454 PackOpcode = X86ISD::PACKSS;
20455 return In;
20456 }
20457
20458 // If we have a srl that only generates signbits that we will discard in
20459 // the truncation then we can use PACKSS by converting the srl to a sra.
20460 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
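  // For example, (trunc (srl X, 16)) from vXi32 to vXi16 discards exactly the
  // bits the shift filled in, so rewriting the srl as an sra keeps the
  // truncated value identical while giving PACKSS the sign bits it needs.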
20461 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20462 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(In)) {
20463 if (*ShAmt == MinSignBits) {
20464 PackOpcode = X86ISD::PACKSS;
20465 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
20466 }
20467 }
20468
20469 return SDValue();
20470}
20471
20472/// This function lowers a vector truncation of 'extended sign-bits' or
20473/// 'extended zero-bits' values.
20474 /// (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into X86ISD::PACKSS/PACKUS operations.
20475 static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
20476                                                 const SDLoc &DL,
20477 const X86Subtarget &Subtarget,
20478 SelectionDAG &DAG) {
20479 MVT SrcVT = In.getSimpleValueType();
20480 MVT DstSVT = DstVT.getVectorElementType();
20481 MVT SrcSVT = SrcVT.getVectorElementType();
20482 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20483 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20484 return SDValue();
20485
20486 // If the upper half of the source is undef, then attempt to split and
20487 // only truncate the lower half.
20488 if (DstVT.getSizeInBits() >= 128) {
20489 SmallVector<SDValue> LowerOps;
20490 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20491 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20492 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
20493 Subtarget, DAG))
20494 return widenSubVector(Res, false, Subtarget, DAG, DL,
20495 DstVT.getSizeInBits());
20496 }
20497 }
20498
20499 unsigned PackOpcode;
20500 if (SDValue Src =
20501 matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, Subtarget))
20502 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
20503
20504 return SDValue();
20505}
20506
20507/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
20508/// X86ISD::PACKUS/X86ISD::PACKSS operations.
20509 static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
20510                                     const X86Subtarget &Subtarget,
20511 SelectionDAG &DAG) {
20512 MVT SrcVT = In.getSimpleValueType();
20513 MVT DstSVT = DstVT.getVectorElementType();
20514 MVT SrcSVT = SrcVT.getVectorElementType();
20515 unsigned NumElems = DstVT.getVectorNumElements();
20516 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20517 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
20518 NumElems >= 8))
20519 return SDValue();
20520
20521   // SSSE3's pshufb results in fewer instructions in the cases below.
20522 if (Subtarget.hasSSSE3() && NumElems == 8) {
20523 if (SrcSVT == MVT::i16)
20524 return SDValue();
20525 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
20526 return SDValue();
20527 }
20528
20529 // If the upper half of the source is undef, then attempt to split and
20530 // only truncate the lower half.
20531 if (DstVT.getSizeInBits() >= 128) {
20532 SmallVector<SDValue> LowerOps;
20533 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20534 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20535 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
20536 return widenSubVector(Res, false, Subtarget, DAG, DL,
20537 DstVT.getSizeInBits());
20538 }
20539 }
20540
20541 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
20542 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
20543 // truncate 2 x v4i32 to v8i16.
20544 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
20545 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
20546
20547 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
20548 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
20549
20550 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
20551 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
20552 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
20553 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
20554 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
20555 }
20556
20557 return SDValue();
20558}
20559
20560 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
20561                                   const X86Subtarget &Subtarget) {
20562
20563 SDLoc DL(Op);
20564 MVT VT = Op.getSimpleValueType();
20565 SDValue In = Op.getOperand(0);
20566 MVT InVT = In.getSimpleValueType();
20567
20568 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
20569
20570 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
20571 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
20572 if (InVT.getScalarSizeInBits() <= 16) {
20573 if (Subtarget.hasBWI()) {
20574 // legal, will go to VPMOVB2M, VPMOVW2M
20575 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20576 // We need to shift to get the lsb into sign position.
20577 // Shift packed bytes not supported natively, bitcast to word
20578 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
20579 In = DAG.getNode(ISD::SHL, DL, ExtVT,
20580 DAG.getBitcast(ExtVT, In),
20581 DAG.getConstant(ShiftInx, DL, ExtVT));
20582 In = DAG.getBitcast(InVT, In);
20583 }
20584 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
20585 In, ISD::SETGT);
20586 }
20587 // Use TESTD/Q, extended vector to packed dword/qword.
20588 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
20589 "Unexpected vector type.");
20590 unsigned NumElts = InVT.getVectorNumElements();
20591 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
20592 // We need to change to a wider element type that we have support for.
20593 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
20594 // For 16 element vectors we extend to v16i32 unless we are explicitly
20595 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
20596 // we need to split into two 8 element vectors which we can extend to v8i32,
20597 // truncate and concat the results. There's an additional complication if
20598 // the original type is v16i8. In that case we can't split the v16i8
20599 // directly, so we need to shuffle high elements to low and use
20600 // sign_extend_vector_inreg.
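    // For example, v16i16 -> v16i1 while avoiding 512-bit vectors: extract the
    // two v8i16 halves, truncate each to v8i1 (those truncates come back
    // through this function), and concatenate the two v8i1 results.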
20601 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
20602 SDValue Lo, Hi;
20603 if (InVT == MVT::v16i8) {
20604 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
20605 Hi = DAG.getVectorShuffle(
20606 InVT, DL, In, In,
20607 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
20608 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
20609 } else {
20610 assert(InVT == MVT::v16i16 && "Unexpected VT!");
20611 Lo = extract128BitVector(In, 0, DAG, DL);
20612 Hi = extract128BitVector(In, 8, DAG, DL);
20613 }
20614 // We're split now, just emit two truncates and a concat. The two
20615 // truncates will trigger legalization to come back to this function.
20616 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
20617 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
20618 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20619 }
20620 // We either have 8 elements or we're allowed to use 512-bit vectors.
20621 // If we have VLX, we want to use the narrowest vector that can get the
20622 // job done so we use vXi32.
20623 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
20624 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
20625 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
20626 InVT = ExtVT;
20627 ShiftInx = InVT.getScalarSizeInBits() - 1;
20628 }
20629
20630 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20631 // We need to shift to get the lsb into sign position.
20632 In = DAG.getNode(ISD::SHL, DL, InVT, In,
20633 DAG.getConstant(ShiftInx, DL, InVT));
20634 }
20635 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
20636 if (Subtarget.hasDQI())
20637 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
20638 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
20639}
20640
20641SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
20642 SDLoc DL(Op);
20643 MVT VT = Op.getSimpleValueType();
20644 SDValue In = Op.getOperand(0);
20645 MVT InVT = In.getSimpleValueType();
20646   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20647          "Invalid TRUNCATE operation");
20648
20649 // If we're called by the type legalizer, handle a few cases.
20650 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20651 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
20652 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
20653 VT.is128BitVector() && Subtarget.hasAVX512()) {
20654 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
20655 "Unexpected subtarget!");
20656 // The default behavior is to truncate one step, concatenate, and then
20657 // truncate the remainder. We'd rather produce two 64-bit results and
20658 // concatenate those.
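      // For example, v16i64 -> v16i8: split into two v8i64 halves, truncate
      // each half directly to v8i8 (a single VPMOVQB), and concatenate the two
      // 64-bit results rather than going v16i64 -> v16i32 -> v16i8.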
20659 SDValue Lo, Hi;
20660 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
20661
20662 EVT LoVT, HiVT;
20663 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
20664
20665 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
20666 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
20667 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20668 }
20669
20670 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
20671 if (!Subtarget.hasAVX512() ||
20672 (InVT.is512BitVector() && VT.is256BitVector()))
20673 if (SDValue SignPack =
20674 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20675 return SignPack;
20676
20677 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
20678 if (!Subtarget.hasAVX512())
20679 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
20680
20681 // Otherwise let default legalization handle it.
20682 return SDValue();
20683 }
20684
20685 if (VT.getVectorElementType() == MVT::i1)
20686 return LowerTruncateVecI1(Op, DAG, Subtarget);
20687
20688 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
20689 // concat from subvectors to use VPTRUNC etc.
20690 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
20691 if (SDValue SignPack =
20692 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20693 return SignPack;
20694
20695 // vpmovqb/w/d, vpmovdb/w, vpmovwb
20696 if (Subtarget.hasAVX512()) {
20697 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
20698 assert(VT == MVT::v32i8 && "Unexpected VT!");
20699 return splitVectorIntUnary(Op, DAG, DL);
20700 }
20701
20702     // Word to byte only under BWI. Otherwise we have to promote to v16i32
20703 // and then truncate that. But we should only do that if we haven't been
20704 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
20705 // handled by isel patterns.
20706 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
20707 Subtarget.canExtendTo512DQ())
20708 return Op;
20709 }
20710
20711 // Handle truncation of V256 to V128 using shuffles.
20712 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
20713
20714 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
20715 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
20716 if (Subtarget.hasInt256()) {
20717 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
20718 In = DAG.getBitcast(MVT::v8i32, In);
20719 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
20720 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
20721 DAG.getIntPtrConstant(0, DL));
20722 }
20723
20724 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20725 DAG.getIntPtrConstant(0, DL));
20726 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20727 DAG.getIntPtrConstant(2, DL));
20728 static const int ShufMask[] = {0, 2, 4, 6};
20729 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
20730 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
20731 }
20732
20733 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
20734 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
20735 if (Subtarget.hasInt256()) {
20736 // The PSHUFB mask:
20737 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
20738 -1, -1, -1, -1, -1, -1, -1, -1,
20739 16, 17, 20, 21, 24, 25, 28, 29,
20740 -1, -1, -1, -1, -1, -1, -1, -1 };
20741 In = DAG.getBitcast(MVT::v32i8, In);
20742 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
20743 In = DAG.getBitcast(MVT::v4i64, In);
20744
20745 static const int ShufMask2[] = {0, 2, -1, -1};
20746 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
20747 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20748 DAG.getIntPtrConstant(0, DL));
20749 return DAG.getBitcast(MVT::v8i16, In);
20750 }
20751
20752 return Subtarget.hasSSE41()
20753 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
20754 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
20755 }
20756
20757 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
20758 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
20759
20760 llvm_unreachable("All 256->128 cases should have been handled above!");
20761}
20762
20763// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
20764// behaves on out of range inputs to generate optimized conversions.
20765 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
20766                                     SelectionDAG &DAG,
20767 const X86Subtarget &Subtarget) {
20768 MVT SrcVT = Src.getSimpleValueType();
20769 unsigned DstBits = VT.getScalarSizeInBits();
20770 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
20771
20772 // Calculate the converted result for values in the range 0 to
20773 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
20774 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
20775 SDValue Big =
20776 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
20777 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
20778 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
20779
20780 // The "CVTTP2SI" instruction conveniently sets the sign bit if
20781 // and only if the value was out of range. So we can use that
20782   // as our indicator that we'd rather use "Big" instead of "Small".
20783 //
20784 // Use "Small" if "IsOverflown" has all bits cleared
20785 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
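  // Worked example for an f32 lane holding 3e9: Small = cvttps2dq(3e9) =
  // 0x80000000 (out of range), Big = cvttps2dq(3e9 - 2^31) = 0x32D05E00, the
  // sign-splat of Small is all ones, and Small | (Big & IsOverflown) =
  // 0xB2D05E00, which is 3000000000 interpreted as unsigned.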
20786
20787 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
20788 // use the slightly slower blendv select instead.
20789 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
20790 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
20791 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
20792 }
20793
20794 SDValue IsOverflown =
20795 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
20796 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
20797 return DAG.getNode(ISD::OR, dl, VT, Small,
20798 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
20799}
20800
20801SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
20802 bool IsStrict = Op->isStrictFPOpcode();
20803 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
20804 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
20805 MVT VT = Op->getSimpleValueType(0);
20806 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20807 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
20808 MVT SrcVT = Src.getSimpleValueType();
20809 SDLoc dl(Op);
20810
20811 SDValue Res;
20812 if (isSoftF16(SrcVT, Subtarget)) {
20813 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20814 if (IsStrict)
20815 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
20816 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
20817 {NVT, MVT::Other}, {Chain, Src})});
20818 return DAG.getNode(Op.getOpcode(), dl, VT,
20819 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
20820 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
20821 return Op;
20822 }
20823
20824 if (VT.isVector()) {
20825 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
20826 MVT ResVT = MVT::v4i32;
20827 MVT TruncVT = MVT::v4i1;
20828 unsigned Opc;
20829 if (IsStrict)
20830         Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
20831       else
20832 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
20833
20834 if (!IsSigned && !Subtarget.hasVLX()) {
20835 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
20836 // Widen to 512-bits.
20837 ResVT = MVT::v8i32;
20838 TruncVT = MVT::v8i1;
20839 Opc = Op.getOpcode();
20840 // Need to concat with zero vector for strict fp to avoid spurious
20841 // exceptions.
20842 // TODO: Should we just do this for non-strict as well?
20843 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
20844 : DAG.getUNDEF(MVT::v8f64);
20845 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
20846 DAG.getIntPtrConstant(0, dl));
20847 }
20848 if (IsStrict) {
20849 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
20850 Chain = Res.getValue(1);
20851 } else {
20852 Res = DAG.getNode(Opc, dl, ResVT, Src);
20853 }
20854
20855 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
20856 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
20857 DAG.getIntPtrConstant(0, dl));
20858 if (IsStrict)
20859 return DAG.getMergeValues({Res, Chain}, dl);
20860 return Res;
20861 }
20862
20863 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
20864 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
20865 return Op;
20866
20867 MVT ResVT = VT;
20868 MVT EleVT = VT.getVectorElementType();
20869 if (EleVT != MVT::i64)
20870 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
20871
20872 if (SrcVT != MVT::v8f16) {
20873 SDValue Tmp =
20874 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
20875 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
20876 Ops[0] = Src;
20877 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
20878 }
20879
20880 if (IsStrict) {
20881 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
20882                                    : X86ISD::STRICT_CVTTP2UI,
20883                         dl, {ResVT, MVT::Other}, {Chain, Src});
20884 Chain = Res.getValue(1);
20885 } else {
20886 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
20887 ResVT, Src);
20888 }
20889
20890 // TODO: Need to add exception check code for strict FP.
20891 if (EleVT.getSizeInBits() < 16) {
20892 ResVT = MVT::getVectorVT(EleVT, 8);
20893 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
20894 }
20895
20896 if (ResVT != VT)
20897 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20898 DAG.getIntPtrConstant(0, dl));
20899
20900 if (IsStrict)
20901 return DAG.getMergeValues({Res, Chain}, dl);
20902 return Res;
20903 }
20904
20905 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
20906 if (VT.getVectorElementType() == MVT::i16) {
20907 assert((SrcVT.getVectorElementType() == MVT::f32 ||
20908 SrcVT.getVectorElementType() == MVT::f64) &&
20909 "Expected f32/f64 vector!");
20910 MVT NVT = VT.changeVectorElementType(MVT::i32);
20911 if (IsStrict) {
20912 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
20913                                    : ISD::STRICT_FP_TO_UINT,
20914                           dl, {NVT, MVT::Other}, {Chain, Src});
20915 Chain = Res.getValue(1);
20916 } else {
20917 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
20918 NVT, Src);
20919 }
20920
20921 // TODO: Need to add exception check code for strict FP.
20922 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20923
20924 if (IsStrict)
20925 return DAG.getMergeValues({Res, Chain}, dl);
20926 return Res;
20927 }
20928
20929 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
20930 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
20931 assert(!IsSigned && "Expected unsigned conversion!");
20932 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
20933 return Op;
20934 }
20935
20936 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
20937 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
20938 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
20939 Subtarget.useAVX512Regs()) {
20940 assert(!IsSigned && "Expected unsigned conversion!");
20941 assert(!Subtarget.hasVLX() && "Unexpected features!");
20942 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20943 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20944 // Need to concat with zero vector for strict fp to avoid spurious
20945 // exceptions.
20946 // TODO: Should we just do this for non-strict as well?
20947 SDValue Tmp =
20948 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20949 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20950 DAG.getIntPtrConstant(0, dl));
20951
20952 if (IsStrict) {
20953 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
20954 {Chain, Src});
20955 Chain = Res.getValue(1);
20956 } else {
20957 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
20958 }
20959
20960 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20961 DAG.getIntPtrConstant(0, dl));
20962
20963 if (IsStrict)
20964 return DAG.getMergeValues({Res, Chain}, dl);
20965 return Res;
20966 }
20967
20968 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
20969 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
20970 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
20971 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
20972 assert(!Subtarget.hasVLX() && "Unexpected features!");
20973 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20974 // Need to concat with zero vector for strict fp to avoid spurious
20975 // exceptions.
20976 // TODO: Should we just do this for non-strict as well?
20977 SDValue Tmp =
20978 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20979 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20980 DAG.getIntPtrConstant(0, dl));
20981
20982 if (IsStrict) {
20983 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
20984 {Chain, Src});
20985 Chain = Res.getValue(1);
20986 } else {
20987 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
20988 }
20989
20990 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20991 DAG.getIntPtrConstant(0, dl));
20992
20993 if (IsStrict)
20994 return DAG.getMergeValues({Res, Chain}, dl);
20995 return Res;
20996 }
20997
20998 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
20999 if (!Subtarget.hasVLX()) {
21000       // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
21001 // legalizer and then widened again by vector op legalization.
21002 if (!IsStrict)
21003 return SDValue();
21004
21005 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21006 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21007 {Src, Zero, Zero, Zero});
21008 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21009 {Chain, Tmp});
21010 SDValue Chain = Tmp.getValue(1);
21011 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21012 DAG.getIntPtrConstant(0, dl));
21013 return DAG.getMergeValues({Tmp, Chain}, dl);
21014 }
21015
21016 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21017 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21018 DAG.getUNDEF(MVT::v2f32));
21019 if (IsStrict) {
21020 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21021                                : X86ISD::STRICT_CVTTP2UI;
21022       return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21023 }
21024 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21025 return DAG.getNode(Opc, dl, VT, Tmp);
21026 }
21027
21028 // Generate optimized instructions for pre AVX512 unsigned conversions from
21029 // vXf32 to vXi32.
21030 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21031 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21032 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21033 assert(!IsSigned && "Expected unsigned conversion!");
21034 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21035 }
21036
21037 return SDValue();
21038 }
21039
21040 assert(!VT.isVector());
21041
21042 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21043
21044 if (!IsSigned && UseSSEReg) {
21045 // Conversions from f32/f64 with AVX512 should be legal.
21046 if (Subtarget.hasAVX512())
21047 return Op;
21048
21049 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21050 // behaves on out of range inputs to generate optimized conversions.
21051 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21052 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21053 unsigned DstBits = VT.getScalarSizeInBits();
21054 APInt UIntLimit = APInt::getSignMask(DstBits);
21055 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21056 DAG.getConstant(UIntLimit, dl, VT));
21057 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21058
21059 // Calculate the converted result for values in the range:
21060 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21061 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21062 SDValue Small =
21063 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21064 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21065 SDValue Big = DAG.getNode(
21066 X86ISD::CVTTS2SI, dl, VT,
21067 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21068 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21069
21070 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21071 // and only if the value was out of range. So we can use that
21072     // as our indicator that we'd rather use "Big" instead of "Small".
21073 //
21074 // Use "Small" if "IsOverflown" has all bits cleared
21075 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21076 SDValue IsOverflown = DAG.getNode(
21077 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21078 return DAG.getNode(ISD::OR, dl, VT, Small,
21079 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21080 }
21081
21082 // Use default expansion for i64.
21083 if (VT == MVT::i64)
21084 return SDValue();
21085
21086 assert(VT == MVT::i32 && "Unexpected VT!");
21087
21088 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21089 // FIXME: This does not generate an invalid exception if the input does not
21090 // fit in i32. PR44019
21091 if (Subtarget.is64Bit()) {
21092 if (IsStrict) {
21093 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21094 {Chain, Src});
21095 Chain = Res.getValue(1);
21096 } else
21097 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21098
21099 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21100 if (IsStrict)
21101 return DAG.getMergeValues({Res, Chain}, dl);
21102 return Res;
21103 }
21104
21105 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21106 // use fisttp which will be handled later.
21107 if (!Subtarget.hasSSE3())
21108 return SDValue();
21109 }
21110
21111 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21112 // FIXME: This does not generate an invalid exception if the input does not
21113 // fit in i16. PR44019
21114 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21115 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21116 if (IsStrict) {
21117 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21118 {Chain, Src});
21119 Chain = Res.getValue(1);
21120 } else
21121 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21122
21123 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21124 if (IsStrict)
21125 return DAG.getMergeValues({Res, Chain}, dl);
21126 return Res;
21127 }
21128
21129 // If this is a FP_TO_SINT using SSEReg we're done.
21130 if (UseSSEReg && IsSigned)
21131 return Op;
21132
21133 // fp128 needs to use a libcall.
21134 if (SrcVT == MVT::f128) {
21135 RTLIB::Libcall LC;
21136 if (IsSigned)
21137 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21138 else
21139 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21140
21141 MakeLibCallOptions CallOptions;
21142 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
21143 SDLoc(Op), Chain);
21144
21145 if (IsStrict)
21146 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21147
21148 return Tmp.first;
21149 }
21150
21151 // Fall back to X87.
21152 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21153 if (IsStrict)
21154 return DAG.getMergeValues({V, Chain}, dl);
21155 return V;
21156 }
21157
21158 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21159}
21160
21161SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21162 SelectionDAG &DAG) const {
21163 SDValue Src = Op.getOperand(0);
21164 MVT SrcVT = Src.getSimpleValueType();
21165
21166 if (SrcVT == MVT::f16)
21167 return SDValue();
21168
21169 // If the source is in an SSE register, the node is Legal.
21170 if (isScalarFPTypeInSSEReg(SrcVT))
21171 return Op;
21172
21173 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21174}
21175
21176SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21177 SelectionDAG &DAG) const {
21178 EVT DstVT = N->getValueType(0);
21179 SDValue Src = N->getOperand(0);
21180 EVT SrcVT = Src.getValueType();
21181
21182 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21183 // f16 must be promoted before using the lowering in this routine.
21184 // fp128 does not use this lowering.
21185 return SDValue();
21186 }
21187
21188 SDLoc DL(N);
21189 SDValue Chain = DAG.getEntryNode();
21190
21191 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21192
21193 // If we're converting from SSE, the stack slot needs to hold both types.
21194 // Otherwise it only needs to hold the DstVT.
21195 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21196 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21197 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21198 MachinePointerInfo MPI =
21199       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21200
21201 if (UseSSE) {
21202 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21203 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21204 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21205 SDValue Ops[] = { Chain, StackPtr };
21206
21207 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21208                                   /*Align*/ std::nullopt,
21209                                   MachineMemOperand::MOLoad);
21210 Chain = Src.getValue(1);
21211 }
21212
21213 SDValue StoreOps[] = { Chain, Src, StackPtr };
21214 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21215                                  StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
21216                                  MachineMemOperand::MOStore);
21217
21218 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21219}
21220
21221SDValue
21222X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21223 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21224 // but making use of X86 specifics to produce better instruction sequences.
21225 SDNode *Node = Op.getNode();
21226 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21227 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
21228 SDLoc dl(SDValue(Node, 0));
21229 SDValue Src = Node->getOperand(0);
21230
21231 // There are three types involved here: SrcVT is the source floating point
21232 // type, DstVT is the type of the result, and TmpVT is the result of the
21233 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
21234 // DstVT).
21235 EVT SrcVT = Src.getValueType();
21236 EVT DstVT = Node->getValueType(0);
21237 EVT TmpVT = DstVT;
21238
21239 // This code is only for floats and doubles. Fall back to generic code for
21240 // anything else.
21241 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
21242 return SDValue();
21243
21244 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21245 unsigned SatWidth = SatVT.getScalarSizeInBits();
21246 unsigned DstWidth = DstVT.getScalarSizeInBits();
21247 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
21248 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21249 "Expected saturation width smaller than result width");
21250
21251 // Promote result of FP_TO_*INT to at least 32 bits.
21252 if (TmpWidth < 32) {
21253 TmpVT = MVT::i32;
21254 TmpWidth = 32;
21255 }
21256
21257 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21258 // us to use a native signed conversion instead.
21259 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21260 TmpVT = MVT::i64;
21261 TmpWidth = 64;
21262 }
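  // For example, an unsigned f64 -> i32 saturating conversion on x86-64 takes
  // this path: the bounds 0.0 and 4294967295.0 are exact in f64, so the value
  // is clamped (typically MAXSD/MINSD), converted with a 64-bit CVTTSD2SI, and
  // the i64 result is truncated to i32.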
21263
21264 // If the saturation width is smaller than the size of the temporary result,
21265 // we can always use signed conversion, which is native.
21266 if (SatWidth < TmpWidth)
21267 FpToIntOpcode = ISD::FP_TO_SINT;
21268
21269 // Determine minimum and maximum integer values and their corresponding
21270 // floating-point values.
21271 APInt MinInt, MaxInt;
21272 if (IsSigned) {
21273 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
21274 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
21275 } else {
21276 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
21277 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
21278 }
21279
21280 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21281 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21282
21283 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21284 MinInt, IsSigned, APFloat::rmTowardZero);
21285 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21286 MaxInt, IsSigned, APFloat::rmTowardZero);
21287 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21288 && !(MaxStatus & APFloat::opStatus::opInexact);
21289
21290 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21291 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21292
21293 // If the integer bounds are exactly representable as floats, emit a
21294 // min+max+fptoi sequence. Otherwise use comparisons and selects.
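  // For example, a signed f32 -> i8 saturating conversion has exact bounds
  // (-128.0 and 127.0), so the value is clamped (typically MAXSS/MINSS),
  // converted with CVTTSS2SI into a 32-bit temporary, and truncated to i8; a
  // NaN input converts to 0x80000000 and truncates to 0.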
21295 if (AreExactFloatBounds) {
21296 if (DstVT != TmpVT) {
21297 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21298 SDValue MinClamped = DAG.getNode(
21299 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21300 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21301 SDValue BothClamped = DAG.getNode(
21302 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21303 // Convert clamped value to integer.
21304 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21305
21306 // NaN will become INDVAL, with the top bit set and the rest zero.
21307 // Truncation will discard the top bit, resulting in zero.
21308 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21309 }
21310
21311 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21312 SDValue MinClamped = DAG.getNode(
21313 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21314 // Clamp by MaxFloat from above. NaN cannot occur.
21315 SDValue BothClamped = DAG.getNode(
21316 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21317 // Convert clamped value to integer.
21318 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21319
21320 if (!IsSigned) {
21321 // In the unsigned case we're done, because we mapped NaN to MinFloat,
21322 // which is zero.
21323 return FpToInt;
21324 }
21325
21326 // Otherwise, select zero if Src is NaN.
21327 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21328 return DAG.getSelectCC(
21329 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21330 }
21331
21332 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21333 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21334
21335 // Result of direct conversion, which may be selected away.
21336 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21337
21338 if (DstVT != TmpVT) {
21339 // NaN will become INDVAL, with the top bit set and the rest zero.
21340 // Truncation will discard the top bit, resulting in zero.
21341 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21342 }
21343
21344 SDValue Select = FpToInt;
21345 // For signed conversions where we saturate to the same size as the
21346 // result type of the fptoi instructions, INDVAL coincides with integer
21347 // minimum, so we don't need to explicitly check it.
21348 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21349 // If Src ULT MinFloat, select MinInt. In particular, this also selects
21350 // MinInt if Src is NaN.
21351 Select = DAG.getSelectCC(
21352 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21353 }
21354
21355 // If Src OGT MaxFloat, select MaxInt.
21356 Select = DAG.getSelectCC(
21357 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21358
21359 // In the unsigned case we are done, because we mapped NaN to MinInt, which
21360 // is already zero. The promoted case was already handled above.
21361 if (!IsSigned || DstVT != TmpVT) {
21362 return Select;
21363 }
21364
21365 // Otherwise, select 0 if Src is NaN.
21366 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21367 return DAG.getSelectCC(
21368 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21369}
21370
21371SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21372 bool IsStrict = Op->isStrictFPOpcode();
21373
21374 SDLoc DL(Op);
21375 MVT VT = Op.getSimpleValueType();
21376 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21377 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21378 MVT SVT = In.getSimpleValueType();
21379
21380 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
21381 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
21382 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
21383 !Subtarget.getTargetTriple().isOSDarwin()))
21384 return SDValue();
21385
21386 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
21387 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
21388 return Op;
21389
21390 if (SVT == MVT::f16) {
21391 if (Subtarget.hasFP16())
21392 return Op;
21393
21394 if (VT != MVT::f32) {
21395 if (IsStrict)
21396 return DAG.getNode(
21397 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
21398 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
21399 {MVT::f32, MVT::Other}, {Chain, In})});
21400
21401 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
21402 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
21403 }
21404
21405 if (!Subtarget.hasF16C()) {
21406 if (!Subtarget.getTargetTriple().isOSDarwin())
21407 return SDValue();
21408
21409 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
21410
21411 // Need a libcall, but ABI for f16 is soft-float on MacOS.
21412       TargetLowering::CallLoweringInfo CLI(DAG);
21413       Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21414
21415 In = DAG.getBitcast(MVT::i16, In);
21416       TargetLowering::ArgListTy Args;
21417       TargetLowering::ArgListEntry Entry;
21418       Entry.Node = In;
21419 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
21420 Entry.IsSExt = false;
21421 Entry.IsZExt = true;
21422 Args.push_back(Entry);
21423
21424       SDValue Callee = DAG.getExternalSymbol(
21425           getLibcallName(RTLIB::FPEXT_F16_F32),
21426           getPointerTy(DAG.getDataLayout()));
21427 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21428 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
21429 std::move(Args));
21430
21431 SDValue Res;
21432 std::tie(Res,Chain) = LowerCallTo(CLI);
21433 if (IsStrict)
21434 Res = DAG.getMergeValues({Res, Chain}, DL);
21435
21436 return Res;
21437 }
21438
21439 In = DAG.getBitcast(MVT::i16, In);
21440 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
21441 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
21442 DAG.getIntPtrConstant(0, DL));
21443 SDValue Res;
21444 if (IsStrict) {
21445 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
21446 {Chain, In});
21447 Chain = Res.getValue(1);
21448 } else {
21449 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
21450 DAG.getTargetConstant(4, DL, MVT::i32));
21451 }
21452 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
21453 DAG.getIntPtrConstant(0, DL));
21454 if (IsStrict)
21455 return DAG.getMergeValues({Res, Chain}, DL);
21456 return Res;
21457 }
21458
21459 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
21460 return Op;
21461
21462 if (SVT.getVectorElementType() == MVT::f16) {
21463 if (Subtarget.hasFP16() && isTypeLegal(SVT))
21464 return Op;
21465 assert(Subtarget.hasF16C() && "Unexpected features!");
21466 if (SVT == MVT::v2f16)
21467 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
21468 DAG.getUNDEF(MVT::v2f16));
21469 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
21470 DAG.getUNDEF(MVT::v4f16));
21471 if (IsStrict)
21472 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21473 {Op->getOperand(0), Res});
21474 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21475 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
21476 return Op;
21477 }
21478
21479 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21480
21481 SDValue Res =
21482 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21483 if (IsStrict)
21484 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21485 {Op->getOperand(0), Res});
21486 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21487}
21488
21489SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21490 bool IsStrict = Op->isStrictFPOpcode();
21491
21492 SDLoc DL(Op);
21493 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21494 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21495 MVT VT = Op.getSimpleValueType();
21496 MVT SVT = In.getSimpleValueType();
21497
21498 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
21499 return SDValue();
21500
21501 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
21502 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
21503 if (!Subtarget.getTargetTriple().isOSDarwin())
21504 return SDValue();
21505
21506 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
21507     TargetLowering::CallLoweringInfo CLI(DAG);
21508     Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21509
21510     TargetLowering::ArgListTy Args;
21511     TargetLowering::ArgListEntry Entry;
21512     Entry.Node = In;
21513 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
21514 Entry.IsSExt = false;
21515 Entry.IsZExt = true;
21516 Args.push_back(Entry);
21517
21518     SDValue Callee = DAG.getExternalSymbol(
21519         getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
21520                                        : RTLIB::FPROUND_F32_F16),
21521         getPointerTy(DAG.getDataLayout()));
21522 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21523 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
21524 std::move(Args));
21525
21526 SDValue Res;
21527 std::tie(Res, Chain) = LowerCallTo(CLI);
21528
21529 Res = DAG.getBitcast(MVT::f16, Res);
21530
21531 if (IsStrict)
21532 Res = DAG.getMergeValues({Res, Chain}, DL);
21533
21534 return Res;
21535 }
21536
21537 if (VT.getScalarType() == MVT::bf16) {
21538 if (SVT.getScalarType() == MVT::f32 &&
21539 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
21540 Subtarget.hasAVXNECONVERT()))
21541 return Op;
21542 return SDValue();
21543 }
21544
21545 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
21546 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
21547 return SDValue();
21548
21549 if (VT.isVector())
21550 return Op;
21551
21552 SDValue Res;
21553 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
21554 MVT::i32);
21555 if (IsStrict) {
21556 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
21557 DAG.getConstantFP(0, DL, MVT::v4f32), In,
21558 DAG.getIntPtrConstant(0, DL));
21559 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
21560 {Chain, Res, Rnd});
21561 Chain = Res.getValue(1);
21562 } else {
21563 // FIXME: Should we use zeros for upper elements for non-strict?
21564 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
21565 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
21566 }
21567
21568 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
21569 DAG.getIntPtrConstant(0, DL));
21570 Res = DAG.getBitcast(MVT::f16, Res);
21571
21572 if (IsStrict)
21573 return DAG.getMergeValues({Res, Chain}, DL);
21574
21575 return Res;
21576 }
21577
21578 return Op;
21579}
21580
21582 bool IsStrict = Op->isStrictFPOpcode();
21583 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21584 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
21585 "Unexpected VT!");
21586
21587 SDLoc dl(Op);
21588 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21589 DAG.getConstant(0, dl, MVT::v8i16), Src,
21590 DAG.getIntPtrConstant(0, dl));
21591
21592 SDValue Chain;
21593 if (IsStrict) {
21594 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21595 {Op.getOperand(0), Res});
21596 Chain = Res.getValue(1);
21597 } else {
21598 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21599 }
21600
21601 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21602 DAG.getIntPtrConstant(0, dl));
21603
21604 if (IsStrict)
21605 return DAG.getMergeValues({Res, Chain}, dl);
21606
21607 return Res;
21608}
21609
21611 bool IsStrict = Op->isStrictFPOpcode();
21612 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21613 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
21614 "Unexpected VT!");
21615
21616 SDLoc dl(Op);
21617 SDValue Res, Chain;
21618 if (IsStrict) {
21619 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21620 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21621 DAG.getIntPtrConstant(0, dl));
21622 Res = DAG.getNode(
21623 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21624 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21625 Chain = Res.getValue(1);
21626 } else {
21627 // FIXME: Should we use zeros for upper elements for non-strict?
21628 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21629 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21630 DAG.getTargetConstant(4, dl, MVT::i32));
21631 }
21632
21633 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21634 DAG.getIntPtrConstant(0, dl));
21635
21636 if (IsStrict)
21637 return DAG.getMergeValues({Res, Chain}, dl);
21638
21639 return Res;
21640}
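
// The two scalar f16 helpers above funnel the value through the vector
// CVTPH2PS / CVTPS2PH instructions. A minimal standalone sketch of the same
// round trip using the F16C compiler intrinsics (assumes an F16C-capable
// build; the helper names are ours and purely illustrative):
#include <immintrin.h>
#include <cstdint>

// Widen one IEEE half (given as raw bits) to float via VCVTPH2PS on lane 0.
static inline float halfToFloatSketch(uint16_t H) {
  __m128i V = _mm_cvtsi32_si128(H);
  return _mm_cvtss_f32(_mm_cvtph_ps(V));
}

// Narrow a float to half bits via VCVTPS2PH. Immediate 4 (current MXCSR
// rounding) matches the target constant the lowering passes to CVTPS2PH.
static inline uint16_t floatToHalfSketch(float F) {
  __m128i V = _mm_cvtps_ph(_mm_set_ss(F), _MM_FROUND_CUR_DIRECTION);
  return (uint16_t)_mm_cvtsi128_si32(V);
}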
21641
21642SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
21643 SelectionDAG &DAG) const {
21644 SDLoc DL(Op);
21645
21646 MVT SVT = Op.getOperand(0).getSimpleValueType();
21647 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
21648 Subtarget.hasAVXNECONVERT())) {
21649 SDValue Res;
21650 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
21651 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
21652 Res = DAG.getBitcast(MVT::v8i16, Res);
21653 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
21654 DAG.getIntPtrConstant(0, DL));
21655 }
21656
21657 MakeLibCallOptions CallOptions;
21658 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
21659 SDValue Res =
21660 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
21661 return DAG.getBitcast(MVT::i16, Res);
21662}
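
// Without native BF16 converts, the lowering above falls back to a libcall;
// the conversion itself is an f32 with the low 16 mantissa bits rounded away.
// A rough scalar sketch of that rounding (round-to-nearest-even; NaN payload
// handling deliberately omitted; helper name is illustrative only):
#include <cstdint>
#include <cstring>

static inline uint16_t floatToBF16Sketch(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  uint32_t Lsb = (Bits >> 16) & 1;   // parity of the part we keep
  Bits += 0x7FFFu + Lsb;             // round half to even
  return (uint16_t)(Bits >> 16);     // bf16 = upper 16 bits of the f32
}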
21663
21664/// Depending on uarch and/or optimizing for size, we might prefer to use a
21665/// vector operation in place of the typical scalar operation.
21666 static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
21667 SelectionDAG &DAG,
21668 const X86Subtarget &Subtarget) {
21669 // If both operands have other uses, this is probably not profitable.
21670 SDValue LHS = Op.getOperand(0);
21671 SDValue RHS = Op.getOperand(1);
21672 if (!LHS.hasOneUse() && !RHS.hasOneUse())
21673 return Op;
21674
21675 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
21676 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
21677 if (IsFP && !Subtarget.hasSSE3())
21678 return Op;
21679 if (!IsFP && !Subtarget.hasSSSE3())
21680 return Op;
21681
21682 // Extract from a common vector.
21683 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21684 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21685 LHS.getOperand(0) != RHS.getOperand(0) ||
21686 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
21687 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
21688 !shouldUseHorizontalOp(true, DAG, Subtarget))
21689 return Op;
21690
21691 // Allow commuted 'hadd' ops.
21692 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
21693 unsigned HOpcode;
21694 switch (Op.getOpcode()) {
21695 // clang-format off
21696 case ISD::ADD: HOpcode = X86ISD::HADD; break;
21697 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
21698 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
21699 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
21700 default:
21701 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
21702 // clang-format on
21703 }
21704 unsigned LExtIndex = LHS.getConstantOperandVal(1);
21705 unsigned RExtIndex = RHS.getConstantOperandVal(1);
21706 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
21707 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
21708 std::swap(LExtIndex, RExtIndex);
21709
21710 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
21711 return Op;
21712
21713 SDValue X = LHS.getOperand(0);
21714 EVT VecVT = X.getValueType();
21715 unsigned BitWidth = VecVT.getSizeInBits();
21716 unsigned NumLanes = BitWidth / 128;
21717 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
21718 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
21719 "Not expecting illegal vector widths here");
21720
21721 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21722 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21723 if (BitWidth == 256 || BitWidth == 512) {
21724 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
21725 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
21726 LExtIndex %= NumEltsPerLane;
21727 }
21728
21729 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21730 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21731 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21732 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
21733 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
21734 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
21735 DAG.getIntPtrConstant(LExtIndex / 2, DL));
21736}
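
// The combine above turns "extract lane N (+/-) extract lane N+1 of one
// vector" into a single horizontal op plus one extract. Roughly, in intrinsic
// form (SSE3 assumed; these helpers are illustrative, not lowering code):
#include <pmmintrin.h>

// Scalar pattern that gets matched: v[0] + v[1] from a single source vector.
static inline float sumLowPairScalarSketch(__m128 V) {
  return _mm_cvtss_f32(V) +
         _mm_cvtss_f32(_mm_shuffle_ps(V, V, _MM_SHUFFLE(1, 1, 1, 1)));
}

// What it is rewritten to: haddps V, V and then read element 0.
static inline float sumLowPairHAddSketch(__m128 V) {
  return _mm_cvtss_f32(_mm_hadd_ps(V, V));
}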
21737
21738/// Depending on uarch and/or optimizing for size, we might prefer to use a
21739/// vector operation in place of the typical scalar operation.
21740SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21741 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
21742 "Only expecting float/double");
21743 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
21744}
21745
21746/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21747/// This mode isn't supported in hardware on X86. But as long as we aren't
21748/// compiling with trapping math, we can emulate this with
21749/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
21750 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
21751 SDValue N0 = Op.getOperand(0);
21752 SDLoc dl(Op);
21753 MVT VT = Op.getSimpleValueType();
21754
21755 // N0 += copysign(nextafter(0.5, 0.0), N0)
21757 bool Ignored;
21758 APFloat Point5Pred = APFloat(0.5f);
21759 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
21760 Point5Pred.next(/*nextDown*/true);
21761
21762 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
21763 DAG.getConstantFP(Point5Pred, dl, VT), N0);
21764 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
21765
21766 // Truncate the result to remove fraction.
21767 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
21768}
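
// A standalone sketch of the FROUND emulation described above, assuming
// non-trapping float math. It mirrors the formula from the comment,
// trunc(X + copysign(nextafter(0.5, 0.0), X)); it is not the DAG code itself.
#include <cmath>

static inline float roundHalfAwayFromZeroSketch(float X) {
  // Add the largest value strictly below 0.5, carrying X's sign, then chop.
  float Bias = std::copysign(std::nextafter(0.5f, 0.0f), X);
  return std::trunc(X + Bias);
}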
21769
21770/// The only differences between FABS and FNEG are the mask and the logic op.
21771/// FNEG also has a folding opportunity for FNEG(FABS(x)).
21772 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
21773 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
21774 "Wrong opcode for lowering FABS or FNEG.");
21775
21776 bool IsFABS = (Op.getOpcode() == ISD::FABS);
21777
21778 // If this is a FABS and it has an FNEG user, bail out to fold the combination
21779 // into an FNABS. We'll lower the FABS after that if it is still in use.
21780 if (IsFABS)
21781 for (SDNode *User : Op->uses())
21782 if (User->getOpcode() == ISD::FNEG)
21783 return Op;
21784
21785 SDLoc dl(Op);
21786 MVT VT = Op.getSimpleValueType();
21787
21788 bool IsF128 = (VT == MVT::f128);
21789 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
21791 "Unexpected type in LowerFABSorFNEG");
21792
21793 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
21794 // decide if we should generate a 16-byte constant mask when we only need 4 or
21795 // 8 bytes for the scalar case.
21796
21797 // There are no scalar bitwise logical SSE/AVX instructions, so we
21798 // generate a 16-byte vector constant and logic op even for the scalar case.
21799 // Using a 16-byte mask allows folding the load of the mask with
21800 // the logic op, so it can save (~4 bytes) on code size.
21801 bool IsFakeVector = !VT.isVector() && !IsF128;
21802 MVT LogicVT = VT;
21803 if (IsFakeVector)
21804 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21805 : (VT == MVT::f32) ? MVT::v4f32
21806 : MVT::v8f16;
21807
21808 unsigned EltBits = VT.getScalarSizeInBits();
21809 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
21810 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
21811 APInt::getSignMask(EltBits);
21813 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
21814
21815 SDValue Op0 = Op.getOperand(0);
21816 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
21817 unsigned LogicOp = IsFABS ? X86ISD::FAND :
21818 IsFNABS ? X86ISD::FOR :
21819 X86ISD::FXOR;
21820 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
21821
21822 if (VT.isVector() || IsF128)
21823 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21824
21825 // For the scalar case extend to a 128-bit vector, perform the logic op,
21826 // and extract the scalar result back out.
21827 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
21828 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21829 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
21830 DAG.getIntPtrConstant(0, dl));
21831}
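
// At the bit level the lowering above reduces FABS/FNEG/FNABS to one logic op
// with a constant mask. A scalar sketch of the three cases (helpers are
// illustrative only):
#include <cstdint>
#include <cstring>

static inline uint32_t f32BitsSketch(float F) {
  uint32_t B;
  std::memcpy(&B, &F, sizeof(B));
  return B;
}
static inline float f32FromBitsSketch(uint32_t B) {
  float F;
  std::memcpy(&F, &B, sizeof(F));
  return F;
}
static inline float fabsSketch(float X) {   // FAND with 0x7f...f
  return f32FromBitsSketch(f32BitsSketch(X) & 0x7FFFFFFFu);
}
static inline float fnegSketch(float X) {   // FXOR with 0x80...0
  return f32FromBitsSketch(f32BitsSketch(X) ^ 0x80000000u);
}
static inline float fnabsSketch(float X) {  // FOR with 0x80...0
  return f32FromBitsSketch(f32BitsSketch(X) | 0x80000000u);
}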
21832
21833 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
21834 SDValue Mag = Op.getOperand(0);
21835 SDValue Sign = Op.getOperand(1);
21836 SDLoc dl(Op);
21837
21838 // If the sign operand is smaller, extend it first.
21839 MVT VT = Op.getSimpleValueType();
21840 if (Sign.getSimpleValueType().bitsLT(VT))
21841 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
21842
21843 // And if it is bigger, shrink it first.
21844 if (Sign.getSimpleValueType().bitsGT(VT))
21845 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
21846 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
21847
21848 // At this point the operands and the result should have the same
21849 // type, and that won't be f80 since that is not custom lowered.
21850 bool IsF128 = (VT == MVT::f128);
21851 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
21853 "Unexpected type in LowerFCOPYSIGN");
21854
21856
21857 // Perform all scalar logic operations as 16-byte vectors because there are no
21858 // scalar FP logic instructions in SSE.
21859 // TODO: This isn't necessary. If we used scalar types, we might avoid some
21860 // unnecessary splats, but we might miss load folding opportunities. Should
21861 // this decision be based on OptimizeForSize?
21862 bool IsFakeVector = !VT.isVector() && !IsF128;
21863 MVT LogicVT = VT;
21864 if (IsFakeVector)
21865 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21866 : (VT == MVT::f32) ? MVT::v4f32
21867 : MVT::v8f16;
21868
21869 // The mask constants are automatically splatted for vector types.
21870 unsigned EltSizeInBits = VT.getScalarSizeInBits();
21871 SDValue SignMask = DAG.getConstantFP(
21872 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
21873 SDValue MagMask = DAG.getConstantFP(
21874 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
21875
21876 // First, clear all bits but the sign bit from the second operand (sign).
21877 if (IsFakeVector)
21878 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
21879 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
21880
21881 // Next, clear the sign bit from the first operand (magnitude).
21882 // TODO: If we had general constant folding for FP logic ops, this check
21883 // wouldn't be necessary.
21884 SDValue MagBits;
21885 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
21886 APFloat APF = Op0CN->getValueAPF();
21887 APF.clearSign();
21888 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
21889 } else {
21890 // If the magnitude operand wasn't a constant, we need to AND out the sign.
21891 if (IsFakeVector)
21892 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
21893 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
21894 }
21895
21896 // OR the magnitude value with the sign bit.
21897 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
21898 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
21899 DAG.getIntPtrConstant(0, dl));
21900}
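
// The copysign lowering above is three bitwise ops on the same two masks
// (sign bit and signed-max). A scalar sketch, purely illustrative:
#include <cstdint>
#include <cstring>

static inline double copysignViaMasksSketch(double Mag, double Sign) {
  uint64_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sign, sizeof(S));
  const uint64_t SignMask = 0x8000000000000000ull;  // APInt::getSignMask(64)
  const uint64_t MagMask = 0x7FFFFFFFFFFFFFFFull;   // APInt::getSignedMaxValue(64)
  uint64_t R = (M & MagMask) | (S & SignMask);      // FAND, FAND, FOR
  double Res;
  std::memcpy(&Res, &R, sizeof(Res));
  return Res;
}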
21901
21902 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
21903 SDValue N0 = Op.getOperand(0);
21904 SDLoc dl(Op);
21905 MVT VT = Op.getSimpleValueType();
21906
21907 MVT OpVT = N0.getSimpleValueType();
21908 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
21909 "Unexpected type for FGETSIGN");
21910
21911 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
21912 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
21913 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
21914 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
21915 Res = DAG.getZExtOrTrunc(Res, dl, VT);
21916 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
21917 return Res;
21918}
21919
21920/// Helper for attempting to create a X86ISD::BT node.
21921static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
21922 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
21923 // instruction. Since the shift amount is in-range-or-undefined, we know
21924 // that doing a bittest on the i32 value is ok. We extend to i32 because
21925 // the encoding for the i16 version is larger than the i32 version.
21926 // Also promote i16 to i32 for performance / code size reasons.
21927 if (Src.getValueType().getScalarSizeInBits() < 32)
21928 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
21929
21930 // No legal type found, give up.
21931 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
21932 return SDValue();
21933
21934 // See if we can use the 32-bit instruction instead of the 64-bit one for a
21935 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
21936 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
21937 // known to be zero.
21938 if (Src.getValueType() == MVT::i64 &&
21939 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
21940 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
21941
21942 // If the operand types disagree, extend the shift amount to match. Since
21943 // BT ignores high bits (like shifts) we can use anyextend.
21944 if (Src.getValueType() != BitNo.getValueType()) {
21945 // Peek through a mask/modulo operation.
21946 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
21947 // we probably need a better IsDesirableToPromoteOp to handle this as well.
21948 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
21949 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
21950 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
21951 BitNo.getOperand(0)),
21952 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
21953 BitNo.getOperand(1)));
21954 else
21955 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
21956 }
21957
21958 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
21959}
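
// For reference, the flag produced by the BT node built above is simply the
// selected bit of the source, with the bit index taken modulo the operand
// width. A scalar sketch of the 32-bit form (illustrative helper):
#include <cstdint>

static inline bool bitTest32Sketch(uint32_t Src, uint32_t BitNo) {
  return ((Src >> (BitNo & 31)) & 1u) != 0;  // CF after `bt Src, BitNo`
}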
21960
21961/// Helper for creating a X86ISD::SETCC node.
21962 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
21963 SelectionDAG &DAG) {
21964 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
21965 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
21966}
21967
21968/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
21969/// recognizable memcmp expansion.
21970static bool isOrXorXorTree(SDValue X, bool Root = true) {
21971 if (X.getOpcode() == ISD::OR)
21972 return isOrXorXorTree(X.getOperand(0), false) &&
21973 isOrXorXorTree(X.getOperand(1), false);
21974 if (Root)
21975 return false;
21976 return X.getOpcode() == ISD::XOR;
21977}
21978
21979/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
21980/// expansion.
21981template <typename F>
21982 static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
21983 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
21984 SDValue Op0 = X.getOperand(0);
21985 SDValue Op1 = X.getOperand(1);
21986 if (X.getOpcode() == ISD::OR) {
21987 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
21988 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
21989 if (VecVT != CmpVT)
21990 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
21991 if (HasPT)
21992 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
21993 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
21994 }
21995 if (X.getOpcode() == ISD::XOR) {
21996 SDValue A = SToV(Op0);
21997 SDValue B = SToV(Op1);
21998 if (VecVT != CmpVT)
21999 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22000 if (HasPT)
22001 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22002 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22003 }
22004 llvm_unreachable("Impossible");
22005}
22006
22007/// Try to map a 128-bit or larger integer comparison to vector instructions
22008/// before type legalization splits it up into chunks.
22009 static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22010 ISD::CondCode CC,
22011 const SDLoc &DL,
22012 SelectionDAG &DAG,
22013 const X86Subtarget &Subtarget) {
22014 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22015
22016 // We're looking for an oversized integer equality comparison.
22017 EVT OpVT = X.getValueType();
22018 unsigned OpSize = OpVT.getSizeInBits();
22019 if (!OpVT.isScalarInteger() || OpSize < 128)
22020 return SDValue();
22021
22022 // Ignore a comparison with zero because that gets special treatment in
22023 // EmitTest(). But make an exception for the special case of a pair of
22024 // logically-combined vector-sized operands compared to zero. This pattern may
22025 // be generated by the memcmp expansion pass with oversized integer compares
22026 // (see PR33325).
22027 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22028 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22029 return SDValue();
22030
22031 // Don't perform this combine if constructing the vector will be expensive.
22032 auto IsVectorBitCastCheap = [](SDValue X) {
22033 X = peekThroughBitcasts(X);
22034 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22035 X.getOpcode() == ISD::LOAD;
22036 };
22037 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22038 !IsOrXorXorTreeCCZero)
22039 return SDValue();
22040
22041 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22042 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22043 // Otherwise use PCMPEQ (plus AND) and mask testing.
22044 bool NoImplicitFloatOps =
22045 DAG.getMachineFunction().getFunction().hasFnAttribute(
22046 Attribute::NoImplicitFloat);
22047 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22048 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22049 (OpSize == 256 && Subtarget.hasAVX()) ||
22050 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22051 bool HasPT = Subtarget.hasSSE41();
22052
22053 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22054 // vector registers are essentially free. (Technically, widening registers
22055 // prevents load folding, but the tradeoff is worth it.)
22056 bool PreferKOT = Subtarget.preferMaskRegisters();
22057 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22058
22059 EVT VecVT = MVT::v16i8;
22060 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22061 if (OpSize == 256) {
22062 VecVT = MVT::v32i8;
22063 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22064 }
22065 EVT CastVT = VecVT;
22066 bool NeedsAVX512FCast = false;
22067 if (OpSize == 512 || NeedZExt) {
22068 if (Subtarget.hasBWI()) {
22069 VecVT = MVT::v64i8;
22070 CmpVT = MVT::v64i1;
22071 if (OpSize == 512)
22072 CastVT = VecVT;
22073 } else {
22074 VecVT = MVT::v16i32;
22075 CmpVT = MVT::v16i1;
22076 CastVT = OpSize == 512 ? VecVT
22077 : OpSize == 256 ? MVT::v8i32
22078 : MVT::v4i32;
22079 NeedsAVX512FCast = true;
22080 }
22081 }
22082
22083 auto ScalarToVector = [&](SDValue X) -> SDValue {
22084 bool TmpZext = false;
22085 EVT TmpCastVT = CastVT;
22086 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22087 SDValue OrigX = X.getOperand(0);
22088 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22089 if (OrigSize < OpSize) {
22090 if (OrigSize == 128) {
22091 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22092 X = OrigX;
22093 TmpZext = true;
22094 } else if (OrigSize == 256) {
22095 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22096 X = OrigX;
22097 TmpZext = true;
22098 }
22099 }
22100 }
22101 X = DAG.getBitcast(TmpCastVT, X);
22102 if (!NeedZExt && !TmpZext)
22103 return X;
22104 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22105 DAG.getConstant(0, DL, VecVT), X,
22106 DAG.getVectorIdxConstant(0, DL));
22107 };
22108
22109 SDValue Cmp;
22110 if (IsOrXorXorTreeCCZero) {
22111 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22112 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22113 // Use 2 vector equality compares and 'and' the results before doing a
22114 // MOVMSK.
22115 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22116 } else {
22117 SDValue VecX = ScalarToVector(X);
22118 SDValue VecY = ScalarToVector(Y);
22119 if (VecVT != CmpVT) {
22120 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22121 } else if (HasPT) {
22122 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22123 } else {
22124 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22125 }
22126 }
22127 // AVX512 should emit a setcc that will lower to kortest.
22128 if (VecVT != CmpVT) {
22129 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22130 : CmpVT == MVT::v32i1 ? MVT::i32
22131 : MVT::i16;
22132 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22133 DAG.getConstant(0, DL, KRegVT), CC);
22134 }
22135 if (HasPT) {
22136 SDValue BCCmp =
22137 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22138 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22139 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22140 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22141 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22142 }
22143 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22144 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22145 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22146 assert(Cmp.getValueType() == MVT::v16i8 &&
22147 "Non 128-bit vector on pre-SSE41 target");
22148 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22149 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22150 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22151 }
22152
22153 return SDValue();
22154}
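
// The OR-of-XOR tree recognized above is the shape the memcmp expansion pass
// emits for an oversized equality compare. A rough scalar picture of a
// 32-byte compare before this combine vectorizes it (illustrative only):
#include <cstdint>
#include <cstring>

static inline bool blocks32EqualSketch(const void *A, const void *B) {
  uint64_t AW[4], BW[4];
  std::memcpy(AW, A, sizeof(AW));
  std::memcpy(BW, B, sizeof(BW));
  uint64_t Acc = 0;
  for (int I = 0; I != 4; ++I)
    Acc |= AW[I] ^ BW[I];    // or (xor a0,b0), (xor a1,b1), ...
  return Acc == 0;           // setcc iN ..., 0, eq
}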
22155
22156/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22157/// style scalarized (associative) reduction patterns. Partial reductions
22158/// are supported when the pointer SrcMask is non-null.
22159/// TODO - move this to SelectionDAG?
22160 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22161 SmallVectorImpl<SDValue> &SrcOps,
22162 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22163 SmallVector<SDValue, 8> Opnds;
22164 DenseMap<SDValue, APInt> SrcOpMap;
22165 EVT VT = MVT::Other;
22166
22167 // Recognize a special case where a vector is cast into a wide integer to
22168 // test all 0s.
22169 assert(Op.getOpcode() == unsigned(BinOp) &&
22170 "Unexpected bit reduction opcode");
22171 Opnds.push_back(Op.getOperand(0));
22172 Opnds.push_back(Op.getOperand(1));
22173
22174 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22175 SDValue I = Opnds[Slot];
22176 // BFS traverse all BinOp operands.
22177 if (I->getOpcode() == unsigned(BinOp)) {
22178 Opnds.push_back(I->getOperand(0));
22179 Opnds.push_back(I->getOperand(1));
22180 // Re-evaluate the number of nodes to be traversed.
22181 e += 2; // 2 more nodes (LHS and RHS) are pushed.
22182 continue;
22183 }
22184
22185 // Quit if not an EXTRACT_VECTOR_ELT.
22186 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22187 return false;
22188
22189 // Quit if the index is not a constant.
22190 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22191 if (!Idx)
22192 return false;
22193
22194 SDValue Src = I->getOperand(0);
22195 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22196 if (M == SrcOpMap.end()) {
22197 VT = Src.getValueType();
22198 // Quit if not the same type.
22199 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22200 return false;
22201 unsigned NumElts = VT.getVectorNumElements();
22202 APInt EltCount = APInt::getZero(NumElts);
22203 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22204 SrcOps.push_back(Src);
22205 }
22206
22207 // Quit if element already used.
22208 unsigned CIdx = Idx->getZExtValue();
22209 if (M->second[CIdx])
22210 return false;
22211 M->second.setBit(CIdx);
22212 }
22213
22214 if (SrcMask) {
22215 // Collect the source partial masks.
22216 for (SDValue &SrcOp : SrcOps)
22217 SrcMask->push_back(SrcOpMap[SrcOp]);
22218 } else {
22219 // Quit if not all elements are used.
22220 for (const auto &I : SrcOpMap)
22221 if (!I.second.isAllOnes())
22222 return false;
22223 }
22224
22225 return true;
22226}
22227
22228// Helper function for comparing all bits of two vectors.
22229 static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
22230 ISD::CondCode CC, const APInt &OriginalMask,
22231 const X86Subtarget &Subtarget,
22232 SelectionDAG &DAG, X86::CondCode &X86CC) {
22233 EVT VT = LHS.getValueType();
22234 unsigned ScalarSize = VT.getScalarSizeInBits();
22235 if (OriginalMask.getBitWidth() != ScalarSize) {
22236 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22237 return SDValue();
22238 }
22239
22240 // Quit if not convertible to legal scalar or 128/256-bit vector.
22241 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22242 return SDValue();
22243
22244 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
22245 if (VT.isFloatingPoint())
22246 return SDValue();
22247
22248 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22249 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22250
22251 APInt Mask = OriginalMask;
22252
22253 auto MaskBits = [&](SDValue Src) {
22254 if (Mask.isAllOnes())
22255 return Src;
22256 EVT SrcVT = Src.getValueType();
22257 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22258 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22259 };
22260
22261 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22262 if (VT.getSizeInBits() < 128) {
22263 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22264 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
22265 if (IntVT != MVT::i64)
22266 return SDValue();
22267 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
22268 MVT::i32, MVT::i32);
22269 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
22270 MVT::i32, MVT::i32);
22271 SDValue Lo =
22272 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
22273 SDValue Hi =
22274 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
22275 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22276 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
22277 DAG.getConstant(0, DL, MVT::i32));
22278 }
22279 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22280 DAG.getBitcast(IntVT, MaskBits(LHS)),
22281 DAG.getBitcast(IntVT, MaskBits(RHS)));
22282 }
22283
22284 // Without PTEST, a masked v2i64 or-reduction is not faster than
22285 // scalarization.
22286 bool UseKORTEST = Subtarget.useAVX512Regs();
22287 bool UsePTEST = Subtarget.hasSSE41();
22288 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
22289 return SDValue();
22290
22291 // Split down to 128/256/512-bit vector.
22292 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
22293
22294 // If the input vector has vector elements wider than the target test size,
22295 // then cast to <X x i64> so it will safely split.
22296 if (ScalarSize > TestSize) {
22297 if (!Mask.isAllOnes())
22298 return SDValue();
22299 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
22300 LHS = DAG.getBitcast(VT, LHS);
22301 RHS = DAG.getBitcast(VT, RHS);
22302 Mask = APInt::getAllOnes(64);
22303 }
22304
22305 if (VT.getSizeInBits() > TestSize) {
22306 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
22307 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
22308 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
22309 while (VT.getSizeInBits() > TestSize) {
22310 auto Split = DAG.SplitVector(LHS, DL);
22311 VT = Split.first.getValueType();
22312 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22313 }
22314 RHS = DAG.getAllOnesConstant(DL, VT);
22315 } else if (!UsePTEST && !KnownRHS.isZero()) {
22316 // MOVMSK Special Case:
22317 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
22318 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
22319 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
22320 LHS = DAG.getBitcast(VT, MaskBits(LHS));
22321 RHS = DAG.getBitcast(VT, MaskBits(RHS));
22322 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
22323 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
22324 V = DAG.getSExtOrTrunc(V, DL, VT);
22325 while (VT.getSizeInBits() > TestSize) {
22326 auto Split = DAG.SplitVector(V, DL);
22327 VT = Split.first.getValueType();
22328 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22329 }
22330 V = DAG.getNOT(DL, V, VT);
22331 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22332 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22333 DAG.getConstant(0, DL, MVT::i32));
22334 } else {
22335 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
22336 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
22337 while (VT.getSizeInBits() > TestSize) {
22338 auto Split = DAG.SplitVector(V, DL);
22339 VT = Split.first.getValueType();
22340 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22341 }
22342 LHS = V;
22343 RHS = DAG.getConstant(0, DL, VT);
22344 }
22345 }
22346
22347 if (UseKORTEST && VT.is512BitVector()) {
22348 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
22349 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
22350 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22351 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22352 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
22353 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
22354 }
22355
22356 if (UsePTEST) {
22357 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
22358 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22359 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22360 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
22361 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22362 }
22363
22364 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
22365 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
22366 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
22367 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
22368 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
22369 V = DAG.getNOT(DL, V, MaskVT);
22370 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22371 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22372 DAG.getConstant(0, DL, MVT::i32));
22373}
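
// The pre-SSE4.1 tail above (PCMPEQ + MOVMSK compared against 0xFFFF) is the
// classic 16-byte equality idiom. In intrinsic form, assuming SSE2
// (illustrative helper only):
#include <emmintrin.h>

static inline bool equal16Sketch(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128((const __m128i *)A);
  __m128i VB = _mm_loadu_si128((const __m128i *)B);
  __m128i Eq = _mm_cmpeq_epi8(VA, VB);     // pcmpeqb
  return _mm_movemask_epi8(Eq) == 0xFFFF;  // pmovmskb + cmp
}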
22374
22375 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
22376// to CMP(MOVMSK(PCMPEQB(X,Y))).
22377 static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
22378 ISD::CondCode CC, const SDLoc &DL,
22379 const X86Subtarget &Subtarget,
22380 SelectionDAG &DAG,
22381 X86::CondCode &X86CC) {
22382 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22383
22384 bool CmpNull = isNullConstant(RHS);
22385 bool CmpAllOnes = isAllOnesConstant(RHS);
22386 if (!CmpNull && !CmpAllOnes)
22387 return SDValue();
22388
22389 SDValue Op = LHS;
22390 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22391 return SDValue();
22392
22393 // Check whether we're masking/truncating an OR-reduction result, in which
22394 // case track the masked bits.
22395 // TODO: Add CmpAllOnes support.
22396 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
22397 if (CmpNull) {
22398 switch (Op.getOpcode()) {
22399 case ISD::TRUNCATE: {
22400 SDValue Src = Op.getOperand(0);
22401 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22402 Op.getScalarValueSizeInBits());
22403 Op = Src;
22404 break;
22405 }
22406 case ISD::AND: {
22407 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22408 Mask = Cst->getAPIntValue();
22409 Op = Op.getOperand(0);
22410 }
22411 break;
22412 }
22413 }
22414 }
22415
22416 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
22417
22418 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
22419 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
22420 SmallVector<SDValue, 8> VecIns;
22421 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
22422 EVT VT = VecIns[0].getValueType();
22423 assert(llvm::all_of(VecIns,
22424 [VT](SDValue V) { return VT == V.getValueType(); }) &&
22425 "Reduction source vector mismatch");
22426
22427 // Quit if not splittable to scalar/128/256/512-bit vector.
22428 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22429 return SDValue();
22430
22431 // If more than one full vector is evaluated, AND/OR them first before
22432 // PTEST.
22433 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22434 Slot += 2, e += 1) {
22435 // Each iteration will AND/OR 2 nodes and append the result until there is
22436 // only 1 node left, i.e. the final value of all vectors.
22437 SDValue LHS = VecIns[Slot];
22438 SDValue RHS = VecIns[Slot + 1];
22439 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
22440 }
22441
22442 return LowerVectorAllEqual(DL, VecIns.back(),
22443 CmpNull ? DAG.getConstant(0, DL, VT)
22444 : DAG.getAllOnesConstant(DL, VT),
22445 CC, Mask, Subtarget, DAG, X86CC);
22446 }
22447
22448 // Match icmp(reduce_or(X),0) anyof reduction patterns.
22449 // Match icmp(reduce_and(X),-1) allof reduction patterns.
22450 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22451 ISD::NodeType BinOp;
22452 if (SDValue Match =
22453 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
22454 EVT MatchVT = Match.getValueType();
22455 return LowerVectorAllEqual(DL, Match,
22456 CmpNull ? DAG.getConstant(0, DL, MatchVT)
22457 : DAG.getAllOnesConstant(DL, MatchVT),
22458 CC, Mask, Subtarget, DAG, X86CC);
22459 }
22460 }
22461
22462 if (Mask.isAllOnes()) {
22463 assert(!Op.getValueType().isVector() &&
22464 "Illegal vector type for reduction pattern");
22465 SDValue Src = peekThroughBitcasts(Op);
22466 if (Src.getValueType().isFixedLengthVector() &&
22467 Src.getValueType().getScalarType() == MVT::i1) {
22468 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
22469 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
22470 if (Src.getOpcode() == ISD::SETCC) {
22471 SDValue LHS = Src.getOperand(0);
22472 SDValue RHS = Src.getOperand(1);
22473 EVT LHSVT = LHS.getValueType();
22474 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
22475 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
22476 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
22477 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
22478 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
22479 X86CC);
22480 }
22481 }
22482 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
22483 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
22484 // Peek through truncation, mask the LSB and compare against zero/LSB.
22485 if (Src.getOpcode() == ISD::TRUNCATE) {
22486 SDValue Inner = Src.getOperand(0);
22487 EVT InnerVT = Inner.getValueType();
22488 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
22489 unsigned BW = InnerVT.getScalarSizeInBits();
22490 APInt SrcMask = APInt(BW, 1);
22491 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
22492 return LowerVectorAllEqual(DL, Inner,
22493 DAG.getConstant(Cmp, DL, InnerVT), CC,
22494 SrcMask, Subtarget, DAG, X86CC);
22495 }
22496 }
22497 }
22498 }
22499
22500 return SDValue();
22501}
22502
22503/// return true if \c Op has a use that doesn't just read flags.
22504 static bool hasNonFlagsUse(SDValue Op) {
22505 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22506 ++UI) {
22507 SDNode *User = *UI;
22508 unsigned UOpNo = UI.getOperandNo();
22509 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22510 // Look past the truncate.
22511 UOpNo = User->use_begin().getOperandNo();
22512 User = *User->use_begin();
22513 }
22514
22515 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22516 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22517 return true;
22518 }
22519 return false;
22520}
22521
22522// Transform to an x86-specific ALU node with flags if there is a chance of
22523// using an RMW op or only the flags are used. Otherwise, leave
22524// the node alone and emit a 'cmp' or 'test' instruction.
22525 static bool isProfitableToUseFlagOp(SDValue Op) {
22526 for (SDNode *U : Op->uses())
22527 if (U->getOpcode() != ISD::CopyToReg &&
22528 U->getOpcode() != ISD::SETCC &&
22529 U->getOpcode() != ISD::STORE)
22530 return false;
22531
22532 return true;
22533}
22534
22535/// Emit nodes that will be selected as "test Op0,Op0", or something
22536/// equivalent.
22537static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22538 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22539 // CF and OF aren't always set the way we want. Determine which
22540 // of these we need.
22541 bool NeedCF = false;
22542 bool NeedOF = false;
22543 switch (X86CC) {
22544 default: break;
22545 case X86::COND_A: case X86::COND_AE:
22546 case X86::COND_B: case X86::COND_BE:
22547 NeedCF = true;
22548 break;
22549 case X86::COND_G: case X86::COND_GE:
22550 case X86::COND_L: case X86::COND_LE:
22551 case X86::COND_O: case X86::COND_NO: {
22552 // Check if we really need to set the
22553 // Overflow flag. If NoSignedWrap is present,
22554 // that is not actually needed.
22555 switch (Op->getOpcode()) {
22556 case ISD::ADD:
22557 case ISD::SUB:
22558 case ISD::MUL:
22559 case ISD::SHL:
22560 if (Op.getNode()->getFlags().hasNoSignedWrap())
22561 break;
22562 [[fallthrough]];
22563 default:
22564 NeedOF = true;
22565 break;
22566 }
22567 break;
22568 }
22569 }
22570 // See if we can use the EFLAGS value from the operand instead of
22571 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22572 // we prove that the arithmetic won't overflow, we can't use OF or CF.
22573 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22574 // Emit a CMP with 0, which is the TEST pattern.
22575 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22576 DAG.getConstant(0, dl, Op.getValueType()));
22577 }
22578 unsigned Opcode = 0;
22579 unsigned NumOperands = 0;
22580
22581 SDValue ArithOp = Op;
22582
22583 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
22584 // which may be the result of a CAST. We use the variable 'Op', which is the
22585 // non-casted variable when we check for possible users.
22586 switch (ArithOp.getOpcode()) {
22587 case ISD::AND:
22588 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22589 // because a TEST instruction will be better.
22590 if (!hasNonFlagsUse(Op))
22591 break;
22592
22593 [[fallthrough]];
22594 case ISD::ADD:
22595 case ISD::SUB:
22596 case ISD::OR:
22597 case ISD::XOR:
22598 if (!isProfitableToUseFlagOp(Op))
22599 break;
22600
22601 // Otherwise use a regular EFLAGS-setting instruction.
22602 switch (ArithOp.getOpcode()) {
22603 // clang-format off
22604 default: llvm_unreachable("unexpected operator!");
22605 case ISD::ADD: Opcode = X86ISD::ADD; break;
22606 case ISD::SUB: Opcode = X86ISD::SUB; break;
22607 case ISD::XOR: Opcode = X86ISD::XOR; break;
22608 case ISD::AND: Opcode = X86ISD::AND; break;
22609 case ISD::OR: Opcode = X86ISD::OR; break;
22610 // clang-format on
22611 }
22612
22613 NumOperands = 2;
22614 break;
22615 case X86ISD::ADD:
22616 case X86ISD::SUB:
22617 case X86ISD::OR:
22618 case X86ISD::XOR:
22619 case X86ISD::AND:
22620 return SDValue(Op.getNode(), 1);
22621 case ISD::SSUBO:
22622 case ISD::USUBO: {
22623 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
22624 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22625 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22626 Op->getOperand(1)).getValue(1);
22627 }
22628 default:
22629 break;
22630 }
22631
22632 if (Opcode == 0) {
22633 // Emit a CMP with 0, which is the TEST pattern.
22634 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22635 DAG.getConstant(0, dl, Op.getValueType()));
22636 }
22637 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22638 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22639
22640 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22641 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22642 return SDValue(New.getNode(), 1);
22643}
22644
22645/// Emit nodes that will be selected as "cmp Op0,Op1", or something
22646/// equivalent.
22647static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22648 const SDLoc &dl, SelectionDAG &DAG,
22649 const X86Subtarget &Subtarget) {
22650 if (isNullConstant(Op1))
22651 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22652
22653 EVT CmpVT = Op0.getValueType();
22654
22655 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
22656 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
22657
22658 // Only promote the compare up to I32 if it is a 16 bit operation
22659 // with an immediate. 16 bit immediates are to be avoided.
22660 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
22661 !DAG.getMachineFunction().getFunction().hasMinSize()) {
22662 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
22663 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
22664 // Don't do this if the immediate can fit in 8-bits.
22665 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22666 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22667 unsigned ExtendOp =
22668 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22669 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
22670 // For equality comparisons try to use SIGN_EXTEND if the input was
22671 // truncate from something with enough sign bits.
22672 if (Op0.getOpcode() == ISD::TRUNCATE) {
22673 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
22674 ExtendOp = ISD::SIGN_EXTEND;
22675 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
22676 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
22677 ExtendOp = ISD::SIGN_EXTEND;
22678 }
22679 }
22680
22681 CmpVT = MVT::i32;
22682 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
22683 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
22684 }
22685 }
22686
22687 // Try to shrink i64 compares if the input has enough zero bits.
22688 // FIXME: Do this for non-constant compares for constant on LHS?
22689 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
22690 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
22691 Op1->getAsAPIntVal().getActiveBits() <= 32 &&
22692 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
22693 CmpVT = MVT::i32;
22694 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
22695 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
22696 }
22697
22698 // 0-x == y --> x+y == 0
22699 // 0-x != y --> x+y != 0
22700 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
22701 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22702 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22703 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
22704 return Add.getValue(1);
22705 }
22706
22707 // x == 0-y --> x+y == 0
22708 // x != 0-y --> x+y != 0
22709 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
22710 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22711 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22712 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
22713 return Add.getValue(1);
22714 }
22715
22716 // Use SUB instead of CMP to enable CSE between SUB and CMP.
22717 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22718 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
22719 return Sub.getValue(1);
22720}
22721
22722 bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
22723 EVT VT) const {
22724 return !VT.isVector() || Cond != ISD::CondCode::SETEQ;
22725}
22726
22727bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
22728 SDNode *N, SDValue, SDValue IntPow2) const {
22729 if (N->getOpcode() == ISD::FDIV)
22730 return true;
22731
22732 EVT FPVT = N->getValueType(0);
22733 EVT IntVT = IntPow2.getValueType();
22734
22735 // This indicates a non-free bitcast.
22736 // TODO: This is probably overly conservative as we will need to scale the
22737 // integer vector anyways for the int->fp cast.
22738 if (FPVT.isVector() &&
22739 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
22740 return false;
22741
22742 return true;
22743}
22744
22745/// Check if replacement of SQRT with RSQRT should be disabled.
22746bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22747 EVT VT = Op.getValueType();
22748
22749 // We don't need to replace SQRT with RSQRT for half type.
22750 if (VT.getScalarType() == MVT::f16)
22751 return true;
22752
22753 // We never want to use both SQRT and RSQRT instructions for the same input.
22754 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22755 return false;
22756
22757 if (VT.isVector())
22758 return Subtarget.hasFastVectorFSQRT();
22759 return Subtarget.hasFastScalarFSQRT();
22760}
22761
22762/// The minimum architected relative accuracy is 2^-12. We need one
22763/// Newton-Raphson step to have a good float result (24 bits of precision).
22764SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22765 SelectionDAG &DAG, int Enabled,
22766 int &RefinementSteps,
22767 bool &UseOneConstNR,
22768 bool Reciprocal) const {
22769 SDLoc DL(Op);
22770 EVT VT = Op.getValueType();
22771
22772 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22773 // It is likely not profitable to do this for f64 because a double-precision
22774 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22775 // instructions: convert to single, rsqrtss, convert back to double, refine
22776 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22777 // along with FMA, this could be a throughput win.
22778 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22779 // after legalize types.
22780 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22781 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22782 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22783 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22784 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22785 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22786 RefinementSteps = 1;
22787
22788 UseOneConstNR = false;
22789 // There is no FSQRT for 512-bits, but there is RSQRT14.
22790 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22791 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
22792 if (RefinementSteps == 0 && !Reciprocal)
22793 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
22794 return Estimate;
22795 }
22796
22797 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22798 Subtarget.hasFP16()) {
22799 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
22800 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22801 RefinementSteps = 0;
22802
22803 if (VT == MVT::f16) {
22804 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22805 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22806 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22807 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
22808 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22809 }
22810
22811 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
22812 }
22813 return SDValue();
22814}
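
// One Newton-Raphson step applied to the ~12-bit RSQRT estimate recovers
// roughly 24 bits of precision. A single RefinementSteps iteration amounts to
// the standard rsqrt iteration sketched here (illustrative helper only):
static inline float refineRsqrtSketch(float A, float EstimateOfRsqrtA) {
  float E = EstimateOfRsqrtA;            // E ~ 1/sqrt(A), from rsqrtss/rsqrtps
  return E * (1.5f - 0.5f * A * E * E);  // one Newton-Raphson refinement
}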
22815
22816/// The minimum architected relative accuracy is 2^-12. We need one
22817/// Newton-Raphson step to have a good float result (24 bits of precision).
22818SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22819 int Enabled,
22820 int &RefinementSteps) const {
22821 SDLoc DL(Op);
22822 EVT VT = Op.getValueType();
22823
22824 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22825 // It is likely not profitable to do this for f64 because a double-precision
22826 // reciprocal estimate with refinement on x86 prior to FMA requires
22827 // 15 instructions: convert to single, rcpss, convert back to double, refine
22828 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22829 // along with FMA, this could be a throughput win.
22830
22831 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22832 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22833 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22834 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22835 // Enable estimate codegen with 1 refinement step for vector division.
22836 // Scalar division estimates are disabled because they break too much
22837 // real-world code. These defaults are intended to match GCC behavior.
22838 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22839 return SDValue();
22840
22841 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22842 RefinementSteps = 1;
22843
22844 // There is no FSQRT for 512-bits, but there is RCP14.
22845 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22846 return DAG.getNode(Opcode, DL, VT, Op);
22847 }
22848
22849 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22850 Subtarget.hasFP16()) {
22851 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22852 RefinementSteps = 0;
22853
22854 if (VT == MVT::f16) {
22855 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22856 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22857 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22858 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
22859 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22860 }
22861
22862 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
22863 }
22864 return SDValue();
22865}
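
// Likewise for the reciprocal estimate: one refinement step is the standard
// Newton-Raphson iteration for 1/A (illustrative helper only):
static inline float refineRecipSketch(float A, float EstimateOfRecipA) {
  float E = EstimateOfRecipA;  // E ~ 1/A, from rcpss/rcpps
  return E * (2.0f - A * E);   // one Newton-Raphson refinement
}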
22866
22867/// If we have at least two divisions that use the same divisor, convert to
22868/// multiplication by a reciprocal. This may need to be adjusted for a given
22869/// CPU if a division's cost is not at least twice the cost of a multiplication.
22870/// This is because we still need one division to calculate the reciprocal and
22871/// then we need two multiplies by that reciprocal as replacements for the
22872/// original divisions.
22873unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22874 return 2;
22875}
22876
22877SDValue
22878X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
22879 SelectionDAG &DAG,
22880 SmallVectorImpl<SDNode *> &Created) const {
22881 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
22882 if (isIntDivCheap(N->getValueType(0), Attr))
22883 return SDValue(N,0); // Lower SDIV as SDIV
22884
22885 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
22886 "Unexpected divisor!");
22887
22888 // Only perform this transform if CMOV is supported otherwise the select
22889 // below will become a branch.
22890 if (!Subtarget.canUseCMOV())
22891 return SDValue();
22892
22893 // fold (sdiv X, pow2)
22894 EVT VT = N->getValueType(0);
22895 // FIXME: Support i8.
22896 if (VT != MVT::i16 && VT != MVT::i32 &&
22897 !(Subtarget.is64Bit() && VT == MVT::i64))
22898 return SDValue();
22899
22900 // If the divisor is 2 or -2, the default expansion is better.
22901 if (Divisor == 2 ||
22902 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
22903 return SDValue();
22904
22905 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
22906}
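
// The CMOV-based expansion selected above computes "X < 0 ? X + (2^K - 1) : X"
// and then shifts right arithmetically, matching sdiv's round-toward-zero
// behaviour; a negated power-of-two divisor additionally negates the result.
// A scalar sketch for a positive power-of-two divisor (illustrative only):
#include <cstdint>

static inline int32_t sdivByPow2Sketch(int32_t X, unsigned K) {
  int32_t Biased = X < 0 ? X + ((1 << K) - 1) : X;  // the select becomes CMOV
  return Biased >> K;                               // arithmetic shift (sar)
}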
22907
22908/// Result of 'and' is compared against zero. Change to a BT node if possible.
22909/// Returns the BT node and the condition code needed to use it.
22910 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
22911 SelectionDAG &DAG, X86::CondCode &X86CC) {
22912 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
22913 SDValue Op0 = And.getOperand(0);
22914 SDValue Op1 = And.getOperand(1);
22915 if (Op0.getOpcode() == ISD::TRUNCATE)
22916 Op0 = Op0.getOperand(0);
22917 if (Op1.getOpcode() == ISD::TRUNCATE)
22918 Op1 = Op1.getOperand(0);
22919
22920 SDValue Src, BitNo;
22921 if (Op1.getOpcode() == ISD::SHL)
22922 std::swap(Op0, Op1);
22923 if (Op0.getOpcode() == ISD::SHL) {
22924 if (isOneConstant(Op0.getOperand(0))) {
22925 // If we looked past a truncate, check that it's only truncating away
22926 // known zeros.
22927 unsigned BitWidth = Op0.getValueSizeInBits();
22928 unsigned AndBitWidth = And.getValueSizeInBits();
22929 if (BitWidth > AndBitWidth) {
22930 KnownBits Known = DAG.computeKnownBits(Op0);
22931 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
22932 return SDValue();
22933 }
22934 Src = Op1;
22935 BitNo = Op0.getOperand(1);
22936 }
22937 } else if (Op1.getOpcode() == ISD::Constant) {
22938 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
22939 uint64_t AndRHSVal = AndRHS->getZExtValue();
22940 SDValue AndLHS = Op0;
22941
22942 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
22943 Src = AndLHS.getOperand(0);
22944 BitNo = AndLHS.getOperand(1);
22945 } else {
22946 // Use BT if the immediate can't be encoded in a TEST instruction or we
22947 // are optimizing for size and the immediate won't fit in a byte.
22948 bool OptForSize = DAG.shouldOptForSize();
22949 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
22950 isPowerOf2_64(AndRHSVal)) {
22951 Src = AndLHS;
22952 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
22953 Src.getValueType());
22954 }
22955 }
22956 }
22957
22958 // No patterns found, give up.
22959 if (!Src.getNode())
22960 return SDValue();
22961
22962 // Remove any bit flip.
22963 if (isBitwiseNot(Src)) {
22964 Src = Src.getOperand(0);
22965 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
22966 }
22967
22968 // Attempt to create the X86ISD::BT node.
22969 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
22970 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
22971 return BT;
22972 }
22973
22974 return SDValue();
22975}
22976
22977// Check if pre-AVX condcode can be performed by a single FCMP op.
22978static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
22979 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
22980}
22981
22982/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
22983/// CMPs.
22984static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
22985 SDValue &Op1, bool &IsAlwaysSignaling) {
22986 unsigned SSECC;
22987 bool Swap = false;
22988
22989 // SSE Condition code mapping:
22990 // 0 - EQ
22991 // 1 - LT
22992 // 2 - LE
22993 // 3 - UNORD
22994 // 4 - NEQ
22995 // 5 - NLT
22996 // 6 - NLE
22997 // 7 - ORD
22998 switch (SetCCOpcode) {
22999 // clang-format off
23000 default: llvm_unreachable("Unexpected SETCC condition");
23001 case ISD::SETOEQ:
23002 case ISD::SETEQ: SSECC = 0; break;
23003 case ISD::SETOGT:
23004 case ISD::SETGT: Swap = true; [[fallthrough]];
23005 case ISD::SETLT:
23006 case ISD::SETOLT: SSECC = 1; break;
23007 case ISD::SETOGE:
23008 case ISD::SETGE: Swap = true; [[fallthrough]];
23009 case ISD::SETLE:
23010 case ISD::SETOLE: SSECC = 2; break;
23011 case ISD::SETUO: SSECC = 3; break;
23012 case ISD::SETUNE:
23013 case ISD::SETNE: SSECC = 4; break;
23014 case ISD::SETULE: Swap = true; [[fallthrough]];
23015 case ISD::SETUGE: SSECC = 5; break;
23016 case ISD::SETULT: Swap = true; [[fallthrough]];
23017 case ISD::SETUGT: SSECC = 6; break;
23018 case ISD::SETO: SSECC = 7; break;
23019 case ISD::SETUEQ: SSECC = 8; break;
23020 case ISD::SETONE: SSECC = 12; break;
23021 // clang-format on
23022 }
23023 if (Swap)
23024 std::swap(Op0, Op1);
23025
23026 switch (SetCCOpcode) {
23027 default:
23028 IsAlwaysSignaling = true;
23029 break;
23030 case ISD::SETEQ:
23031 case ISD::SETOEQ:
23032 case ISD::SETUEQ:
23033 case ISD::SETNE:
23034 case ISD::SETONE:
23035 case ISD::SETUNE:
23036 case ISD::SETO:
23037 case ISD::SETUO:
23038 IsAlwaysSignaling = false;
23039 break;
23040 }
23041
23042 return SSECC;
23043}
23044
23045 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
23046 /// concatenate the result back.
23047 static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
23048                               ISD::CondCode Cond, SelectionDAG &DAG,
23049                               const SDLoc &dl) {
23050 assert(VT.isInteger() && VT == LHS.getValueType() &&
23051 VT == RHS.getValueType() && "Unsupported VTs!");
23052
23053 SDValue CC = DAG.getCondCode(Cond);
23054
23055 // Extract the LHS Lo/Hi vectors
23056 SDValue LHS1, LHS2;
23057 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23058
23059 // Extract the RHS Lo/Hi vectors
23060 SDValue RHS1, RHS2;
23061 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23062
23063 // Issue the operation on the smaller types and concatenate the result back
23064 EVT LoVT, HiVT;
23065 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23066 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23067 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23068 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23069}
23070
23071 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
23072
23073 SDValue Op0 = Op.getOperand(0);
23074 SDValue Op1 = Op.getOperand(1);
23075 SDValue CC = Op.getOperand(2);
23076 MVT VT = Op.getSimpleValueType();
23077 SDLoc dl(Op);
23078
23079 assert(VT.getVectorElementType() == MVT::i1 &&
23080 "Cannot set masked compare for this operation");
23081
23082 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23083
23084 // Prefer SETGT over SETLT.
23085 if (SetCCOpcode == ISD::SETLT) {
23086 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23087 std::swap(Op0, Op1);
23088 }
23089
23090 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23091}
23092
23093/// Given a buildvector constant, return a new vector constant with each element
23094/// incremented or decremented. If incrementing or decrementing would result in
23095/// unsigned overflow or underflow or this is not a simple vector constant,
23096/// return an empty value.
23097 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23098                                     bool NSW) {
23099 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23100 if (!BV || !V.getValueType().isSimple())
23101 return SDValue();
23102
23103 MVT VT = V.getSimpleValueType();
23104 MVT EltVT = VT.getVectorElementType();
23105 unsigned NumElts = VT.getVectorNumElements();
23106 SmallVector<SDValue, 8> NewVecC;
23107 SDLoc DL(V);
23108 for (unsigned i = 0; i < NumElts; ++i) {
23109 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23110 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23111 return SDValue();
23112
23113 // Avoid overflow/underflow.
23114 const APInt &EltC = Elt->getAPIntValue();
23115 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23116 return SDValue();
23117 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23118 (!IsInc && EltC.isMinSignedValue())))
23119 return SDValue();
23120
23121 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23122 }
23123
23124 return DAG.getBuildVector(VT, DL, NewVecC);
23125}
23126
23127/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23128/// Op0 u<= Op1:
23129/// t = psubus Op0, Op1
23130/// pcmpeq t, <0..0>
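/// The saturating subtract is zero exactly in the lanes where Op0 u<= Op1, so
/// the PCMPEQ against zero yields the all-ones mask in those lanes.
/// (Illustrative, for i8 lanes: psubus(5, 7) == 0 but psubus(9, 7) == 2 != 0.)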
23131 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23132                                     ISD::CondCode Cond, const SDLoc &dl,
23133 const X86Subtarget &Subtarget,
23134 SelectionDAG &DAG) {
23135 if (!Subtarget.hasSSE2())
23136 return SDValue();
23137
23138 MVT VET = VT.getVectorElementType();
23139 if (VET != MVT::i8 && VET != MVT::i16)
23140 return SDValue();
23141
23142 switch (Cond) {
23143 default:
23144 return SDValue();
23145 case ISD::SETULT: {
23146 // If the comparison is against a constant, we can turn this into a
23147 // setule. With psubus, setule does not require a swap. This is
23148 // beneficial because the constant in the register is no longer
23149 // clobbered as the destination, so it can be hoisted out of a loop.
23150 // Only do this pre-AVX since vpcmp* is no longer destructive.
23151 if (Subtarget.hasAVX())
23152 return SDValue();
23153 SDValue ULEOp1 =
23154 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
23155 if (!ULEOp1)
23156 return SDValue();
23157 Op1 = ULEOp1;
23158 break;
23159 }
23160 case ISD::SETUGT: {
23161 // If the comparison is against a constant, we can turn this into a setuge.
23162 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23163 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23164 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23165 SDValue UGEOp1 =
23166 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
23167 if (!UGEOp1)
23168 return SDValue();
23169 Op1 = Op0;
23170 Op0 = UGEOp1;
23171 break;
23172 }
23173 // Psubus is better than flip-sign because it requires no inversion.
23174 case ISD::SETUGE:
23175 std::swap(Op0, Op1);
23176 break;
23177 case ISD::SETULE:
23178 break;
23179 }
23180
23181 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23182 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23183 DAG.getConstant(0, dl, VT));
23184}
23185
23186static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23187 SelectionDAG &DAG) {
23188 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23189 Op.getOpcode() == ISD::STRICT_FSETCCS;
23190 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23191 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23192 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23193 MVT VT = Op->getSimpleValueType(0);
23194 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23195 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23196 SDLoc dl(Op);
23197
23198 if (isFP) {
23199 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23200 assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
23201 if (isSoftF16(EltVT, Subtarget))
23202 return SDValue();
23203
23204 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23205 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23206
23207 // If we have a strict compare with a vXi1 result and the input is 128/256
23208 // bits we can't use a masked compare unless we have VLX. If we use a wider
23209 // compare like we do for non-strict, we might trigger spurious exceptions
23210 // from the upper elements. Instead emit an AVX compare and convert to mask.
23211 unsigned Opc;
23212 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23213 (!IsStrict || Subtarget.hasVLX() ||
23214          Op0.getSimpleValueType().is512BitVector())) {
23215#ifndef NDEBUG
23216 unsigned Num = VT.getVectorNumElements();
23217 assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
23218#endif
23219 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23220 } else {
23221 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23222 // The SSE/AVX packed FP comparison nodes are defined with a
23223 // floating-point vector result that matches the operand type. This allows
23224 // them to work with an SSE1 target (integer vector types are not legal).
23225 VT = Op0.getSimpleValueType();
23226 }
23227
23228 SDValue Cmp;
23229 bool IsAlwaysSignaling;
23230 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23231 if (!Subtarget.hasAVX()) {
23232 // TODO: We could use the following steps to handle a quiet compare with
23233 // signaling encodings.
23234 // 1. Get ordered masks from a quiet ISD::SETO
23235 // 2. Use the masks to mask potential unordered elements in operand A, B
23236 // 3. Get the compare results of masked A, B
23237 // 4. Calculate the final result using the mask and the result from 3
23238 // But currently, we just fall back to scalar operations.
23239 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23240 return SDValue();
23241
23242 // Insert an extra signaling instruction to raise exception.
23243 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23244 SDValue SignalCmp = DAG.getNode(
23245 Opc, dl, {VT, MVT::Other},
23246 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23247 // FIXME: It seems we need to update the flags of all new strict nodes.
23248 // Otherwise, mayRaiseFPException in MI will return false due to
23249 // NoFPExcept = false by default. However, I didn't find it in other
23250 // patches.
23251 SignalCmp->setFlags(Op->getFlags());
23252 Chain = SignalCmp.getValue(1);
23253 }
23254
23255 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23256 // emit two comparisons and a logic op to tie them together.
23257 if (!cheapX86FSETCC_SSE(Cond)) {
23258 // LLVM predicate is SETUEQ or SETONE.
23259 unsigned CC0, CC1;
23260 unsigned CombineOpc;
23261 if (Cond == ISD::SETUEQ) {
23262 CC0 = 3; // UNORD
23263 CC1 = 0; // EQ
23264 CombineOpc = X86ISD::FOR;
23265 } else {
23266 assert(Cond == ISD::SETONE);
23267 CC0 = 7; // ORD
23268 CC1 = 4; // NEQ
23269 CombineOpc = X86ISD::FAND;
23270 }
23271
23272 SDValue Cmp0, Cmp1;
23273 if (IsStrict) {
23274 Cmp0 = DAG.getNode(
23275 Opc, dl, {VT, MVT::Other},
23276 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23277 Cmp1 = DAG.getNode(
23278 Opc, dl, {VT, MVT::Other},
23279 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23280 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23281 Cmp1.getValue(1));
23282 } else {
23283 Cmp0 = DAG.getNode(
23284 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23285 Cmp1 = DAG.getNode(
23286 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23287 }
23288 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23289 } else {
23290 if (IsStrict) {
23291 Cmp = DAG.getNode(
23292 Opc, dl, {VT, MVT::Other},
23293 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23294 Chain = Cmp.getValue(1);
23295 } else
23296 Cmp = DAG.getNode(
23297 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23298 }
23299 } else {
23300 // Handle all other FP comparisons here.
23301 if (IsStrict) {
23302 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
23303 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23304 Cmp = DAG.getNode(
23305 Opc, dl, {VT, MVT::Other},
23306 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23307 Chain = Cmp.getValue(1);
23308 } else
23309 Cmp = DAG.getNode(
23310 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23311 }
23312
23313 if (VT.getFixedSizeInBits() >
23314 Op.getSimpleValueType().getFixedSizeInBits()) {
23315 // We emitted a compare with an XMM/YMM result. Finish converting to a
23316 // mask register using a vptestm.
23317 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23318 Cmp = DAG.getBitcast(CastVT, Cmp);
23319 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23320 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23321 } else {
23322 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23323 // the result type of SETCC. The bitcast is expected to be optimized
23324 // away during combining/isel.
23325 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23326 }
23327
23328 if (IsStrict)
23329 return DAG.getMergeValues({Cmp, Chain}, dl);
23330
23331 return Cmp;
23332 }
23333
23334 assert(!IsStrict && "Strict SETCC only handles FP operands.");
23335
23336 MVT VTOp0 = Op0.getSimpleValueType();
23337 (void)VTOp0;
23338 assert(VTOp0 == Op1.getSimpleValueType() &&
23339 "Expected operands with same type!");
23340 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23341        "Invalid number of packed elements for source and destination!");
23342
23343 // The non-AVX512 code below works under the assumption that source and
23344 // destination types are the same.
23345 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23346 "Value types for source and destination must be the same!");
23347
23348 // The result is boolean, but operands are int/float
23349 if (VT.getVectorElementType() == MVT::i1) {
23350 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
23351 // but there is no compare instruction for i8 and i16 elements in KNL.
23352 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23353 "Unexpected operand type");
23354 return LowerIntVSETCC_AVX512(Op, DAG);
23355 }
23356
23357 // Lower using XOP integer comparisons.
23358 if (VT.is128BitVector() && Subtarget.hasXOP()) {
23359 // Translate compare code to XOP PCOM compare mode.
23360 unsigned CmpMode = 0;
23361 switch (Cond) {
23362 // clang-format off
23363 default: llvm_unreachable("Unexpected SETCC condition");
23364 case ISD::SETULT:
23365 case ISD::SETLT: CmpMode = 0x00; break;
23366 case ISD::SETULE:
23367 case ISD::SETLE: CmpMode = 0x01; break;
23368 case ISD::SETUGT:
23369 case ISD::SETGT: CmpMode = 0x02; break;
23370 case ISD::SETUGE:
23371 case ISD::SETGE: CmpMode = 0x03; break;
23372 case ISD::SETEQ: CmpMode = 0x04; break;
23373 case ISD::SETNE: CmpMode = 0x05; break;
23374 // clang-format on
23375 }
23376
23377 // Are we comparing unsigned or signed integers?
23378 unsigned Opc =
23379     ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23380
23381 return DAG.getNode(Opc, dl, VT, Op0, Op1,
23382 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23383 }
23384
23385 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23386 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
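// For example, (and X, 8) != 0 becomes (and X, 8) == 8, which lowers to a
// single PCMPEQ without the NOT that the original != form would require.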
23387 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23388 SDValue BC0 = peekThroughBitcasts(Op0);
23389 if (BC0.getOpcode() == ISD::AND) {
23390 APInt UndefElts;
23391 SmallVector<APInt, 64> EltBits;
23392 if (getTargetConstantBitsFromNode(
23393         BC0.getOperand(1), VT.getScalarSizeInBits(), UndefElts, EltBits,
23394 /*AllowWholeUndefs*/ false, /*AllowPartialUndefs*/ false)) {
23395 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23396 Cond = ISD::SETEQ;
23397 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23398 }
23399 }
23400 }
23401 }
23402
23403 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
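// E.g. for i32 elements and C == 0x10: SHL by 27 moves bit 4 into the sign
// bit and SRA by 31 broadcasts it, giving all-ones exactly in the lanes where
// the masked bit (and therefore the equality) holds.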
23404 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23405 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23406 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23407 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23408 unsigned BitWidth = VT.getScalarSizeInBits();
23409 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23410
23411 SDValue Result = Op0.getOperand(0);
23412 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23413 DAG.getConstant(ShiftAmt, dl, VT));
23414 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23415 DAG.getConstant(BitWidth - 1, dl, VT));
23416 return Result;
23417 }
23418 }
23419
23420 // Break 256-bit integer vector compare into smaller ones.
23421 if (VT.is256BitVector() && !Subtarget.hasInt256())
23422 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23423
23424 // Break 512-bit integer vector compare into smaller ones.
23425 // TODO: Try harder to use VPCMPx + VPMOV2x?
23426 if (VT.is512BitVector())
23427 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23428
23429 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23430 // not-of-PCMPEQ:
23431 // X != INT_MIN --> X >s INT_MIN
23432 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23433 // +X != 0 --> +X >s 0
23434 APInt ConstValue;
23435 if (Cond == ISD::SETNE &&
23436 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23437 if (ConstValue.isMinSignedValue())
23438 Cond = ISD::SETGT;
23439 else if (ConstValue.isMaxSignedValue())
23440 Cond = ISD::SETLT;
23441 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
23442 Cond = ISD::SETGT;
23443 }
23444
23445 // If both operands are known non-negative, then an unsigned compare is the
23446 // same as a signed compare and there's no need to flip signbits.
23447 // TODO: We could check for more general simplifications here since we're
23448 // computing known bits.
23449 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23450 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23451
23452 // Special case: Use min/max operations for unsigned compares.
23453 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23454 if (ISD::isUnsignedIntSetCC(Cond) &&
23455     (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23456 TLI.isOperationLegal(ISD::UMIN, VT)) {
23457 // If we have a constant operand, increment/decrement it and change the
23458 // condition to avoid an invert.
23459 if (Cond == ISD::SETUGT) {
23460 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23461 if (SDValue UGTOp1 =
23462 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
23463 Op1 = UGTOp1;
23464 Cond = ISD::SETUGE;
23465 }
23466 }
23467 if (Cond == ISD::SETULT) {
23468 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23469 if (SDValue ULTOp1 =
23470 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
23471 Op1 = ULTOp1;
23472 Cond = ISD::SETULE;
23473 }
23474 }
23475 bool Invert = false;
23476 unsigned Opc;
23477 switch (Cond) {
23478 // clang-format off
23479 default: llvm_unreachable("Unexpected condition code");
23480 case ISD::SETUGT: Invert = true; [[fallthrough]];
23481 case ISD::SETULE: Opc = ISD::UMIN; break;
23482 case ISD::SETULT: Invert = true; [[fallthrough]];
23483 case ISD::SETUGE: Opc = ISD::UMAX; break;
23484 // clang-format on
23485 }
23486
23487 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23488 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23489
23490 // If the logical-not of the result is required, perform that now.
23491 if (Invert)
23492 Result = DAG.getNOT(dl, Result, VT);
23493
23494 return Result;
23495 }
23496
23497 // Try to use SUBUS and PCMPEQ.
23498 if (FlipSigns)
23499 if (SDValue V =
23500 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23501 return V;
23502
23503 // We are handling one of the integer comparisons here. Since SSE only has
23504 // GT and EQ comparisons for integer, swapping operands and multiple
23505 // operations may be required for some comparisons.
23506 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23507                                           : X86ISD::PCMPGT;
23508 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23509             Cond == ISD::SETGE || Cond == ISD::SETUGE;
23510 bool Invert = Cond == ISD::SETNE ||
23511               (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23512
23513 if (Swap)
23514 std::swap(Op0, Op1);
23515
23516 // Check that the operation in question is available (most are plain SSE2,
23517 // but PCMPGTQ and PCMPEQQ have different requirements).
23518 if (VT == MVT::v2i64) {
23519 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23520 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23521
23522 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23523 // the odd elements over the even elements.
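// (0 >s X) on each i32 half computes that half's sign mask; copying the odd
// (high) halves over the even ones with {1, 1, 3, 3} then broadcasts each
// i64's sign-bit result across its full lane.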
23524 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23525 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23526 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23527
23528 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23529 static const int MaskHi[] = { 1, 1, 3, 3 };
23530 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23531
23532 return DAG.getBitcast(VT, Result);
23533 }
23534
23535 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23536 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23537 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23538
23539 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23540 static const int MaskHi[] = { 1, 1, 3, 3 };
23541 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23542
23543 return DAG.getBitcast(VT, Result);
23544 }
23545
23546 // If the i64 elements are sign-extended enough to be representable as i32
23547 // then we can compare the lower i32 bits and splat.
23548 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
23549 DAG.ComputeNumSignBits(Op1) > 32) {
23550 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23551 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23552
23553 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23554 static const int MaskLo[] = {0, 0, 2, 2};
23555 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23556
23557 return DAG.getBitcast(VT, Result);
23558 }
23559
23560 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23561 // bits of the inputs before performing those operations. The lower
23562 // compare is always unsigned.
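// For a signed i64 compare (FlipSigns == false) only bit 31 of each lane is
// flipped, since just the low-half compare needs the unsigned trick; for an
// unsigned compare both bit 31 and bit 63 are flipped.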
23563 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
23564 : 0x0000000080000000ULL,
23565 dl, MVT::v2i64);
23566
23567 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23568 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23569
23570 // Cast everything to the right type.
23571 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23572 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23573
23574 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23575 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23576 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23577
23578 // Create masks for only the low parts/high parts of the 64 bit integers.
23579 static const int MaskHi[] = { 1, 1, 3, 3 };
23580 static const int MaskLo[] = { 0, 0, 2, 2 };
23581 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23582 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23583 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23584
23585 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23586 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23587
23588 if (Invert)
23589 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23590
23591 return DAG.getBitcast(VT, Result);
23592 }
23593
23594 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
23595 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
23596 // pcmpeqd + pshufd + pand.
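// The pshufd mask {1, 0, 3, 2} swaps the two i32 halves of each i64 lane, so
// the AND below is all-ones only when both halves compared equal.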
23597 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23598
23599 // First cast everything to the right type.
23600 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23601 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23602
23603 // Do the compare.
23604 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23605
23606 // Make sure the lower and upper halves are both all-ones.
23607 static const int Mask[] = { 1, 0, 3, 2 };
23608 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23609 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23610
23611 if (Invert)
23612 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23613
23614 return DAG.getBitcast(VT, Result);
23615 }
23616 }
23617
23618 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23619 // bits of the inputs before performing those operations.
23620 if (FlipSigns) {
23621 MVT EltVT = VT.getVectorElementType();
23622 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23623                              VT);
23624 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23625 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23626 }
23627
23628 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23629
23630 // If the logical-not of the result is required, perform that now.
23631 if (Invert)
23632 Result = DAG.getNOT(dl, Result, VT);
23633
23634 return Result;
23635}
23636
23637// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
23638 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23639                               const SDLoc &dl, SelectionDAG &DAG,
23640 const X86Subtarget &Subtarget,
23641 SDValue &X86CC) {
23642 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23643
23644 // Must be a bitcast from vXi1.
23645 if (Op0.getOpcode() != ISD::BITCAST)
23646 return SDValue();
23647
23648 Op0 = Op0.getOperand(0);
23649 MVT VT = Op0.getSimpleValueType();
23650 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23651 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23652 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23653 return SDValue();
23654
23655 X86::CondCode X86Cond;
23656 if (isNullConstant(Op1)) {
23657 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23658 } else if (isAllOnesConstant(Op1)) {
23659 // C flag is set for all ones.
23660 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23661 } else
23662 return SDValue();
23663
23664 // If the input is an AND, we can combine its operands into the KTEST.
23665 bool KTestable = false;
23666 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23667 KTestable = true;
23668 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23669 KTestable = true;
23670 if (!isNullConstant(Op1))
23671 KTestable = false;
23672 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23673 SDValue LHS = Op0.getOperand(0);
23674 SDValue RHS = Op0.getOperand(1);
23675 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23676 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23677 }
23678
23679 // If the input is an OR, we can combine its operands into the KORTEST.
23680 SDValue LHS = Op0;
23681 SDValue RHS = Op0;
23682 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23683 LHS = Op0.getOperand(0);
23684 RHS = Op0.getOperand(1);
23685 }
23686
23687 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23688 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23689}
23690
23691/// Emit flags for the given setcc condition and operands. Also returns the
23692/// corresponding X86 condition code constant in X86CC.
23693SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23694 ISD::CondCode CC, const SDLoc &dl,
23695 SelectionDAG &DAG,
23696 SDValue &X86CC) const {
23697 // Equality Combines.
23698 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23699 X86::CondCode X86CondCode;
23700
23701 // Optimize to BT if possible.
23702 // Lower (X & (1 << N)) == 0 to BT(X, N).
23703 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23704 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
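// BT copies the tested bit into CF, so the == 0 forms select COND_AE (CF == 0)
// and the != 0 forms select COND_B (CF == 1); see LowerAndToBT above.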
23705 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
23706 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
23707 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23708 return BT;
23709 }
23710 }
23711
23712 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
23713 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
23714 X86CondCode)) {
23715 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23716 return CmpZ;
23717 }
23718
23719 // Try to lower using KORTEST or KTEST.
23720 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
23721 return Test;
23722
23723 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
23724 // of these.
23725 if (isOneConstant(Op1) || isNullConstant(Op1)) {
23726 // If the input is a setcc, then reuse the input setcc or use a new one
23727 // with the inverted condition.
23728 if (Op0.getOpcode() == X86ISD::SETCC) {
23729 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
23730
23731 X86CC = Op0.getOperand(0);
23732 if (Invert) {
23733 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
23734 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
23735 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23736 }
23737
23738 return Op0.getOperand(1);
23739 }
23740 }
23741
23742 // Try to use the carry flag from the add in place of a separate CMP for:
23743 // (seteq (add X, -1), -1). Similar for setne.
23744 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
23745 Op0.getOperand(1) == Op1) {
23746 if (isProfitableToUseFlagOp(Op0)) {
23747 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
23748
23749 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
23750 Op0.getOperand(1));
23751 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
23752 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23753 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23754 return SDValue(New.getNode(), 1);
23755 }
23756 }
23757 }
23758
23759 X86::CondCode CondCode =
23760     TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
23761 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
23762
23763 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
23764 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23765 return EFLAGS;
23766}
23767
23768SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
23769
23770 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23771 Op.getOpcode() == ISD::STRICT_FSETCCS;
23772 MVT VT = Op->getSimpleValueType(0);
23773
23774 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
23775
23776 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23777 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23778 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23779 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23780 SDLoc dl(Op);
23781 ISD::CondCode CC =
23782     cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23783
23784 if (isSoftF16(Op0.getValueType(), Subtarget))
23785 return SDValue();
23786
23787 // Handle f128 first, since one possible outcome is a normal integer
23788 // comparison which gets handled by emitFlagsForSetcc.
23789 if (Op0.getValueType() == MVT::f128) {
23790 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
23791 Op.getOpcode() == ISD::STRICT_FSETCCS);
23792
23793 // If softenSetCCOperands returned a scalar, use it.
23794 if (!Op1.getNode()) {
23795 assert(Op0.getValueType() == Op.getValueType() &&
23796 "Unexpected setcc expansion!");
23797 if (IsStrict)
23798 return DAG.getMergeValues({Op0, Chain}, dl);
23799 return Op0;
23800 }
23801 }
23802
23803 if (Op0.getSimpleValueType().isInteger()) {
23804 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
23805 // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
23806 // this may translate to fewer uops depending on uarch implementation. The
23807 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
23808 // canonicalize to that CondCode.
23809 // NOTE: Only do this if incrementing the constant doesn't increase the bit
23810 // encoding size - so it must either already be an i8 or i32 immediate, or it
23811 // shrinks down to that. We don't do this for any i64's to avoid additional
23812 // constant materializations.
23813 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
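// For example, (setugt X, 7) becomes (setuge X, 8): COND_A reads both CF and
// ZF, while COND_AE reads only CF.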
23814 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
23815 const APInt &Op1Val = Op1C->getAPIntValue();
23816 if (!Op1Val.isZero()) {
23817 // Ensure the constant+1 doesn't overflow.
23818 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
23819 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
23820 APInt Op1ValPlusOne = Op1Val + 1;
23821 if (Op1ValPlusOne.isSignedIntN(32) &&
23822 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
23823 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
23824           CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
23825                                           : ISD::CondCode::SETUGE;
23826 }
23827 }
23828 }
23829 }
23830
23831 SDValue X86CC;
23832 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
23833 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23834 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23835 }
23836
23837 // Handle floating point.
23838 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
23839 if (CondCode == X86::COND_INVALID)
23840 return SDValue();
23841
23842 SDValue EFLAGS;
23843 if (IsStrict) {
23844 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23845 EFLAGS =
23846     DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
23847                 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
23848 Chain = EFLAGS.getValue(1);
23849 } else {
23850 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
23851 }
23852
23853 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23854 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23855 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23856}
23857
23858SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23859 SDValue LHS = Op.getOperand(0);
23860 SDValue RHS = Op.getOperand(1);
23861 SDValue Carry = Op.getOperand(2);
23862 SDValue Cond = Op.getOperand(3);
23863 SDLoc DL(Op);
23864
23865 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
23866 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23867
23868 // Recreate the carry if needed.
23869 EVT CarryVT = Carry.getValueType();
23870 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23871 Carry, DAG.getAllOnesConstant(DL, CarryVT));
23872
23873 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23874 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23875 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
23876}
23877
23878// This function returns three things: the arithmetic computation itself
23879// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
23880// flag and the condition code define the case in which the arithmetic
23881// computation overflows.
23882static std::pair<SDValue, SDValue>
23883 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23884 assert(Op.getResNo() == 0 && "Unexpected result number!");
23885 SDValue Value, Overflow;
23886 SDValue LHS = Op.getOperand(0);
23887 SDValue RHS = Op.getOperand(1);
23888 unsigned BaseOp = 0;
23889 SDLoc DL(Op);
23890 switch (Op.getOpcode()) {
23891 default: llvm_unreachable("Unknown ovf instruction!");
23892 case ISD::SADDO:
23893 BaseOp = X86ISD::ADD;
23894 Cond = X86::COND_O;
23895 break;
23896 case ISD::UADDO:
23897 BaseOp = X86ISD::ADD;
23898 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
23899 break;
23900 case ISD::SSUBO:
23901 BaseOp = X86ISD::SUB;
23902 Cond = X86::COND_O;
23903 break;
23904 case ISD::USUBO:
23905 BaseOp = X86ISD::SUB;
23906 Cond = X86::COND_B;
23907 break;
23908 case ISD::SMULO:
23909 BaseOp = X86ISD::SMUL;
23910 Cond = X86::COND_O;
23911 break;
23912 case ISD::UMULO:
23913 BaseOp = X86ISD::UMUL;
23914 Cond = X86::COND_O;
23915 break;
23916 }
23917
23918 if (BaseOp) {
23919 // Also sets EFLAGS.
23920 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23921 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23922 Overflow = Value.getValue(1);
23923 }
23924
23925 return std::make_pair(Value, Overflow);
23926}
23927
23928 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23929 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
23930 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23931 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23932 // has only one use.
23933 SDLoc DL(Op);
23934 X86::CondCode Cond;
23935 SDValue Value, Overflow;
23936 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
23937
23938 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
23939 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
23940 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
23941}
23942
23943/// Return true if opcode is a X86 logical comparison.
23944 static bool isX86LogicalCmp(SDValue Op) {
23945 unsigned Opc = Op.getOpcode();
23946 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
23947 Opc == X86ISD::FCMP)
23948 return true;
23949 if (Op.getResNo() == 1 &&
23950 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
23951 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
23952 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
23953 return true;
23954
23955 return false;
23956}
23957
23958 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
23959 if (V.getOpcode() != ISD::TRUNCATE)
23960 return false;
23961
23962 SDValue VOp0 = V.getOperand(0);
23963 unsigned InBits = VOp0.getValueSizeInBits();
23964 unsigned Bits = V.getValueSizeInBits();
23965 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
23966}
23967
23968SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
23969 bool AddTest = true;
23970 SDValue Cond = Op.getOperand(0);
23971 SDValue Op1 = Op.getOperand(1);
23972 SDValue Op2 = Op.getOperand(2);
23973 SDLoc DL(Op);
23974 MVT VT = Op1.getSimpleValueType();
23975 SDValue CC;
23976
23977 if (isSoftF16(VT, Subtarget)) {
23978 MVT NVT = VT.changeTypeToInteger();
23979 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
23980 DAG.getBitcast(NVT, Op1),
23981 DAG.getBitcast(NVT, Op2)));
23982 }
23983
23984 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
23985 // are available or VBLENDV if AVX is available.
23986 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
23987 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
23988 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
23989 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
23990 bool IsAlwaysSignaling;
23991 unsigned SSECC =
23992 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
23993 CondOp0, CondOp1, IsAlwaysSignaling);
23994
23995 if (Subtarget.hasAVX512()) {
23996 SDValue Cmp =
23997 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
23998 DAG.getTargetConstant(SSECC, DL, MVT::i8));
23999 assert(!VT.isVector() && "Not a scalar type?");
24000 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24001 }
24002
24003 if (SSECC < 8 || Subtarget.hasAVX()) {
24004 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
24005 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24006
24007 // If we have AVX, we can use a variable vector select (VBLENDV) instead
24008 // of 3 logic instructions for size savings and potentially speed.
24009 // Unfortunately, there is no scalar form of VBLENDV.
24010
24011 // If either operand is a +0.0 constant, don't try this. We can expect to
24012 // optimize away at least one of the logic instructions later in that
24013 // case, so that sequence would be faster than a variable blend.
24014
24015 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
24016 // uses XMM0 as the selection register. That may need just as many
24017 // instructions as the AND/ANDN/OR sequence due to register moves, so
24018 // don't bother.
24019 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
24020 !isNullFPConstant(Op2)) {
24021 // Convert to vectors, do a VSELECT, and convert back to scalar.
24022 // All of the conversions should be optimized away.
24023 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
24024 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
24025 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
24026 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
24027
24028 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
24029 VCmp = DAG.getBitcast(VCmpVT, VCmp);
24030
24031 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
24032
24033 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
24034 VSel, DAG.getIntPtrConstant(0, DL));
24035 }
24036 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
24037 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
24038 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
24039 }
24040 }
24041
24042 // AVX512 fallback is to lower selects of scalar floats to masked moves.
24043 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
24044 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
24045 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24046 }
24047
24048 if (Cond.getOpcode() == ISD::SETCC &&
24049 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
24050 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
24051 Cond = NewCond;
24052 // If the condition was updated, it's possible that the operands of the
24053 // select were also updated (for example, EmitTest has a RAUW). Refresh
24054 // the local references to the select operands in case they got stale.
24055 Op1 = Op.getOperand(1);
24056 Op2 = Op.getOperand(2);
24057 }
24058 }
24059
24060 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24061 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24062 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24063 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24064 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24065 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24066 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24067 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24068 if (Cond.getOpcode() == X86ISD::SETCC &&
24069 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
24070 isNullConstant(Cond.getOperand(1).getOperand(1))) {
24071 SDValue Cmp = Cond.getOperand(1);
24072 SDValue CmpOp0 = Cmp.getOperand(0);
24073 unsigned CondCode = Cond.getConstantOperandVal(0);
24074
24075 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24076 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24077 // handling to keep the CMP with 0. This should be removed by
24078 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
24079 // cttz_zero_undef.
24080 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
24081 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
24082 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
24083 };
24084 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
24085 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
24086 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
24087 // Keep Cmp.
24088 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24089 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
24090 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
24091 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
24092
24093 // 'X - 1' sets the carry flag if X == 0.
24094 // '0 - X' sets the carry flag if X != 0.
24095 // Convert the carry flag to a -1/0 mask with sbb:
24096 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24097 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24098 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24099 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
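// E.g. for select (X != 0), -1, Y with X == 3: 0 - 3 sets CF, the SBB below
// materializes the -1 mask, and OR-ing with Y gives -1; with X == 0 there is
// no borrow, the mask is 0, and the result is just Y.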
24100 SDValue Sub;
24101 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
24102 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
24103 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
24104 } else {
24105 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
24106 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
24107 }
24108 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24109                           DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24110 Sub.getValue(1));
24111 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24112 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
24113 CmpOp0.getOpcode() == ISD::AND &&
24114 isOneConstant(CmpOp0.getOperand(1))) {
24115 SDValue Src1, Src2;
24116 // true if Op2 is an XOR or OR operator and one of its operands
24117 // is equal to Op1
24118 // ( a , a op b) || ( b , a op b)
24119 auto isOrXorPattern = [&]() {
24120 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
24121 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
24122 Src1 =
24123 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
24124 Src2 = Op1;
24125 return true;
24126 }
24127 return false;
24128 };
24129
24130 if (isOrXorPattern()) {
24131 SDValue Neg;
24132 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
24133 // We need a mask of all zeros or all ones with the same size as the
24134 // other operands.
24135 if (CmpSz > VT.getSizeInBits())
24136 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
24137 else if (CmpSz < VT.getSizeInBits())
24138 Neg = DAG.getNode(ISD::AND, DL, VT,
24139 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
24140 DAG.getConstant(1, DL, VT));
24141 else
24142 Neg = CmpOp0;
24143 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
24144 Neg); // -(and (x, 0x1))
24145 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
24146 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
24147 }
24148 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
24149 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
24150 ((CondCode == X86::COND_S) || // smin(x, 0)
24151 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
24152 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24153 //
24154 // If the comparison is testing for a positive value, we have to invert
24155 // the sign bit mask, so only do that transform if the target has a
24156 // bitwise 'and not' instruction (the invert is free).
24157 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24158 unsigned ShCt = VT.getSizeInBits() - 1;
24159 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
24160 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
24161 if (CondCode == X86::COND_G)
24162 Shift = DAG.getNOT(DL, Shift, VT);
24163 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
24164 }
24165 }
24166
24167 // Look past (and (setcc_carry (cmp ...)), 1).
24168 if (Cond.getOpcode() == ISD::AND &&
24169 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
24170 isOneConstant(Cond.getOperand(1)))
24171 Cond = Cond.getOperand(0);
24172
24173 // If condition flag is set by a X86ISD::CMP, then use it as the condition
24174 // setting operand in place of the X86ISD::SETCC.
24175 unsigned CondOpcode = Cond.getOpcode();
24176 if (CondOpcode == X86ISD::SETCC ||
24177 CondOpcode == X86ISD::SETCC_CARRY) {
24178 CC = Cond.getOperand(0);
24179
24180 SDValue Cmp = Cond.getOperand(1);
24181 bool IllegalFPCMov = false;
24182 if (VT.isFloatingPoint() && !VT.isVector() &&
24183 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
24184 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24185
24186 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
24187 Cmp.getOpcode() == X86ISD::BT) { // FIXME
24188 Cond = Cmp;
24189 AddTest = false;
24190 }
24191 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
24192 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
24193 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
24194 SDValue Value;
24195 X86::CondCode X86Cond;
24196 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24197
24198 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24199 AddTest = false;
24200 }
24201
24202 if (AddTest) {
24203 // Look past the truncate if the high bits are known zero.
24204 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24205 Cond = Cond.getOperand(0);
24206
24207 // We know the result of AND is compared against zero. Try to match
24208 // it to BT.
24209 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24210 X86::CondCode X86CondCode;
24211 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
24212 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
24213 Cond = BT;
24214 AddTest = false;
24215 }
24216 }
24217 }
24218
24219 if (AddTest) {
24220 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24221 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24222 }
24223
24224 // a < b ? -1 : 0 -> RES = ~setcc_carry
24225 // a < b ? 0 : -1 -> RES = setcc_carry
24226 // a >= b ? -1 : 0 -> RES = setcc_carry
24227 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24228 if (Cond.getOpcode() == X86ISD::SUB) {
24229 unsigned CondCode = CC->getAsZExtVal();
24230
24231 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24232 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24233 (isNullConstant(Op1) || isNullConstant(Op2))) {
24234 SDValue Res =
24235 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24236 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24237 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24238 return DAG.getNOT(DL, Res, Res.getValueType());
24239 return Res;
24240 }
24241 }
24242
24243 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
24244 // widen the cmov and push the truncate through. This avoids introducing a new
24245 // branch during isel and doesn't add any extensions.
24246 if (Op.getValueType() == MVT::i8 &&
24247 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24248 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24249 if (T1.getValueType() == T2.getValueType() &&
24250 // Exclude CopyFromReg to avoid partial register stalls.
24251 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24252 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24253 CC, Cond);
24254 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24255 }
24256 }
24257
24258 // Or finally, promote i8 cmovs if we have CMOV,
24259 // or i16 cmovs if it won't prevent folding a load.
24260 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
24261 // legal, but EmitLoweredSelect() can not deal with these extensions
24262 // being inserted between two CMOV's. (in i16 case too TBN)
24263 // https://bugs.llvm.org/show_bug.cgi?id=40974
24264 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
24265 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
24266 !X86::mayFoldLoad(Op2, Subtarget))) {
24267 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24268 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24269 SDValue Ops[] = { Op2, Op1, CC, Cond };
24270 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24271 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24272 }
24273
24274 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24275 // condition is true.
24276 SDValue Ops[] = { Op2, Op1, CC, Cond };
24277 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
24278}
24279
24280 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
24281                                      const X86Subtarget &Subtarget,
24282 SelectionDAG &DAG) {
24283 MVT VT = Op->getSimpleValueType(0);
24284 SDValue In = Op->getOperand(0);
24285 MVT InVT = In.getSimpleValueType();
24286 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24287 MVT VTElt = VT.getVectorElementType();
24288 SDLoc dl(Op);
24289
24290 unsigned NumElts = VT.getVectorNumElements();
24291
24292 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24293 MVT ExtVT = VT;
24294 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24295 // If v16i32 is to be avoided, we'll need to split and concatenate.
24296 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24297 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24298
24299 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24300 }
24301
24302 // Widen to 512-bits if VLX is not supported.
24303 MVT WideVT = ExtVT;
24304 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24305 NumElts *= 512 / ExtVT.getSizeInBits();
24306 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24307 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24308 In, DAG.getIntPtrConstant(0, dl));
24309 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24310 }
24311
24312 SDValue V;
24313 MVT WideEltVT = WideVT.getVectorElementType();
24314 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24315 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24316 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24317 } else {
24318 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24319 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24320 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24321 }
24322
24323 // Truncate if we had to extend i16/i8 above.
24324 if (VT != ExtVT) {
24325 WideVT = MVT::getVectorVT(VTElt, NumElts);
24326 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24327 }
24328
24329 // Extract back to 128/256-bit if we widened.
24330 if (WideVT != VT)
24331 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24332 DAG.getIntPtrConstant(0, dl));
24333
24334 return V;
24335}
24336
24337 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24338                                SelectionDAG &DAG) {
24339 SDValue In = Op->getOperand(0);
24340 MVT InVT = In.getSimpleValueType();
24341
24342 if (InVT.getVectorElementType() == MVT::i1)
24343 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24344
24345 assert(Subtarget.hasAVX() && "Expected AVX support");
24346 return LowerAVXExtend(Op, DAG, Subtarget);
24347}
24348
24349// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24350// For sign extend this needs to handle all vector sizes and SSE4.1 and
24351// non-SSE4.1 targets. For zero extend this should only handle inputs of
24352// MVT::v64i8 when BWI is not supported, but AVX512 is.
24353 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24354                                         const X86Subtarget &Subtarget,
24355 SelectionDAG &DAG) {
24356 SDValue In = Op->getOperand(0);
24357 MVT VT = Op->getSimpleValueType(0);
24358 MVT InVT = In.getSimpleValueType();
24359
24360 MVT SVT = VT.getVectorElementType();
24361 MVT InSVT = InVT.getVectorElementType();
24362 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
24363
24364 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24365 return SDValue();
24366 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24367 return SDValue();
24368 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24369 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24370 !(VT.is512BitVector() && Subtarget.hasAVX512()))
24371 return SDValue();
24372
24373 SDLoc dl(Op);
24374 unsigned Opc = Op.getOpcode();
24375 unsigned NumElts = VT.getVectorNumElements();
24376
24377 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24378 // For 512-bit vectors, we need 128-bits or 256-bits.
24379 if (InVT.getSizeInBits() > 128) {
24380 // Input needs to be at least the same number of elements as output, and
24381 // at least 128-bits.
24382 int InSize = InSVT.getSizeInBits() * NumElts;
24383 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24384 InVT = In.getSimpleValueType();
24385 }
24386
24387 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24388 // so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
24389 // need to be handled here for 256/512-bit results.
24390 if (Subtarget.hasInt256()) {
24391 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24392
24393 if (InVT.getVectorNumElements() != NumElts)
24394 return DAG.getNode(Op.getOpcode(), dl, VT, In);
24395
24396 // FIXME: Apparently we create inreg operations that could be regular
24397 // extends.
24398 unsigned ExtOpc =
24399     Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24400                                          : ISD::ZERO_EXTEND;
24401 return DAG.getNode(ExtOpc, dl, VT, In);
24402 }
24403
24404 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24405 if (Subtarget.hasAVX()) {
24406 assert(VT.is256BitVector() && "256-bit vector expected");
24407 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24408 int HalfNumElts = HalfVT.getVectorNumElements();
24409
24410 unsigned NumSrcElts = InVT.getVectorNumElements();
24411 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24412 for (int i = 0; i != HalfNumElts; ++i)
24413 HiMask[i] = HalfNumElts + i;
24414
24415 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24416 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24417 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24418 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24419 }
24420
24421 // We should only get here for sign extend.
24422 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24423 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24424 unsigned InNumElts = InVT.getVectorNumElements();
24425
24426 // If the source elements are already all-signbits, we don't need to extend,
24427 // just splat the elements.
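// E.g. v8i16 -> v4i32 uses the mask <0,0,1,1,2,2,3,3>: duplicating an element
// that is already 0 or -1 is equivalent to sign-extending it.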
24428 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
24429 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
24430 unsigned Scale = InNumElts / NumElts;
24431 SmallVector<int, 16> ShuffleMask;
24432 for (unsigned I = 0; I != NumElts; ++I)
24433 ShuffleMask.append(Scale, I);
24434 return DAG.getBitcast(VT,
24435 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
24436 }
24437
24438 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24439 SDValue Curr = In;
24440 SDValue SignExt = Curr;
24441
24442 // As SRAI is only available on i16/i32 types, we expand only up to i32
24443 // and handle i64 separately.
24444 if (InVT != MVT::v4i32) {
24445 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24446
24447 unsigned DestWidth = DestVT.getScalarSizeInBits();
24448 unsigned Scale = DestWidth / InSVT.getSizeInBits();
24449 unsigned DestElts = DestVT.getVectorNumElements();
24450
24451 // Build a shuffle mask that takes each input element and places it in the
24452 // MSBs of the new element size.
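// E.g. v16i8 -> v4i32 (Scale == 4) uses <u,u,u,0,u,u,u,1,u,u,u,2,u,u,u,3>, so
// input element i lands in the top byte of lane i and the VSRAI below shifts
// its sign bits back down across the whole lane.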
24453 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24454 for (unsigned i = 0; i != DestElts; ++i)
24455 Mask[i * Scale + (Scale - 1)] = i;
24456
24457 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24458 Curr = DAG.getBitcast(DestVT, Curr);
24459
24460 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24461 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24462 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24463 }
24464
24465 if (VT == MVT::v2i64) {
24466 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24467 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24468 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24469 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24470 SignExt = DAG.getBitcast(VT, SignExt);
24471 }
24472
24473 return SignExt;
24474}
24475
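/// Custom lowering for ISD::SIGN_EXTEND on vector types: i1 mask sources are
/// handled by LowerSIGN_EXTEND_Mask, v32i16 without BWI is split, and pre-AVX2
/// 256-bit extensions are expanded into two sign_extend_vector_inreg ops plus
/// a concat.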
24476 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24477 SelectionDAG &DAG) {
24478 MVT VT = Op->getSimpleValueType(0);
24479 SDValue In = Op->getOperand(0);
24480 MVT InVT = In.getSimpleValueType();
24481 SDLoc dl(Op);
24482
24483 if (InVT.getVectorElementType() == MVT::i1)
24484 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24485
24486 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24487 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
24488 "Expected same number of elements");
24489 assert((VT.getVectorElementType() == MVT::i16 ||
24490 VT.getVectorElementType() == MVT::i32 ||
24491 VT.getVectorElementType() == MVT::i64) &&
24492 "Unexpected element type");
24493 assert((InVT.getVectorElementType() == MVT::i8 ||
24494 InVT.getVectorElementType() == MVT::i16 ||
24495 InVT.getVectorElementType() == MVT::i32) &&
24496 "Unexpected element type");
24497
24498 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24499 assert(InVT == MVT::v32i8 && "Unexpected VT!");
24500 return splitVectorIntUnary(Op, DAG, dl);
24501 }
24502
24503 if (Subtarget.hasInt256())
24504 return Op;
24505
24506 // Optimize vectors in AVX mode
24507 // Sign extend v8i16 to v8i32 and
24508 // v4i32 to v4i64
24509 //
24510 // Divide input vector into two parts
24511 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
24512 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
24513 // concat the vectors to original VT
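// e.g. for v8i16 -> v8i32: OpLo extends elements 0-3 in place, the high half
// is shuffled down with mask {4,5,6,7,-1,-1,-1,-1} and extended the same way,
// and the two v4i32 halves are concatenated back into a v8i32.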
24514 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24515 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24516
24517 unsigned NumElems = InVT.getVectorNumElements();
24518 SmallVector<int,8> ShufMask(NumElems, -1);
24519 for (unsigned i = 0; i != NumElems/2; ++i)
24520 ShufMask[i] = i + NumElems/2;
24521
24522 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24523 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24524
24525 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24526}
24527
24528/// Change a vector store into a pair of half-size vector stores.
24529 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24530 SDValue StoredVal = Store->getValue();
24531 assert((StoredVal.getValueType().is256BitVector() ||
24532 StoredVal.getValueType().is512BitVector()) &&
24533 "Expecting 256/512-bit op");
24534
24535 // Splitting volatile memory ops is not allowed unless the operation was not
24536 // legal to begin with. Assume the input store is legal (this transform is
24537 // only used for targets with AVX). Note: It is possible that we have an
24538 // illegal type like v2i128, and so we could allow splitting a volatile store
24539 // in that case if that is important.
24540 if (!Store->isSimple())
24541 return SDValue();
24542
24543 SDLoc DL(Store);
24544 SDValue Value0, Value1;
24545 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24546 unsigned HalfOffset = Value0.getValueType().getStoreSize();
24547 SDValue Ptr0 = Store->getBasePtr();
24548 SDValue Ptr1 =
24549 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
24550 SDValue Ch0 =
24551 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24552 Store->getOriginalAlign(),
24553 Store->getMemOperand()->getFlags());
24554 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24555 Store->getPointerInfo().getWithOffset(HalfOffset),
24556 Store->getOriginalAlign(),
24557 Store->getMemOperand()->getFlags());
24558 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24559}
24560
24561 /// Scalarize a vector store, bitcasting to StoreVT to determine the scalar
24562 /// type.
24563 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24564 SelectionDAG &DAG) {
24565 SDValue StoredVal = Store->getValue();
24566 assert(StoreVT.is128BitVector() &&
24567 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24568 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24569
24570 // Splitting volatile memory ops is not allowed unless the operation was not
24571 // legal to begin with. We are assuming the input op is legal (this transform
24572 // is only used for targets with AVX).
24573 if (!Store->isSimple())
24574 return SDValue();
24575
24576 MVT StoreSVT = StoreVT.getScalarType();
24577 unsigned NumElems = StoreVT.getVectorNumElements();
24578 unsigned ScalarSize = StoreSVT.getStoreSize();
24579
24580 SDLoc DL(Store);
24581 SmallVector<SDValue, 4> Stores;
24582 for (unsigned i = 0; i != NumElems; ++i) {
24583 unsigned Offset = i * ScalarSize;
24584 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24585 TypeSize::getFixed(Offset), DL);
24586 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24587 DAG.getIntPtrConstant(i, DL));
24588 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24589 Store->getPointerInfo().getWithOffset(Offset),
24590 Store->getOriginalAlign(),
24591 Store->getMemOperand()->getFlags());
24592 Stores.push_back(Ch);
24593 }
24594 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24595}
24596
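/// Custom lower a vector store: v2i1/v4i1/v8i1 mask values are widened and
/// stored as a scalar, 256/512-bit concatenations may be split into two
/// half-width stores, and 64-bit vectors are widened and stored through a
/// single 64-bit element (or an X86ISD::VEXTRACT_STORE node on SSE1-only
/// targets).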
24597static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
24598 SelectionDAG &DAG) {
24599 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24600 SDLoc dl(St);
24601 SDValue StoredVal = St->getValue();
24602
24603 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24604 if (StoredVal.getValueType().isVector() &&
24605 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24606 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24607 assert(NumElts <= 8 && "Unexpected VT");
24608 assert(!St->isTruncatingStore() && "Expected non-truncating store");
24609 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24610 "Expected AVX512F without AVX512DQI");
24611
24612 // We must pad with zeros to ensure we store zeroes to any unused bits.
24613 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24614 DAG.getUNDEF(MVT::v16i1), StoredVal,
24615 DAG.getIntPtrConstant(0, dl));
24616 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24617 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24618 // Make sure we store zeros in the extra bits.
24619 if (NumElts < 8)
24620 StoredVal = DAG.getZeroExtendInReg(
24621 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24622
24623 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24624 St->getPointerInfo(), St->getOriginalAlign(),
24625 St->getMemOperand()->getFlags());
24626 }
24627
24628 if (St->isTruncatingStore())
24629 return SDValue();
24630
24631 // If this is a 256-bit store of concatenated ops, we are better off splitting
24632 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24633 // and each half can execute independently. Some cores would split the op into
24634 // halves anyway, so the concat (vinsertf128) is purely an extra op.
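// e.g. a 256-bit store of concat(v4f32 A, v4f32 B) becomes two 16-byte stores
// of A and B at offsets 0 and 16 from the original base pointer.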
24635 MVT StoreVT = StoredVal.getSimpleValueType();
24636 if (StoreVT.is256BitVector() ||
24637 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24638 !Subtarget.hasBWI())) {
24639 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal.getNode(), DAG))
24640 return splitVectorStore(St, DAG);
24641 return SDValue();
24642 }
24643
24644 if (StoreVT.is32BitVector())
24645 return SDValue();
24646
24647 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24648 assert(StoreVT.is64BitVector() && "Unexpected VT");
24649 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24650 TargetLowering::TypeWidenVector &&
24651 "Unexpected type action!");
24652
24653 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24654 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24655 DAG.getUNDEF(StoreVT));
24656
24657 if (Subtarget.hasSSE2()) {
24658 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24659 // and store it.
24660 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24661 MVT CastVT = MVT::getVectorVT(StVT, 2);
24662 StoredVal = DAG.getBitcast(CastVT, StoredVal);
24663 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24664 DAG.getIntPtrConstant(0, dl));
24665
24666 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24667 St->getPointerInfo(), St->getOriginalAlign(),
24668 St->getMemOperand()->getFlags());
24669 }
24670 assert(Subtarget.hasSSE1() && "Expected SSE");
24671 SDVTList Tys = DAG.getVTList(MVT::Other);
24672 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24673 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24674 St->getMemOperand());
24675}
24676
24677// Lower vector extended loads using a shuffle. If SSSE3 is not available we
24678// may emit an illegal shuffle but the expansion is still better than scalar
24679// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
24680 // we'll emit a shuffle and an arithmetic shift.
24681// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24682// TODO: It is possible to support ZExt by zeroing the undef values during
24683// the shuffle phase or after the shuffle.
24684static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24685 SelectionDAG &DAG) {
24686 MVT RegVT = Op.getSimpleValueType();
24687 assert(RegVT.isVector() && "We only custom lower vector loads.");
24688 assert(RegVT.isInteger() &&
24689 "We only custom lower integer vector loads.");
24690
24691 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24692 SDLoc dl(Ld);
24693
24694 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24695 if (RegVT.getVectorElementType() == MVT::i1) {
24696 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24697 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24698 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24699 "Expected AVX512F without AVX512DQI");
24700
24701 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24702 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24703 Ld->getMemOperand()->getFlags());
24704
24705 // Replace chain users with the new chain.
24706 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24707
24708 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24709 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24710 DAG.getBitcast(MVT::v16i1, Val),
24711 DAG.getIntPtrConstant(0, dl));
24712 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24713 }
24714
24715 return SDValue();
24716}
24717
24718/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24719/// each of which has no other use apart from the AND / OR.
24720static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24721 Opc = Op.getOpcode();
24722 if (Opc != ISD::OR && Opc != ISD::AND)
24723 return false;
24724 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24725 Op.getOperand(0).hasOneUse() &&
24726 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24727 Op.getOperand(1).hasOneUse());
24728}
24729
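/// Custom lower BRCOND: integer and FP setcc conditions, as well as overflow
/// intrinsic results, are folded directly into an X86ISD::BRCOND on EFLAGS;
/// any other condition is masked with 1 and branched on a compare against
/// zero.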
24730SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24731 SDValue Chain = Op.getOperand(0);
24732 SDValue Cond = Op.getOperand(1);
24733 SDValue Dest = Op.getOperand(2);
24734 SDLoc dl(Op);
24735
24736 // Bail out when we don't have native compare instructions.
24737 if (Cond.getOpcode() == ISD::SETCC &&
24738 Cond.getOperand(0).getValueType() != MVT::f128 &&
24739 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
24740 SDValue LHS = Cond.getOperand(0);
24741 SDValue RHS = Cond.getOperand(1);
24742 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24743
24744 // Special case for
24745 // setcc([su]{add,sub,mul}o == 0)
24746 // setcc([su]{add,sub,mul}o != 1)
24747 if (ISD::isOverflowIntrOpRes(LHS) &&
24748 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24749 (isNullConstant(RHS) || isOneConstant(RHS))) {
24750 SDValue Value, Overflow;
24751 X86::CondCode X86Cond;
24752 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24753
24754 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24755 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24756
24757 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24758 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24759 Overflow);
24760 }
24761
24762 if (LHS.getSimpleValueType().isInteger()) {
24763 SDValue CCVal;
24764 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24765 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24766 EFLAGS);
24767 }
24768
24769 if (CC == ISD::SETOEQ) {
24770 // For FCMP_OEQ, we can emit
24771 // two branches instead of an explicit AND instruction with a
24772 // separate test. However, we only do this if this block doesn't
24773 // have a fall-through edge, because this requires an explicit
24774 // jmp when the condition is false.
24775 if (Op.getNode()->hasOneUse()) {
24776 SDNode *User = *Op.getNode()->use_begin();
24777 // Look for an unconditional branch following this conditional branch.
24778 // We need this because we need to reverse the successors in order
24779 // to implement FCMP_OEQ.
24780 if (User->getOpcode() == ISD::BR) {
24781 SDValue FalseBB = User->getOperand(1);
24782 SDNode *NewBR =
24783 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24784 assert(NewBR == User);
24785 (void)NewBR;
24786 Dest = FalseBB;
24787
24788 SDValue Cmp =
24789 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24790 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24791 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24792 CCVal, Cmp);
24793 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24794 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24795 Cmp);
24796 }
24797 }
24798 } else if (CC == ISD::SETUNE) {
24799 // For FCMP_UNE, we can emit
24800 // two branches instead of an explicit OR instruction with a
24801 // separate test.
24802 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24803 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24804 Chain =
24805 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24806 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24807 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24808 Cmp);
24809 } else {
24810 X86::CondCode X86Cond =
24811 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24812 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24813 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24814 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24815 Cmp);
24816 }
24817 }
24818
24819 if (ISD::isOverflowIntrOpRes(Cond)) {
24820 SDValue Value, Overflow;
24821 X86::CondCode X86Cond;
24822 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24823
24824 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24825 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24826 Overflow);
24827 }
24828
24829 // Look past the truncate if the high bits are known zero.
24830 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24831 Cond = Cond.getOperand(0);
24832
24833 EVT CondVT = Cond.getValueType();
24834
24835 // Add an AND with 1 if we don't already have one.
24836 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24837 Cond =
24838 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24839
24840 SDValue LHS = Cond;
24841 SDValue RHS = DAG.getConstant(0, dl, CondVT);
24842
24843 SDValue CCVal;
24844 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24845 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24846 EFLAGS);
24847}
24848
24849// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
24850// Calls to _alloca are needed to probe the stack when allocating more than 4k
24851// bytes in one go. Touching the stack at 4K increments is necessary to ensure
24852// that the guard pages used by the OS virtual memory manager are allocated in
24853// correct sequence.
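// Three strategies are used below: a plain stack-pointer adjustment (or
// X86ISD::PROBED_ALLOCA when inline stack probes are required) when no special
// lowering is needed, X86ISD::SEG_ALLOCA for segmented stacks, and
// X86ISD::DYN_ALLOCA when a stack-probe call must be emitted.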
24854SDValue
24855X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
24856 SelectionDAG &DAG) const {
24857 MachineFunction &MF = DAG.getMachineFunction();
24858 bool SplitStack = MF.shouldSplitStack();
24859 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
24860 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24861 SplitStack || EmitStackProbeCall;
24862 SDLoc dl(Op);
24863
24864 // Get the inputs.
24865 SDNode *Node = Op.getNode();
24866 SDValue Chain = Op.getOperand(0);
24867 SDValue Size = Op.getOperand(1);
24868 MaybeAlign Alignment(Op.getConstantOperandVal(2));
24869 EVT VT = Node->getValueType(0);
24870
24871 // Chain the dynamic stack allocation so that it doesn't modify the stack
24872 // pointer when other instructions are using the stack.
24873 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24874
24875 bool Is64Bit = Subtarget.is64Bit();
24876 MVT SPTy = getPointerTy(DAG.getDataLayout());
24877
24879 if (!Lower) {
24880 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24881 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24882 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
24883 " not tell us which reg is the stack pointer!");
24884
24885 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24886 const Align StackAlign = TFI.getStackAlign();
24887 if (hasInlineStackProbe(MF)) {
24888 MachineRegisterInfo &MRI = MF.getRegInfo();
24889
24890 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24891 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24892 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24893 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24894 DAG.getRegister(Vreg, SPTy));
24895 } else {
24896 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24897 Chain = SP.getValue(1);
24898 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24899 }
24900 if (Alignment && *Alignment > StackAlign)
24901 Result =
24902 DAG.getNode(ISD::AND, dl, VT, Result,
24903 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24904 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
24905 } else if (SplitStack) {
24906 MachineRegisterInfo &MRI = MF.getRegInfo();
24907
24908 if (Is64Bit) {
24909 // The 64-bit implementation of segmented stacks needs to clobber both r10
24910 // and r11. This makes it impossible to use it along with nested parameters.
24911 const Function &F = MF.getFunction();
24912 for (const auto &A : F.args()) {
24913 if (A.hasNestAttr())
24914 report_fatal_error("Cannot use segmented stacks with functions that "
24915 "have nested arguments.");
24916 }
24917 }
24918
24919 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24920 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24921 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24922 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
24923 DAG.getRegister(Vreg, SPTy));
24924 } else {
24925 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
24926 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
24927 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
24928
24929 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24930 Register SPReg = RegInfo->getStackRegister();
24931 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
24932 Chain = SP.getValue(1);
24933
24934 if (Alignment) {
24935 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
24936 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24937 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
24938 }
24939
24940 Result = SP;
24941 }
24942
24943 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
24944
24945 SDValue Ops[2] = {Result, Chain};
24946 return DAG.getMergeValues(Ops, dl);
24947}
24948
24949 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
24950 MachineFunction &MF = DAG.getMachineFunction();
24951 auto PtrVT = getPointerTy(MF.getDataLayout());
24952 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
24953
24954 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24955 SDLoc DL(Op);
24956
24957 if (!Subtarget.is64Bit() ||
24958 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
24959 // vastart just stores the address of the VarArgsFrameIndex slot into the
24960 // memory location argument.
24961 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24962 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
24963 MachinePointerInfo(SV));
24964 }
24965
24966 // __va_list_tag:
24967 // gp_offset (0 - 6 * 8)
24968 // fp_offset (48 - 48 + 8 * 16)
24969 // overflow_arg_area (point to parameters coming in memory).
24970 // reg_save_area
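// With the LP64 ABI the four fields are stored below at byte offsets 0, 4, 8
// and 16 respectively (reg_save_area drops to offset 12 on x32).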
24971 SmallVector<SDValue, 8> MemOps;
24972 SDValue FIN = Op.getOperand(1);
24973 // Store gp_offset
24974 SDValue Store = DAG.getStore(
24975 Op.getOperand(0), DL,
24976 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
24977 MachinePointerInfo(SV));
24978 MemOps.push_back(Store);
24979
24980 // Store fp_offset
24981 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
24982 Store = DAG.getStore(
24983 Op.getOperand(0), DL,
24984 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
24985 MachinePointerInfo(SV, 4));
24986 MemOps.push_back(Store);
24987
24988 // Store ptr to overflow_arg_area
24989 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
24990 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24991 Store =
24992 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
24993 MemOps.push_back(Store);
24994
24995 // Store ptr to reg_save_area.
24996 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
24997 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
24998 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
24999 Store = DAG.getStore(
25000 Op.getOperand(0), DL, RSFIN, FIN,
25001 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25002 MemOps.push_back(Store);
25003 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25004}
25005
25006SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25007 assert(Subtarget.is64Bit() &&
25008 "LowerVAARG only handles 64-bit va_arg!");
25009 assert(Op.getNumOperands() == 4);
25010
25011 MachineFunction &MF = DAG.getMachineFunction();
25012 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25013 // The Win64 ABI uses char* instead of a structure.
25014 return DAG.expandVAArg(Op.getNode());
25015
25016 SDValue Chain = Op.getOperand(0);
25017 SDValue SrcPtr = Op.getOperand(1);
25018 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25019 unsigned Align = Op.getConstantOperandVal(3);
25020 SDLoc dl(Op);
25021
25022 EVT ArgVT = Op.getNode()->getValueType(0);
25023 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25024 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25025 uint8_t ArgMode;
25026
25027 // Decide which area this value should be read from.
25028 // TODO: Implement the AMD64 ABI in its entirety. This simple
25029 // selection mechanism works only for the basic types.
25030 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25031 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25032 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25033 } else {
25034 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25035 "Unhandled argument type in LowerVAARG");
25036 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25037 }
25038
25039 if (ArgMode == 2) {
25040 // Make sure using fp_offset makes sense.
25041 assert(!Subtarget.useSoftFloat() &&
25042 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25043 Subtarget.hasSSE1());
25044 }
25045
25046 // Insert VAARG node into the DAG
25047 // VAARG returns two values: Variable Argument Address, Chain
25048 SDValue InstOps[] = {Chain, SrcPtr,
25049 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25050 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25051 DAG.getTargetConstant(Align, dl, MVT::i32)};
25052 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
25053 SDValue VAARG = DAG.getMemIntrinsicNode(
25054 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
25055 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
25056 /*Alignment=*/std::nullopt,
25057 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
25058 Chain = VAARG.getValue(1);
25059
25060 // Load the next argument and return it
25061 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
25062}
25063
25064static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
25065 SelectionDAG &DAG) {
25066 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25067 // where a va_list is still an i8*.
25068 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25069 if (Subtarget.isCallingConvWin64(
25070 DAG.getMachineFunction().getFunction().getCallingConv()))
25071 // Probably a Win64 va_copy.
25072 return DAG.expandVACopy(Op.getNode());
25073
25074 SDValue Chain = Op.getOperand(0);
25075 SDValue DstPtr = Op.getOperand(1);
25076 SDValue SrcPtr = Op.getOperand(2);
25077 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25078 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25079 SDLoc DL(Op);
25080
25081 return DAG.getMemcpy(
25082 Chain, DL, DstPtr, SrcPtr,
25083 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
25084 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
25085 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
25086}
25087
25088// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
25089static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
25090 switch (Opc) {
25091 case ISD::SHL:
25092 case X86ISD::VSHL:
25093 case X86ISD::VSHLI:
25094 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
25095 case ISD::SRL:
25096 case X86ISD::VSRL:
25097 case X86ISD::VSRLI:
25098 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
25099 case ISD::SRA:
25100 case X86ISD::VSRA:
25101 case X86ISD::VSRAI:
25102 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
25103 }
25104 llvm_unreachable("Unknown target vector shift node");
25105}
25106
25107/// Handle vector element shifts where the shift amount is a constant.
25108/// Takes immediate version of shift as input.
25109static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
25110 SDValue SrcOp, uint64_t ShiftAmt,
25111 SelectionDAG &DAG) {
25112 MVT ElementType = VT.getVectorElementType();
25113
25114 // Bitcast the source vector to the output type; this is mainly necessary for
25115 // vXi8/vXi64 shifts.
25116 if (VT != SrcOp.getSimpleValueType())
25117 SrcOp = DAG.getBitcast(VT, SrcOp);
25118
25119 // Fold this packed shift into its first operand if ShiftAmt is 0.
25120 if (ShiftAmt == 0)
25121 return SrcOp;
25122
25123 // Check for ShiftAmt >= element width
25124 if (ShiftAmt >= ElementType.getSizeInBits()) {
25125 if (Opc == X86ISD::VSRAI)
25126 ShiftAmt = ElementType.getSizeInBits() - 1;
25127 else
25128 return DAG.getConstant(0, dl, VT);
25129 }
25130
25131 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
25132 && "Unknown target vector shift-by-constant node");
25133
25134 // Fold this packed vector shift into a build vector if SrcOp is a
25135 // vector of Constants or UNDEFs.
25136 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
25137 unsigned ShiftOpc;
25138 switch (Opc) {
25139 default: llvm_unreachable("Unknown opcode!");
25140 case X86ISD::VSHLI:
25141 ShiftOpc = ISD::SHL;
25142 break;
25143 case X86ISD::VSRLI:
25144 ShiftOpc = ISD::SRL;
25145 break;
25146 case X86ISD::VSRAI:
25147 ShiftOpc = ISD::SRA;
25148 break;
25149 }
25150
25151 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
25152 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
25153 return C;
25154 }
25155
25156 return DAG.getNode(Opc, dl, VT, SrcOp,
25157 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
25158}
25159
25160/// Handle vector element shifts by a splat shift amount
25161static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
25162 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
25163 const X86Subtarget &Subtarget,
25164 SelectionDAG &DAG) {
25165 MVT AmtVT = ShAmt.getSimpleValueType();
25166 assert(AmtVT.isVector() && "Vector shift type mismatch");
25167 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
25168 "Illegal vector splat index");
25169
25170 // Move the splat element to the bottom element.
25171 if (ShAmtIdx != 0) {
25172 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
25173 Mask[0] = ShAmtIdx;
25174 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
25175 }
25176
25177 // Peek through any zext node if we can get back to a 128-bit source.
25178 if (AmtVT.getScalarSizeInBits() == 64 &&
25179 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
25180 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
25181 ShAmt.getOperand(0).getValueType().isSimple() &&
25182 ShAmt.getOperand(0).getValueType().is128BitVector()) {
25183 ShAmt = ShAmt.getOperand(0);
25184 AmtVT = ShAmt.getSimpleValueType();
25185 }
25186
25187 // See if we can mask off the upper elements using the existing source node.
25188 // The shift uses the entire lower 64-bits of the amount vector, so no need to
25189 // do this for vXi64 types.
25190 bool IsMasked = false;
25191 if (AmtVT.getScalarSizeInBits() < 64) {
25192 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
25193 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
25194 // If the shift amount has come from a scalar, then zero-extend the scalar
25195 // before moving to the vector.
25196 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
25197 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25198 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
25199 AmtVT = MVT::v4i32;
25200 IsMasked = true;
25201 } else if (ShAmt.getOpcode() == ISD::AND) {
25202 // See if the shift amount is already masked (e.g. for rotation modulo),
25203 // then we can zero-extend it by setting all the other mask elements to
25204 // zero.
25205 SmallVector<SDValue> MaskElts(
25206 AmtVT.getVectorNumElements(),
25207 DAG.getConstant(0, dl, AmtVT.getScalarType()));
25208 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
25209 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
25210 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
25211 {ShAmt.getOperand(1), Mask}))) {
25212 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
25213 IsMasked = true;
25214 }
25215 }
25216 }
25217
25218 // Extract if the shift amount vector is larger than 128-bits.
25219 if (AmtVT.getSizeInBits() > 128) {
25220 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
25221 AmtVT = ShAmt.getSimpleValueType();
25222 }
25223
25224 // Zero-extend bottom element to v2i64 vector type, either by extension or
25225 // shuffle masking.
25226 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
25227 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
25228 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
25229 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25230 } else if (Subtarget.hasSSE41()) {
25231 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25232 MVT::v2i64, ShAmt);
25233 } else {
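// Without SSE41 zero-extension, shift the amount to the top of a v16i8 and
// back down again so that every byte above the bottom element is cleared.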
25234 SDValue ByteShift = DAG.getTargetConstant(
25235 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25236 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25237 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25238 ByteShift);
25239 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25240 ByteShift);
25241 }
25242 }
25243
25244 // Change opcode to non-immediate version.
25245 Opc = getTargetVShiftUniformOpcode(Opc, true);
25246
25247 // The return type has to be a 128-bit type with the same element
25248 // type as the input type.
25249 MVT EltVT = VT.getVectorElementType();
25250 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25251
25252 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25253 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25254}
25255
25256/// Return Mask with the necessary casting or extending
25257/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25258static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25259 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25260 const SDLoc &dl) {
25261
25262 if (isAllOnesConstant(Mask))
25263 return DAG.getConstant(1, dl, MaskVT);
25264 if (X86::isZeroNode(Mask))
25265 return DAG.getConstant(0, dl, MaskVT);
25266
25267 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25268
25269 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25270 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25271 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25272 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
25273 SDValue Lo, Hi;
25274 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
25275 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25276 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25277 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25278 } else {
25279 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25280 Mask.getSimpleValueType().getSizeInBits());
25281 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements are
25282 // extracted by EXTRACT_SUBVECTOR.
25283 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25284 DAG.getBitcast(BitcastVT, Mask),
25285 DAG.getIntPtrConstant(0, dl));
25286 }
25287}
25288
25289/// Return (and \p Op, \p Mask) for compare instructions or
25290/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25291/// necessary casting or extending for \p Mask when lowering masking intrinsics
25292 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25293 SDValue PreservedSrc,
25294 const X86Subtarget &Subtarget,
25295 SelectionDAG &DAG) {
25296 MVT VT = Op.getSimpleValueType();
25297 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25298 unsigned OpcodeSelect = ISD::VSELECT;
25299 SDLoc dl(Op);
25300
25301 if (isAllOnesConstant(Mask))
25302 return Op;
25303
25304 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25305
25306 if (PreservedSrc.isUndef())
25307 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25308 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25309}
25310
25311/// Creates an SDNode for a predicated scalar operation.
25312/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25313 /// The mask comes in as MVT::i8 and should be transformed
25314/// to MVT::v1i1 while lowering masking intrinsics.
25315/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25316/// "X86select" instead of "vselect". We just can't create the "vselect" node
25317/// for a scalar instruction.
25318 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25319 SDValue PreservedSrc,
25320 const X86Subtarget &Subtarget,
25321 SelectionDAG &DAG) {
25322
25323 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25324 if (MaskConst->getZExtValue() & 0x1)
25325 return Op;
25326
25327 MVT VT = Op.getSimpleValueType();
25328 SDLoc dl(Op);
25329
25330 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
25331 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25332 DAG.getBitcast(MVT::v8i1, Mask),
25333 DAG.getIntPtrConstant(0, dl));
25334 if (Op.getOpcode() == X86ISD::FSETCCM ||
25335 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25336 Op.getOpcode() == X86ISD::VFPCLASSS)
25337 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25338
25339 if (PreservedSrc.isUndef())
25340 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25341 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25342}
25343
25344 static int getSEHRegistrationNodeSize(const Function *Fn) {
25345 if (!Fn->hasPersonalityFn())
25346 report_fatal_error(
25347 "querying registration node size for function without personality");
25348 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25349 // WinEHStatePass for the full struct definition.
25350 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25351 case EHPersonality::MSVC_X86SEH: return 24;
25352 case EHPersonality::MSVC_CXX: return 16;
25353 default: break;
25354 }
25355 report_fatal_error(
25356 "can only recover FP for 32-bit MSVC EH personality functions");
25357}
25358
25359/// When the MSVC runtime transfers control to us, either to an outlined
25360/// function or when returning to a parent frame after catching an exception, we
25361/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25362/// Here's the math:
25363/// RegNodeBase = EntryEBP - RegNodeSize
25364/// ParentFP = RegNodeBase - ParentFrameOffset
25365/// Subtracting RegNodeSize takes us to the offset of the registration node, and
25366/// subtracting the offset (negative on x86) takes us back to the parent FP.
25367 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25368 SDValue EntryEBP) {
25369 MachineFunction &MF = DAG.getMachineFunction();
25370 SDLoc dl;
25371
25372 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25373 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25374
25375 // It's possible that the parent function no longer has a personality function
25376 // if the exceptional code was optimized away, in which case we just return
25377 // the incoming EBP.
25378 if (!Fn->hasPersonalityFn())
25379 return EntryEBP;
25380
25381 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25382 // registration, or the .set_setframe offset.
25383 MCSymbol *OffsetSym =
25386 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25387 SDValue ParentFrameOffset =
25388 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25389
25390 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25391 // prologue to RBP in the parent function.
25392 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
25393 if (Subtarget.is64Bit())
25394 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25395
25396 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25397 // RegNodeBase = EntryEBP - RegNodeSize
25398 // ParentFP = RegNodeBase - ParentFrameOffset
25399 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25400 DAG.getConstant(RegNodeSize, dl, PtrVT));
25401 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25402}
25403
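/// Lower chain-free X86 intrinsics: look the intrinsic up with
/// getIntrinsicWithoutChain and dispatch on the IntrinsicData type, validating
/// any rounding-mode / SAE operand with the lambdas defined at the top of the
/// function.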
25404SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25405 SelectionDAG &DAG) const {
25406 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25407 auto isRoundModeCurDirection = [](SDValue Rnd) {
25408 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25409 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25410
25411 return false;
25412 };
25413 auto isRoundModeSAE = [](SDValue Rnd) {
25414 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25415 unsigned RC = C->getZExtValue();
25416 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25417 // Clear the NO_EXC bit and check remaining bits.
25418 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25419 // As a convenience we allow no other bits or explicitly
25420 // current direction.
25421 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25422 }
25423 }
25424
25425 return false;
25426 };
25427 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25428 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25429 RC = C->getZExtValue();
25430 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25431 // Clear the NO_EXC bit and check remaining bits.
25432 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25433 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25434 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25435 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25436 RC == X86::STATIC_ROUNDING::TO_ZERO;
25437 }
25438 }
25439
25440 return false;
25441 };
25442
25443 SDLoc dl(Op);
25444 unsigned IntNo = Op.getConstantOperandVal(0);
25445 MVT VT = Op.getSimpleValueType();
25446 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25447
25448 // Propagate flags from original node to transformed node(s).
25449 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25450
25451 if (IntrData) {
25452 switch(IntrData->Type) {
25453 case INTR_TYPE_1OP: {
25454 // We specify 2 possible opcodes for intrinsics with rounding modes.
25455 // First, we check if the intrinsic may have non-default rounding mode,
25456 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25457 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25458 if (IntrWithRoundingModeOpcode != 0) {
25459 SDValue Rnd = Op.getOperand(2);
25460 unsigned RC = 0;
25461 if (isRoundModeSAEToX(Rnd, RC))
25462 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25463 Op.getOperand(1),
25464 DAG.getTargetConstant(RC, dl, MVT::i32));
25465 if (!isRoundModeCurDirection(Rnd))
25466 return SDValue();
25467 }
25468 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25469 Op.getOperand(1));
25470 }
25471 case INTR_TYPE_1OP_SAE: {
25472 SDValue Sae = Op.getOperand(2);
25473
25474 unsigned Opc;
25475 if (isRoundModeCurDirection(Sae))
25476 Opc = IntrData->Opc0;
25477 else if (isRoundModeSAE(Sae))
25478 Opc = IntrData->Opc1;
25479 else
25480 return SDValue();
25481
25482 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25483 }
25484 case INTR_TYPE_2OP: {
25485 SDValue Src2 = Op.getOperand(2);
25486
25487 // We specify 2 possible opcodes for intrinsics with rounding modes.
25488 // First, we check if the intrinsic may have non-default rounding mode,
25489 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25490 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25491 if (IntrWithRoundingModeOpcode != 0) {
25492 SDValue Rnd = Op.getOperand(3);
25493 unsigned RC = 0;
25494 if (isRoundModeSAEToX(Rnd, RC))
25495 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25496 Op.getOperand(1), Src2,
25497 DAG.getTargetConstant(RC, dl, MVT::i32));
25498 if (!isRoundModeCurDirection(Rnd))
25499 return SDValue();
25500 }
25501
25502 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25503 Op.getOperand(1), Src2);
25504 }
25505 case INTR_TYPE_2OP_SAE: {
25506 SDValue Sae = Op.getOperand(3);
25507
25508 unsigned Opc;
25509 if (isRoundModeCurDirection(Sae))
25510 Opc = IntrData->Opc0;
25511 else if (isRoundModeSAE(Sae))
25512 Opc = IntrData->Opc1;
25513 else
25514 return SDValue();
25515
25516 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25517 Op.getOperand(2));
25518 }
25519 case INTR_TYPE_3OP:
25520 case INTR_TYPE_3OP_IMM8: {
25521 SDValue Src1 = Op.getOperand(1);
25522 SDValue Src2 = Op.getOperand(2);
25523 SDValue Src3 = Op.getOperand(3);
25524
25525 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25526 Src3.getValueType() != MVT::i8) {
25527 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
25528 }
25529
25530 // We specify 2 possible opcodes for intrinsics with rounding modes.
25531 // First, we check if the intrinsic may have non-default rounding mode,
25532 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25533 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25534 if (IntrWithRoundingModeOpcode != 0) {
25535 SDValue Rnd = Op.getOperand(4);
25536 unsigned RC = 0;
25537 if (isRoundModeSAEToX(Rnd, RC))
25538 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25539 Src1, Src2, Src3,
25540 DAG.getTargetConstant(RC, dl, MVT::i32));
25541 if (!isRoundModeCurDirection(Rnd))
25542 return SDValue();
25543 }
25544
25545 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25546 {Src1, Src2, Src3});
25547 }
25548 case INTR_TYPE_4OP_IMM8: {
25549 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25550 SDValue Src4 = Op.getOperand(4);
25551 if (Src4.getValueType() != MVT::i8) {
25552 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
25553 }
25554
25555 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25556 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25557 Src4);
25558 }
25559 case INTR_TYPE_1OP_MASK: {
25560 SDValue Src = Op.getOperand(1);
25561 SDValue PassThru = Op.getOperand(2);
25562 SDValue Mask = Op.getOperand(3);
25563 // We add rounding mode to the Node when
25564 // - RC Opcode is specified and
25565 // - RC is not "current direction".
25566 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25567 if (IntrWithRoundingModeOpcode != 0) {
25568 SDValue Rnd = Op.getOperand(4);
25569 unsigned RC = 0;
25570 if (isRoundModeSAEToX(Rnd, RC))
25571 return getVectorMaskingNode(
25572 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25573 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25574 Mask, PassThru, Subtarget, DAG);
25575 if (!isRoundModeCurDirection(Rnd))
25576 return SDValue();
25577 }
25578 return getVectorMaskingNode(
25579 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25580 Subtarget, DAG);
25581 }
25582 case INTR_TYPE_1OP_MASK_SAE: {
25583 SDValue Src = Op.getOperand(1);
25584 SDValue PassThru = Op.getOperand(2);
25585 SDValue Mask = Op.getOperand(3);
25586 SDValue Rnd = Op.getOperand(4);
25587
25588 unsigned Opc;
25589 if (isRoundModeCurDirection(Rnd))
25590 Opc = IntrData->Opc0;
25591 else if (isRoundModeSAE(Rnd))
25592 Opc = IntrData->Opc1;
25593 else
25594 return SDValue();
25595
25596 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25597 Subtarget, DAG);
25598 }
25599 case INTR_TYPE_SCALAR_MASK: {
25600 SDValue Src1 = Op.getOperand(1);
25601 SDValue Src2 = Op.getOperand(2);
25602 SDValue passThru = Op.getOperand(3);
25603 SDValue Mask = Op.getOperand(4);
25604 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25605 // There are 2 kinds of intrinsics in this group:
25606 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
25607 // (2) With rounding mode and sae - 7 operands.
25608 bool HasRounding = IntrWithRoundingModeOpcode != 0;
25609 if (Op.getNumOperands() == (5U + HasRounding)) {
25610 if (HasRounding) {
25611 SDValue Rnd = Op.getOperand(5);
25612 unsigned RC = 0;
25613 if (isRoundModeSAEToX(Rnd, RC))
25614 return getScalarMaskingNode(
25615 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25616 DAG.getTargetConstant(RC, dl, MVT::i32)),
25617 Mask, passThru, Subtarget, DAG);
25618 if (!isRoundModeCurDirection(Rnd))
25619 return SDValue();
25620 }
25621 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25622 Src2),
25623 Mask, passThru, Subtarget, DAG);
25624 }
25625
25626 assert(Op.getNumOperands() == (6U + HasRounding) &&
25627 "Unexpected intrinsic form");
25628 SDValue RoundingMode = Op.getOperand(5);
25629 unsigned Opc = IntrData->Opc0;
25630 if (HasRounding) {
25631 SDValue Sae = Op.getOperand(6);
25632 if (isRoundModeSAE(Sae))
25633 Opc = IntrWithRoundingModeOpcode;
25634 else if (!isRoundModeCurDirection(Sae))
25635 return SDValue();
25636 }
25637 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25638 Src2, RoundingMode),
25639 Mask, passThru, Subtarget, DAG);
25640 }
25641 case INTR_TYPE_SCALAR_MASK_RND: {
25642 SDValue Src1 = Op.getOperand(1);
25643 SDValue Src2 = Op.getOperand(2);
25644 SDValue passThru = Op.getOperand(3);
25645 SDValue Mask = Op.getOperand(4);
25646 SDValue Rnd = Op.getOperand(5);
25647
25648 SDValue NewOp;
25649 unsigned RC = 0;
25650 if (isRoundModeCurDirection(Rnd))
25651 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25652 else if (isRoundModeSAEToX(Rnd, RC))
25653 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25654 DAG.getTargetConstant(RC, dl, MVT::i32));
25655 else
25656 return SDValue();
25657
25658 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25659 }
25660 case INTR_TYPE_SCALAR_MASK_SAE: {
25661 SDValue Src1 = Op.getOperand(1);
25662 SDValue Src2 = Op.getOperand(2);
25663 SDValue passThru = Op.getOperand(3);
25664 SDValue Mask = Op.getOperand(4);
25665 SDValue Sae = Op.getOperand(5);
25666 unsigned Opc;
25667 if (isRoundModeCurDirection(Sae))
25668 Opc = IntrData->Opc0;
25669 else if (isRoundModeSAE(Sae))
25670 Opc = IntrData->Opc1;
25671 else
25672 return SDValue();
25673
25674 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25675 Mask, passThru, Subtarget, DAG);
25676 }
25677 case INTR_TYPE_2OP_MASK: {
25678 SDValue Src1 = Op.getOperand(1);
25679 SDValue Src2 = Op.getOperand(2);
25680 SDValue PassThru = Op.getOperand(3);
25681 SDValue Mask = Op.getOperand(4);
25682 SDValue NewOp;
25683 if (IntrData->Opc1 != 0) {
25684 SDValue Rnd = Op.getOperand(5);
25685 unsigned RC = 0;
25686 if (isRoundModeSAEToX(Rnd, RC))
25687 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25688 DAG.getTargetConstant(RC, dl, MVT::i32));
25689 else if (!isRoundModeCurDirection(Rnd))
25690 return SDValue();
25691 }
25692 if (!NewOp)
25693 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25694 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25695 }
25696 case INTR_TYPE_2OP_MASK_SAE: {
25697 SDValue Src1 = Op.getOperand(1);
25698 SDValue Src2 = Op.getOperand(2);
25699 SDValue PassThru = Op.getOperand(3);
25700 SDValue Mask = Op.getOperand(4);
25701
25702 unsigned Opc = IntrData->Opc0;
25703 if (IntrData->Opc1 != 0) {
25704 SDValue Sae = Op.getOperand(5);
25705 if (isRoundModeSAE(Sae))
25706 Opc = IntrData->Opc1;
25707 else if (!isRoundModeCurDirection(Sae))
25708 return SDValue();
25709 }
25710
25711 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25712 Mask, PassThru, Subtarget, DAG);
25713 }
25714 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25715 SDValue Src1 = Op.getOperand(1);
25716 SDValue Src2 = Op.getOperand(2);
25717 SDValue Src3 = Op.getOperand(3);
25718 SDValue PassThru = Op.getOperand(4);
25719 SDValue Mask = Op.getOperand(5);
25720 SDValue Sae = Op.getOperand(6);
25721 unsigned Opc;
25722 if (isRoundModeCurDirection(Sae))
25723 Opc = IntrData->Opc0;
25724 else if (isRoundModeSAE(Sae))
25725 Opc = IntrData->Opc1;
25726 else
25727 return SDValue();
25728
25729 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25730 Mask, PassThru, Subtarget, DAG);
25731 }
25732 case INTR_TYPE_3OP_MASK_SAE: {
25733 SDValue Src1 = Op.getOperand(1);
25734 SDValue Src2 = Op.getOperand(2);
25735 SDValue Src3 = Op.getOperand(3);
25736 SDValue PassThru = Op.getOperand(4);
25737 SDValue Mask = Op.getOperand(5);
25738
25739 unsigned Opc = IntrData->Opc0;
25740 if (IntrData->Opc1 != 0) {
25741 SDValue Sae = Op.getOperand(6);
25742 if (isRoundModeSAE(Sae))
25743 Opc = IntrData->Opc1;
25744 else if (!isRoundModeCurDirection(Sae))
25745 return SDValue();
25746 }
25747 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25748 Mask, PassThru, Subtarget, DAG);
25749 }
25750 case BLENDV: {
25751 SDValue Src1 = Op.getOperand(1);
25752 SDValue Src2 = Op.getOperand(2);
25753 SDValue Src3 = Op.getOperand(3);
25754
25755 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25756 Src3 = DAG.getBitcast(MaskVT, Src3);
25757
25758 // Reverse the operands to match VSELECT order.
25759 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25760 }
25761 case VPERM_2OP : {
25762 SDValue Src1 = Op.getOperand(1);
25763 SDValue Src2 = Op.getOperand(2);
25764
25765 // Swap Src1 and Src2 in the node creation
25766 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
25767 }
25768 case CFMA_OP_MASKZ:
25769 case CFMA_OP_MASK: {
25770 SDValue Src1 = Op.getOperand(1);
25771 SDValue Src2 = Op.getOperand(2);
25772 SDValue Src3 = Op.getOperand(3);
25773 SDValue Mask = Op.getOperand(4);
25774 MVT VT = Op.getSimpleValueType();
25775
25776 SDValue PassThru = Src3;
25777 if (IntrData->Type == CFMA_OP_MASKZ)
25778 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
25779
25780 // We add rounding mode to the Node when
25781 // - RC Opcode is specified and
25782 // - RC is not "current direction".
25783 SDValue NewOp;
25784 if (IntrData->Opc1 != 0) {
25785 SDValue Rnd = Op.getOperand(5);
25786 unsigned RC = 0;
25787 if (isRoundModeSAEToX(Rnd, RC))
25788 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
25789 DAG.getTargetConstant(RC, dl, MVT::i32));
25790 else if (!isRoundModeCurDirection(Rnd))
25791 return SDValue();
25792 }
25793 if (!NewOp)
25794 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
25795 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25796 }
25797 case IFMA_OP:
25798 // NOTE: We need to swizzle the operands to pass the multiply operands
25799 // first.
25800 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25801 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25802 case FPCLASSS: {
25803 SDValue Src1 = Op.getOperand(1);
25804 SDValue Imm = Op.getOperand(2);
25805 SDValue Mask = Op.getOperand(3);
25806 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25807 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
25808 Subtarget, DAG);
25809 // Need to fill with zeros to ensure the bitcast will produce zeroes
25810 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25811 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25812 DAG.getConstant(0, dl, MVT::v8i1),
25813 FPclassMask, DAG.getIntPtrConstant(0, dl));
25814 return DAG.getBitcast(MVT::i8, Ins);
25815 }
25816
25817 case CMP_MASK_CC: {
25818 MVT MaskVT = Op.getSimpleValueType();
25819 SDValue CC = Op.getOperand(3);
25820 SDValue Mask = Op.getOperand(4);
25821 // We specify 2 possible opcodes for intrinsics with rounding modes.
25822 // First, we check if the intrinsic may have non-default rounding mode,
25823 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25824 if (IntrData->Opc1 != 0) {
25825 SDValue Sae = Op.getOperand(5);
25826 if (isRoundModeSAE(Sae))
25827 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25828 Op.getOperand(2), CC, Mask, Sae);
25829 if (!isRoundModeCurDirection(Sae))
25830 return SDValue();
25831 }
25832 //default rounding mode
25833 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25834 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25835 }
25836 case CMP_MASK_SCALAR_CC: {
25837 SDValue Src1 = Op.getOperand(1);
25838 SDValue Src2 = Op.getOperand(2);
25839 SDValue CC = Op.getOperand(3);
25840 SDValue Mask = Op.getOperand(4);
25841
25842 SDValue Cmp;
25843 if (IntrData->Opc1 != 0) {
25844 SDValue Sae = Op.getOperand(5);
25845 if (isRoundModeSAE(Sae))
25846 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25847 else if (!isRoundModeCurDirection(Sae))
25848 return SDValue();
25849 }
25850 //default rounding mode
25851 if (!Cmp.getNode())
25852 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25853
25854 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25855 Subtarget, DAG);
25856 // Need to fill with zeros to ensure the bitcast will produce zeroes
25857 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25858 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25859 DAG.getConstant(0, dl, MVT::v8i1),
25860 CmpMask, DAG.getIntPtrConstant(0, dl));
25861 return DAG.getBitcast(MVT::i8, Ins);
25862 }
25863 case COMI: { // Comparison intrinsics
25864 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25865 SDValue LHS = Op.getOperand(1);
25866 SDValue RHS = Op.getOperand(2);
25867 // Some conditions require the operands to be swapped.
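// e.g. SETLT(a, b) is evaluated as SETGT(b, a), so only the unsigned "above"
// conditions COND_A / COND_AE are needed after the swap.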
25868 if (CC == ISD::SETLT || CC == ISD::SETLE)
25869 std::swap(LHS, RHS);
25870
25871 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25872 SDValue SetCC;
25873 switch (CC) {
25874 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
25875 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25876 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25877 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25878 break;
25879 }
25880 case ISD::SETNE: { // (ZF = 1 or PF = 1)
25881 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25882 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25883 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25884 break;
25885 }
25886 case ISD::SETGT: // (CF = 0 and ZF = 0)
25887 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25888 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25889 break;
25890 }
25891 case ISD::SETGE: // CF = 0
25892 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25893 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25894 break;
25895 default:
25896 llvm_unreachable("Unexpected illegal condition!");
25897 }
25898 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25899 }
25900 case COMI_RM: { // Comparison intrinsics with Sae
25901 SDValue LHS = Op.getOperand(1);
25902 SDValue RHS = Op.getOperand(2);
25903 unsigned CondVal = Op.getConstantOperandVal(3);
25904 SDValue Sae = Op.getOperand(4);
25905
25906 SDValue FCmp;
25907 if (isRoundModeCurDirection(Sae))
25908 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
25909 DAG.getTargetConstant(CondVal, dl, MVT::i8));
25910 else if (isRoundModeSAE(Sae))
25911 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
25912 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
25913 else
25914 return SDValue();
25915 // Need to fill with zeros to ensure the bitcast will produce zeroes
25916 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25917 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25918 DAG.getConstant(0, dl, MVT::v16i1),
25919 FCmp, DAG.getIntPtrConstant(0, dl));
25920 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
25921 DAG.getBitcast(MVT::i16, Ins));
25922 }
25923 case VSHIFT: {
25924 SDValue SrcOp = Op.getOperand(1);
25925 SDValue ShAmt = Op.getOperand(2);
25926 assert(ShAmt.getValueType() == MVT::i32 &&
25927 "Unexpected VSHIFT amount type");
25928
25929 // Catch shift-by-constant.
25930 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
25931 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
25932 Op.getSimpleValueType(), SrcOp,
25933 CShAmt->getZExtValue(), DAG);
25934
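      // Variable shift amount: move the i32 amount into element 0 of a v4i32
      // so it can feed the XMM shift-amount operand of the PSLL/PSRL/PSRA
      // family via getTargetVShiftNode.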
25935 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25936 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
25937 SrcOp, ShAmt, 0, Subtarget, DAG);
25938 }
25939 case COMPRESS_EXPAND_IN_REG: {
25940 SDValue Mask = Op.getOperand(3);
25941 SDValue DataToCompress = Op.getOperand(1);
25942 SDValue PassThru = Op.getOperand(2);
25943 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
25944 return Op.getOperand(1);
25945
25946 // Avoid false dependency.
25947 if (PassThru.isUndef())
25948 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
25949
25950 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
25951 Mask);
25952 }
25953 case FIXUPIMM:
25954 case FIXUPIMM_MASKZ: {
25955 SDValue Src1 = Op.getOperand(1);
25956 SDValue Src2 = Op.getOperand(2);
25957 SDValue Src3 = Op.getOperand(3);
25958 SDValue Imm = Op.getOperand(4);
25959 SDValue Mask = Op.getOperand(5);
25960 SDValue Passthru = (IntrData->Type == FIXUPIMM)
25961 ? Src1
25962 : getZeroVector(VT, Subtarget, DAG, dl);
25963
25964 unsigned Opc = IntrData->Opc0;
25965 if (IntrData->Opc1 != 0) {
25966 SDValue Sae = Op.getOperand(6);
25967 if (isRoundModeSAE(Sae))
25968 Opc = IntrData->Opc1;
25969 else if (!isRoundModeCurDirection(Sae))
25970 return SDValue();
25971 }
25972
25973 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
25974
25975 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
25976 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25977
25978 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25979 }
25980 case ROUNDP: {
25981 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
25982 // Clear the upper bits of the rounding immediate so that the legacy
25983 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
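      // (Bits [7:4] of the VRNDSCALE immediate select a fixed-point scale,
      // i.e. rounding to a multiple of 2^-M; the legacy ROUND* immediates only
      // define bits [3:0], so the upper bits are dropped here.)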
25984 uint64_t Round = Op.getConstantOperandVal(2);
25985 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
25986 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25987 Op.getOperand(1), RoundingMode);
25988 }
25989 case ROUNDS: {
25990 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
25991 // Clear the upper bits of the rounding immediate so that the legacy
25992 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25993 uint64_t Round = Op.getConstantOperandVal(3);
25994 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
25995 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25996 Op.getOperand(1), Op.getOperand(2), RoundingMode);
25997 }
25998 case BEXTRI: {
25999 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26000
26001 uint64_t Imm = Op.getConstantOperandVal(2);
26002 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26003 Op.getValueType());
26004 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26005 Op.getOperand(1), Control);
26006 }
26007 // ADC/SBB
26008 case ADX: {
26009 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26010 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26011
26012 SDValue Res;
26013 // If the carry in is zero, then we should just use ADD/SUB instead of
26014 // ADC/SBB.
26015 if (isNullConstant(Op.getOperand(1))) {
26016 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26017 Op.getOperand(3));
26018 } else {
26019 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26020 DAG.getConstant(-1, dl, MVT::i8));
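      // The ADD with -1 above sets CF exactly when the incoming carry operand
      // is nonzero (x + 0xFF carries for any x != 0), rematerializing the
      // carry in EFLAGS for the ADC/SBB built below.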
26021 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26022 Op.getOperand(3), GenCF.getValue(1));
26023 }
26024 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26025 SDValue Results[] = { SetCC, Res };
26026 return DAG.getMergeValues(Results, dl);
26027 }
26028 case CVTPD2PS_MASK:
26029 case CVTPD2DQ_MASK:
26030 case CVTQQ2PS_MASK:
26031 case TRUNCATE_TO_REG: {
26032 SDValue Src = Op.getOperand(1);
26033 SDValue PassThru = Op.getOperand(2);
26034 SDValue Mask = Op.getOperand(3);
26035
26036 if (isAllOnesConstant(Mask))
26037 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26038
26039 MVT SrcVT = Src.getSimpleValueType();
26040 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26041 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26042 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26043 {Src, PassThru, Mask});
26044 }
26045 case CVTPS2PH_MASK: {
26046 SDValue Src = Op.getOperand(1);
26047 SDValue Rnd = Op.getOperand(2);
26048 SDValue PassThru = Op.getOperand(3);
26049 SDValue Mask = Op.getOperand(4);
26050
26051 unsigned RC = 0;
26052 unsigned Opc = IntrData->Opc0;
26053 bool SAE = Src.getValueType().is512BitVector() &&
26054 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
26055 if (SAE) {
26057 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
26058 }
26059
26060 if (isAllOnesConstant(Mask))
26061 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
26062
26063 if (SAE)
26065 else
26066 Opc = IntrData->Opc1;
26067 MVT SrcVT = Src.getSimpleValueType();
26068 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26069 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26070 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
26071 }
26072 case CVTNEPS2BF16_MASK: {
26073 SDValue Src = Op.getOperand(1);
26074 SDValue PassThru = Op.getOperand(2);
26075 SDValue Mask = Op.getOperand(3);
26076
26077 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26078 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26079
26080 // Break false dependency.
26081 if (PassThru.isUndef())
26082 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
26083
26084 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26085 Mask);
26086 }
26087 default:
26088 break;
26089 }
26090 }
26091
26092 switch (IntNo) {
26093 default: return SDValue(); // Don't custom lower most intrinsics.
26094
26095 // ptest and testp intrinsics. These intrinsics are designed to return an
26096 // integer value rather than map directly to a single instruction, so lower
26097 // them to the PTEST or TESTP pattern plus a SETCC for the result.
26098 case Intrinsic::x86_avx512_ktestc_b:
26099 case Intrinsic::x86_avx512_ktestc_w:
26100 case Intrinsic::x86_avx512_ktestc_d:
26101 case Intrinsic::x86_avx512_ktestc_q:
26102 case Intrinsic::x86_avx512_ktestz_b:
26103 case Intrinsic::x86_avx512_ktestz_w:
26104 case Intrinsic::x86_avx512_ktestz_d:
26105 case Intrinsic::x86_avx512_ktestz_q:
26106 case Intrinsic::x86_sse41_ptestz:
26107 case Intrinsic::x86_sse41_ptestc:
26108 case Intrinsic::x86_sse41_ptestnzc:
26109 case Intrinsic::x86_avx_ptestz_256:
26110 case Intrinsic::x86_avx_ptestc_256:
26111 case Intrinsic::x86_avx_ptestnzc_256:
26112 case Intrinsic::x86_avx_vtestz_ps:
26113 case Intrinsic::x86_avx_vtestc_ps:
26114 case Intrinsic::x86_avx_vtestnzc_ps:
26115 case Intrinsic::x86_avx_vtestz_pd:
26116 case Intrinsic::x86_avx_vtestc_pd:
26117 case Intrinsic::x86_avx_vtestnzc_pd:
26118 case Intrinsic::x86_avx_vtestz_ps_256:
26119 case Intrinsic::x86_avx_vtestc_ps_256:
26120 case Intrinsic::x86_avx_vtestnzc_ps_256:
26121 case Intrinsic::x86_avx_vtestz_pd_256:
26122 case Intrinsic::x86_avx_vtestc_pd_256:
26123 case Intrinsic::x86_avx_vtestnzc_pd_256: {
26124 unsigned TestOpc = X86ISD::PTEST;
26125 X86::CondCode X86CC;
26126 switch (IntNo) {
26127 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
26128 case Intrinsic::x86_avx512_ktestc_b:
26129 case Intrinsic::x86_avx512_ktestc_w:
26130 case Intrinsic::x86_avx512_ktestc_d:
26131 case Intrinsic::x86_avx512_ktestc_q:
26132 // CF = 1
26133 TestOpc = X86ISD::KTEST;
26134 X86CC = X86::COND_B;
26135 break;
26136 case Intrinsic::x86_avx512_ktestz_b:
26137 case Intrinsic::x86_avx512_ktestz_w:
26138 case Intrinsic::x86_avx512_ktestz_d:
26139 case Intrinsic::x86_avx512_ktestz_q:
26140 TestOpc = X86ISD::KTEST;
26141 X86CC = X86::COND_E;
26142 break;
26143 case Intrinsic::x86_avx_vtestz_ps:
26144 case Intrinsic::x86_avx_vtestz_pd:
26145 case Intrinsic::x86_avx_vtestz_ps_256:
26146 case Intrinsic::x86_avx_vtestz_pd_256:
26147 TestOpc = X86ISD::TESTP;
26148 [[fallthrough]];
26149 case Intrinsic::x86_sse41_ptestz:
26150 case Intrinsic::x86_avx_ptestz_256:
26151 // ZF = 1
26152 X86CC = X86::COND_E;
26153 break;
26154 case Intrinsic::x86_avx_vtestc_ps:
26155 case Intrinsic::x86_avx_vtestc_pd:
26156 case Intrinsic::x86_avx_vtestc_ps_256:
26157 case Intrinsic::x86_avx_vtestc_pd_256:
26158 TestOpc = X86ISD::TESTP;
26159 [[fallthrough]];
26160 case Intrinsic::x86_sse41_ptestc:
26161 case Intrinsic::x86_avx_ptestc_256:
26162 // CF = 1
26163 X86CC = X86::COND_B;
26164 break;
26165 case Intrinsic::x86_avx_vtestnzc_ps:
26166 case Intrinsic::x86_avx_vtestnzc_pd:
26167 case Intrinsic::x86_avx_vtestnzc_ps_256:
26168 case Intrinsic::x86_avx_vtestnzc_pd_256:
26169 TestOpc = X86ISD::TESTP;
26170 [[fallthrough]];
26171 case Intrinsic::x86_sse41_ptestnzc:
26172 case Intrinsic::x86_avx_ptestnzc_256:
26173 // ZF and CF = 0
26174 X86CC = X86::COND_A;
26175 break;
26176 }
26177
26178 SDValue LHS = Op.getOperand(1);
26179 SDValue RHS = Op.getOperand(2);
26180 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
26181 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
26182 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26183 }
26184
26185 case Intrinsic::x86_sse42_pcmpistria128:
26186 case Intrinsic::x86_sse42_pcmpestria128:
26187 case Intrinsic::x86_sse42_pcmpistric128:
26188 case Intrinsic::x86_sse42_pcmpestric128:
26189 case Intrinsic::x86_sse42_pcmpistrio128:
26190 case Intrinsic::x86_sse42_pcmpestrio128:
26191 case Intrinsic::x86_sse42_pcmpistris128:
26192 case Intrinsic::x86_sse42_pcmpestris128:
26193 case Intrinsic::x86_sse42_pcmpistriz128:
26194 case Intrinsic::x86_sse42_pcmpestriz128: {
26195 unsigned Opcode;
26196 X86::CondCode X86CC;
26197 switch (IntNo) {
26198 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26199 case Intrinsic::x86_sse42_pcmpistria128:
26200 Opcode = X86ISD::PCMPISTR;
26201 X86CC = X86::COND_A;
26202 break;
26203 case Intrinsic::x86_sse42_pcmpestria128:
26204 Opcode = X86ISD::PCMPESTR;
26205 X86CC = X86::COND_A;
26206 break;
26207 case Intrinsic::x86_sse42_pcmpistric128:
26208 Opcode = X86ISD::PCMPISTR;
26209 X86CC = X86::COND_B;
26210 break;
26211 case Intrinsic::x86_sse42_pcmpestric128:
26212 Opcode = X86ISD::PCMPESTR;
26213 X86CC = X86::COND_B;
26214 break;
26215 case Intrinsic::x86_sse42_pcmpistrio128:
26216 Opcode = X86ISD::PCMPISTR;
26217 X86CC = X86::COND_O;
26218 break;
26219 case Intrinsic::x86_sse42_pcmpestrio128:
26220 Opcode = X86ISD::PCMPESTR;
26221 X86CC = X86::COND_O;
26222 break;
26223 case Intrinsic::x86_sse42_pcmpistris128:
26224 Opcode = X86ISD::PCMPISTR;
26225 X86CC = X86::COND_S;
26226 break;
26227 case Intrinsic::x86_sse42_pcmpestris128:
26228 Opcode = X86ISD::PCMPESTR;
26229 X86CC = X86::COND_S;
26230 break;
26231 case Intrinsic::x86_sse42_pcmpistriz128:
26232 Opcode = X86ISD::PCMPISTR;
26233 X86CC = X86::COND_E;
26234 break;
26235 case Intrinsic::x86_sse42_pcmpestriz128:
26236 Opcode = X86ISD::PCMPESTR;
26237 X86CC = X86::COND_E;
26238 break;
26239 }
26241 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26242 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
26243 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
26244 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26245 }
26246
26247 case Intrinsic::x86_sse42_pcmpistri128:
26248 case Intrinsic::x86_sse42_pcmpestri128: {
26249 unsigned Opcode;
26250 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26251 Opcode = X86ISD::PCMPISTR;
26252 else
26253 Opcode = X86ISD::PCMPESTR;
26254
26256 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26257 return DAG.getNode(Opcode, dl, VTs, NewOps);
26258 }
26259
26260 case Intrinsic::x86_sse42_pcmpistrm128:
26261 case Intrinsic::x86_sse42_pcmpestrm128: {
26262 unsigned Opcode;
26263 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26264 Opcode = X86ISD::PCMPISTR;
26265 else
26266 Opcode = X86ISD::PCMPESTR;
26267
26269 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26270 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26271 }
26272
26273 case Intrinsic::eh_sjlj_lsda: {
26275 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26276 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26277 auto &Context = MF.getMMI().getContext();
26278 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26279 Twine(MF.getFunctionNumber()));
26280 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
26281 DAG.getMCSymbol(S, PtrVT));
26282 }
26283
26284 case Intrinsic::x86_seh_lsda: {
26285 // Compute the symbol for the LSDA. We know it'll get emitted later.
26287 SDValue Op1 = Op.getOperand(1);
26288 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26291
26292 // Generate a simple absolute symbol reference. This intrinsic is only
26293 // supported on 32-bit Windows, which isn't PIC.
26294 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26295 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26296 }
26297
26298 case Intrinsic::eh_recoverfp: {
26299 SDValue FnOp = Op.getOperand(1);
26300 SDValue IncomingFPOp = Op.getOperand(2);
26301 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26302 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26303 if (!Fn)
26305 "llvm.eh.recoverfp must take a function as the first argument");
26306 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26307 }
26308
26309 case Intrinsic::localaddress: {
26310 // Returns one of the stack, base, or frame pointer registers, depending on
26311 // which is used to reference local variables.
26313 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26314 unsigned Reg;
26315 if (RegInfo->hasBasePointer(MF))
26316 Reg = RegInfo->getBaseRegister();
26317 else { // Handles the SP or FP case.
26318 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26319 if (CantUseFP)
26320 Reg = RegInfo->getPtrSizedStackRegister(MF);
26321 else
26322 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26323 }
26324 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26325 }
26326 case Intrinsic::x86_avx512_vp2intersect_q_512:
26327 case Intrinsic::x86_avx512_vp2intersect_q_256:
26328 case Intrinsic::x86_avx512_vp2intersect_q_128:
26329 case Intrinsic::x86_avx512_vp2intersect_d_512:
26330 case Intrinsic::x86_avx512_vp2intersect_d_256:
26331 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26332 MVT MaskVT = Op.getSimpleValueType();
26333
26334 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26335 SDLoc DL(Op);
26336
26339 Op->getOperand(1), Op->getOperand(2));
26340
26341 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26342 MaskVT, Operation);
26343 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26344 MaskVT, Operation);
26345 return DAG.getMergeValues({Result0, Result1}, DL);
26346 }
26347 case Intrinsic::x86_mmx_pslli_w:
26348 case Intrinsic::x86_mmx_pslli_d:
26349 case Intrinsic::x86_mmx_pslli_q:
26350 case Intrinsic::x86_mmx_psrli_w:
26351 case Intrinsic::x86_mmx_psrli_d:
26352 case Intrinsic::x86_mmx_psrli_q:
26353 case Intrinsic::x86_mmx_psrai_w:
26354 case Intrinsic::x86_mmx_psrai_d: {
26355 SDLoc DL(Op);
26356 SDValue ShAmt = Op.getOperand(2);
26357 // If the argument is a constant, convert it to a target constant.
26358 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26359 // Clamp out-of-bounds shift amounts since they would otherwise be masked
26360 // to 8 bits, which could bring them back into bounds.
26361 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26362 if (ShiftAmount == 0)
26363 return Op.getOperand(1);
26364
26365 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26366 Op.getOperand(0), Op.getOperand(1),
26367 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26368 }
26369
26370 unsigned NewIntrinsic;
26371 switch (IntNo) {
26372 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26373 case Intrinsic::x86_mmx_pslli_w:
26374 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26375 break;
26376 case Intrinsic::x86_mmx_pslli_d:
26377 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26378 break;
26379 case Intrinsic::x86_mmx_pslli_q:
26380 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26381 break;
26382 case Intrinsic::x86_mmx_psrli_w:
26383 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26384 break;
26385 case Intrinsic::x86_mmx_psrli_d:
26386 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26387 break;
26388 case Intrinsic::x86_mmx_psrli_q:
26389 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26390 break;
26391 case Intrinsic::x86_mmx_psrai_w:
26392 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26393 break;
26394 case Intrinsic::x86_mmx_psrai_d:
26395 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26396 break;
26397 }
26398
26399 // The vector shift intrinsics with scalar amounts use 32-bit shift amounts,
26400 // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
26401 // MMX register.
26402 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26403 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26404 DAG.getTargetConstant(NewIntrinsic, DL,
26406 Op.getOperand(1), ShAmt);
26407 }
26408 case Intrinsic::thread_pointer: {
26409 if (Subtarget.isTargetELF()) {
26410 SDLoc dl(Op);
26411 EVT PtrVT = getPointerTy(DAG.getDataLayout());
26412 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
26414 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
26415 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26417 }
26419 "Target OS doesn't support __builtin_thread_pointer() yet.");
26420 }
26421 }
26422}
26423
26425 SDValue Src, SDValue Mask, SDValue Base,
26426 SDValue Index, SDValue ScaleOp, SDValue Chain,
26427 const X86Subtarget &Subtarget) {
26428 SDLoc dl(Op);
26429 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26430 // Scale must be constant.
26431 if (!C)
26432 return SDValue();
26433 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26434 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26435 TLI.getPointerTy(DAG.getDataLayout()));
26436 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26437 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26438 // If source is undef or we know it won't be used, use a zero vector
26439 // to break register dependency.
26440 // TODO: use undef instead and let BreakFalseDeps deal with it?
26441 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26442 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26443
26444 // Cast mask to an integer type.
26445 Mask = DAG.getBitcast(MaskVT, Mask);
26446
26447 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26448
26449 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26450 SDValue Res =
26451 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26452 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26453 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26454}
26455
26457 SDValue Src, SDValue Mask, SDValue Base,
26458 SDValue Index, SDValue ScaleOp, SDValue Chain,
26459 const X86Subtarget &Subtarget) {
26460 MVT VT = Op.getSimpleValueType();
26461 SDLoc dl(Op);
26462 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26463 // Scale must be constant.
26464 if (!C)
26465 return SDValue();
26466 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26467 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26468 TLI.getPointerTy(DAG.getDataLayout()));
26469 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26471 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
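  // Only the lanes actually addressed by the index are gathered; for instance
  // a v4f32 result with a v2i64 index loads just two elements, so the mask
  // only needs two bits.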
26472
26473 // We support two versions of the gather intrinsics. One with scalar mask and
26474 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26475 if (Mask.getValueType() != MaskVT)
26476 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26477
26478 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26479 // If source is undef or we know it won't be used, use a zero vector
26480 // to break register dependency.
26481 // TODO: use undef instead and let BreakFalseDeps deal with it?
26482 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26483 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26484
26485 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26486
26487 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26488 SDValue Res =
26489 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26490 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26491 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26492}
26493
26494static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26495 SDValue Src, SDValue Mask, SDValue Base,
26496 SDValue Index, SDValue ScaleOp, SDValue Chain,
26497 const X86Subtarget &Subtarget) {
26498 SDLoc dl(Op);
26499 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26500 // Scale must be constant.
26501 if (!C)
26502 return SDValue();
26503 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26504 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26505 TLI.getPointerTy(DAG.getDataLayout()));
26506 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26507 Src.getSimpleValueType().getVectorNumElements());
26508 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26509
26510 // We support two versions of the scatter intrinsics. One with scalar mask and
26511 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26512 if (Mask.getValueType() != MaskVT)
26513 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26514
26515 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26516
26517 SDVTList VTs = DAG.getVTList(MVT::Other);
26518 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26519 SDValue Res =
26520 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26521 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26522 return Res;
26523}
26524
26525static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26527 SDValue ScaleOp, SDValue Chain,
26528 const X86Subtarget &Subtarget) {
26529 SDLoc dl(Op);
26530 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26531 // Scale must be constant.
26532 if (!C)
26533 return SDValue();
26534 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26535 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26536 TLI.getPointerTy(DAG.getDataLayout()));
26537 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26538 SDValue Segment = DAG.getRegister(0, MVT::i32);
26539 MVT MaskVT =
26540 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26541 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
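  // This builds the machine node directly, so the operands follow the
  // prefetch instruction's layout: the k-mask, then the usual
  // base/scale/index/disp/segment memory operands, then the chain.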
26542 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26543 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26544 return SDValue(Res, 0);
26545}
26546
26547/// Handles the lowering of builtin intrinsics with chain that return their
26548/// value into registers EDX:EAX.
26549 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
26550/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26551/// TargetOpcode.
26552/// Returns a Glue value which can be used to add extra copy-from-reg if the
26553 /// expanded intrinsic implicitly defines extra registers (i.e. not just
26554/// EDX:EAX).
26556 SelectionDAG &DAG,
26557 unsigned TargetOpcode,
26558 unsigned SrcReg,
26559 const X86Subtarget &Subtarget,
26561 SDValue Chain = N->getOperand(0);
26562 SDValue Glue;
26563
26564 if (SrcReg) {
26565 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26566 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26567 Glue = Chain.getValue(1);
26568 }
26569
26570 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26571 SDValue N1Ops[] = {Chain, Glue};
26572 SDNode *N1 = DAG.getMachineNode(
26573 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
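  // The machine node's results are (chain, glue); SDValue(N1, 1) is the glue
  // that keeps the register reads below attached directly to TargetOpcode.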
26574 Chain = SDValue(N1, 0);
26575
26576 // Copy the result out of EDX:EAX (RDX:RAX on 64-bit targets).
26577 SDValue LO, HI;
26578 if (Subtarget.is64Bit()) {
26579 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26580 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26581 LO.getValue(2));
26582 } else {
26583 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26584 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26585 LO.getValue(2));
26586 }
26587 Chain = HI.getValue(1);
26588 Glue = HI.getValue(2);
26589
26590 if (Subtarget.is64Bit()) {
26591 // Merge the two 32-bit values into a 64-bit one.
26592 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26593 DAG.getConstant(32, DL, MVT::i8));
26594 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26595 Results.push_back(Chain);
26596 return Glue;
26597 }
26598
26599 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26600 SDValue Ops[] = { LO, HI };
26601 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26602 Results.push_back(Pair);
26603 Results.push_back(Chain);
26604 return Glue;
26605}
26606
26607/// Handles the lowering of builtin intrinsics that read the time stamp counter
26608/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26609/// READCYCLECOUNTER nodes.
26610static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26611 SelectionDAG &DAG,
26612 const X86Subtarget &Subtarget,
26614 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26615 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26616 // and the EAX register is loaded with the low-order 32 bits.
26617 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26618 /* NoRegister */0, Subtarget,
26619 Results);
26620 if (Opcode != X86::RDTSCP)
26621 return;
26622
26623 SDValue Chain = Results[1];
26624 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
26625 // the ECX register. Add 'ecx' explicitly to the chain.
26626 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26627 Results[1] = ecx;
26628 Results.push_back(ecx.getValue(1));
26629}
26630
26632 SelectionDAG &DAG) {
26634 SDLoc DL(Op);
26635 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26636 Results);
26637 return DAG.getMergeValues(Results, DL);
26638}
26639
26642 SDValue Chain = Op.getOperand(0);
26643 SDValue RegNode = Op.getOperand(2);
26644 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26645 if (!EHInfo)
26646 report_fatal_error("EH registrations only live in functions using WinEH");
26647
26648 // Cast the operand to an alloca, and remember the frame index.
26649 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26650 if (!FINode)
26651 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26652 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26653
26654 // Return the chain operand without making any DAG nodes.
26655 return Chain;
26656}
26657
26660 SDValue Chain = Op.getOperand(0);
26661 SDValue EHGuard = Op.getOperand(2);
26662 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26663 if (!EHInfo)
26664 report_fatal_error("EHGuard only lives in functions using WinEH");
26665
26666 // Cast the operand to an alloca, and remember the frame index.
26667 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26668 if (!FINode)
26669 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26670 EHInfo->EHGuardFrameIndex = FINode->getIndex();
26671
26672 // Return the chain operand without making any DAG nodes.
26673 return Chain;
26674}
26675
26676/// Emit Truncating Store with signed or unsigned saturation.
26677static SDValue
26678EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
26679 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26680 SelectionDAG &DAG) {
26681 SDVTList VTs = DAG.getVTList(MVT::Other);
26682 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26683 SDValue Ops[] = { Chain, Val, Ptr, Undef };
26684 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26685 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26686}
26687
26688/// Emit Masked Truncating Store with signed or unsigned saturation.
26689static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
26690 const SDLoc &DL,
26691 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26692 MachineMemOperand *MMO, SelectionDAG &DAG) {
26693 SDVTList VTs = DAG.getVTList(MVT::Other);
26694 SDValue Ops[] = { Chain, Val, Ptr, Mask };
26695 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26696 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26697}
26698
26700 const MachineFunction &MF) {
26701 if (!Subtarget.is64Bit())
26702 return false;
26703 // 64-bit targets support extended Swift async frame setup,
26704 // except for targets that use the Windows 64 prologue.
26705 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
26706}
26707
26709 SelectionDAG &DAG) {
26710 unsigned IntNo = Op.getConstantOperandVal(1);
26711 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26712 if (!IntrData) {
26713 switch (IntNo) {
26714
26715 case Intrinsic::swift_async_context_addr: {
26716 SDLoc dl(Op);
26717 auto &MF = DAG.getMachineFunction();
26718 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26719 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
26721 X86FI->setHasSwiftAsyncContext(true);
26722 SDValue Chain = Op->getOperand(0);
26723 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
26724 SDValue Result =
26725 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
26726 DAG.getTargetConstant(8, dl, MVT::i32)),
26727 0);
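        // In the extended frame layout, the async context slot is the 8 bytes
        // just below where RBP points, so its address is simply RBP - 8.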
26728 // Return { result, chain }.
26729 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26730 CopyRBP.getValue(1));
26731 } else {
26732 // No special extended frame, create or reuse an existing stack slot.
26733 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
26734 if (!X86FI->getSwiftAsyncContextFrameIdx())
26735 X86FI->setSwiftAsyncContextFrameIdx(
26736 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
26737 false));
26738 SDValue Result =
26739 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
26740 PtrSize == 8 ? MVT::i64 : MVT::i32);
26741 // Return { result, chain }.
26742 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26743 Op->getOperand(0));
26744 }
26745 }
26746
26747 case llvm::Intrinsic::x86_seh_ehregnode:
26748 return MarkEHRegistrationNode(Op, DAG);
26749 case llvm::Intrinsic::x86_seh_ehguard:
26750 return MarkEHGuard(Op, DAG);
26751 case llvm::Intrinsic::x86_rdpkru: {
26752 SDLoc dl(Op);
26753 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26754 // Create a RDPKRU node and pass 0 to the ECX parameter.
26755 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26756 DAG.getConstant(0, dl, MVT::i32));
26757 }
26758 case llvm::Intrinsic::x86_wrpkru: {
26759 SDLoc dl(Op);
26760 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
26761 // to the EDX and ECX parameters.
26762 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26763 Op.getOperand(0), Op.getOperand(2),
26764 DAG.getConstant(0, dl, MVT::i32),
26765 DAG.getConstant(0, dl, MVT::i32));
26766 }
26767 case llvm::Intrinsic::asan_check_memaccess: {
26768 // Mark this as adjustsStack because it will be lowered to a call.
26770 // Don't do anything here; we will expand these intrinsics out later.
26771 return Op;
26772 }
26773 case llvm::Intrinsic::x86_flags_read_u32:
26774 case llvm::Intrinsic::x86_flags_read_u64:
26775 case llvm::Intrinsic::x86_flags_write_u32:
26776 case llvm::Intrinsic::x86_flags_write_u64: {
26777 // We need a frame pointer because this will get lowered to a PUSH/POP
26778 // sequence.
26781 // Don't do anything here; we will expand these intrinsics out later
26782 // during FinalizeISel in EmitInstrWithCustomInserter.
26783 return Op;
26784 }
26785 case Intrinsic::x86_lwpins32:
26786 case Intrinsic::x86_lwpins64:
26787 case Intrinsic::x86_umwait:
26788 case Intrinsic::x86_tpause: {
26789 SDLoc dl(Op);
26790 SDValue Chain = Op->getOperand(0);
26791 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26792 unsigned Opcode;
26793
26794 switch (IntNo) {
26795 default: llvm_unreachable("Impossible intrinsic");
26796 case Intrinsic::x86_umwait:
26797 Opcode = X86ISD::UMWAIT;
26798 break;
26799 case Intrinsic::x86_tpause:
26800 Opcode = X86ISD::TPAUSE;
26801 break;
26802 case Intrinsic::x86_lwpins32:
26803 case Intrinsic::x86_lwpins64:
26804 Opcode = X86ISD::LWPINS;
26805 break;
26806 }
26807
26809 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26810 Op->getOperand(3), Op->getOperand(4));
26811 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26812 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26813 Operation.getValue(1));
26814 }
26815 case Intrinsic::x86_enqcmd:
26816 case Intrinsic::x86_enqcmds: {
26817 SDLoc dl(Op);
26818 SDValue Chain = Op.getOperand(0);
26819 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26820 unsigned Opcode;
26821 switch (IntNo) {
26822 default: llvm_unreachable("Impossible intrinsic!");
26823 case Intrinsic::x86_enqcmd:
26824 Opcode = X86ISD::ENQCMD;
26825 break;
26826 case Intrinsic::x86_enqcmds:
26827 Opcode = X86ISD::ENQCMDS;
26828 break;
26829 }
26830 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26831 Op.getOperand(3));
26832 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26833 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26834 Operation.getValue(1));
26835 }
26836 case Intrinsic::x86_aesenc128kl:
26837 case Intrinsic::x86_aesdec128kl:
26838 case Intrinsic::x86_aesenc256kl:
26839 case Intrinsic::x86_aesdec256kl: {
26840 SDLoc DL(Op);
26841 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26842 SDValue Chain = Op.getOperand(0);
26843 unsigned Opcode;
26844
26845 switch (IntNo) {
26846 default: llvm_unreachable("Impossible intrinsic");
26847 case Intrinsic::x86_aesenc128kl:
26848 Opcode = X86ISD::AESENC128KL;
26849 break;
26850 case Intrinsic::x86_aesdec128kl:
26851 Opcode = X86ISD::AESDEC128KL;
26852 break;
26853 case Intrinsic::x86_aesenc256kl:
26854 Opcode = X86ISD::AESENC256KL;
26855 break;
26856 case Intrinsic::x86_aesdec256kl:
26857 Opcode = X86ISD::AESDEC256KL;
26858 break;
26859 }
26860
26861 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26862 MachineMemOperand *MMO = MemIntr->getMemOperand();
26863 EVT MemVT = MemIntr->getMemoryVT();
26865 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26866 MMO);
26867 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26868
26869 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26870 {ZF, Operation.getValue(0), Operation.getValue(2)});
26871 }
26872 case Intrinsic::x86_aesencwide128kl:
26873 case Intrinsic::x86_aesdecwide128kl:
26874 case Intrinsic::x86_aesencwide256kl:
26875 case Intrinsic::x86_aesdecwide256kl: {
26876 SDLoc DL(Op);
26877 SDVTList VTs = DAG.getVTList(
26878 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26879 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26880 SDValue Chain = Op.getOperand(0);
26881 unsigned Opcode;
26882
26883 switch (IntNo) {
26884 default: llvm_unreachable("Impossible intrinsic");
26885 case Intrinsic::x86_aesencwide128kl:
26886 Opcode = X86ISD::AESENCWIDE128KL;
26887 break;
26888 case Intrinsic::x86_aesdecwide128kl:
26889 Opcode = X86ISD::AESDECWIDE128KL;
26890 break;
26891 case Intrinsic::x86_aesencwide256kl:
26892 Opcode = X86ISD::AESENCWIDE256KL;
26893 break;
26894 case Intrinsic::x86_aesdecwide256kl:
26895 Opcode = X86ISD::AESDECWIDE256KL;
26896 break;
26897 }
26898
26899 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26900 MachineMemOperand *MMO = MemIntr->getMemOperand();
26901 EVT MemVT = MemIntr->getMemoryVT();
26903 Opcode, DL, VTs,
26904 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
26905 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
26906 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
26907 MemVT, MMO);
26908 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
26909
26910 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26911 {ZF, Operation.getValue(1), Operation.getValue(2),
26912 Operation.getValue(3), Operation.getValue(4),
26913 Operation.getValue(5), Operation.getValue(6),
26914 Operation.getValue(7), Operation.getValue(8),
26915 Operation.getValue(9)});
26916 }
26917 case Intrinsic::x86_testui: {
26918 SDLoc dl(Op);
26919 SDValue Chain = Op.getOperand(0);
26920 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26921 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
26922 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26923 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26924 Operation.getValue(1));
26925 }
26926 case Intrinsic::x86_atomic_bts_rm:
26927 case Intrinsic::x86_atomic_btc_rm:
26928 case Intrinsic::x86_atomic_btr_rm: {
26929 SDLoc DL(Op);
26930 MVT VT = Op.getSimpleValueType();
26931 SDValue Chain = Op.getOperand(0);
26932 SDValue Op1 = Op.getOperand(2);
26933 SDValue Op2 = Op.getOperand(3);
26934 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
26935 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
26937 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26938 SDValue Res =
26939 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
26940 {Chain, Op1, Op2}, VT, MMO);
26941 Chain = Res.getValue(1);
26942 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
26943 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
26944 }
26945 case Intrinsic::x86_atomic_bts:
26946 case Intrinsic::x86_atomic_btc:
26947 case Intrinsic::x86_atomic_btr: {
26948 SDLoc DL(Op);
26949 MVT VT = Op.getSimpleValueType();
26950 SDValue Chain = Op.getOperand(0);
26951 SDValue Op1 = Op.getOperand(2);
26952 SDValue Op2 = Op.getOperand(3);
26953 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
26954 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
26955 : X86ISD::LBTR;
26956 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
26957 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26958 SDValue Res =
26959 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
26960 {Chain, Op1, Op2, Size}, VT, MMO);
26961 Chain = Res.getValue(1);
26962 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
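      // The SETB above extracts the tested bit (via CF) into bit 0; shift it
      // back to bit 'Imm' so the result has the bit in its original position.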
26963 unsigned Imm = Op2->getAsZExtVal();
26964 if (Imm)
26965 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
26966 DAG.getShiftAmountConstant(Imm, VT, DL));
26967 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
26968 }
26969 case Intrinsic::x86_cmpccxadd32:
26970 case Intrinsic::x86_cmpccxadd64: {
26971 SDLoc DL(Op);
26972 SDValue Chain = Op.getOperand(0);
26973 SDValue Addr = Op.getOperand(2);
26974 SDValue Src1 = Op.getOperand(3);
26975 SDValue Src2 = Op.getOperand(4);
26976 SDValue CC = Op.getOperand(5);
26977 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26979 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
26980 MVT::i32, MMO);
26981 return Operation;
26982 }
26983 case Intrinsic::x86_aadd32:
26984 case Intrinsic::x86_aadd64:
26985 case Intrinsic::x86_aand32:
26986 case Intrinsic::x86_aand64:
26987 case Intrinsic::x86_aor32:
26988 case Intrinsic::x86_aor64:
26989 case Intrinsic::x86_axor32:
26990 case Intrinsic::x86_axor64: {
26991 SDLoc DL(Op);
26992 SDValue Chain = Op.getOperand(0);
26993 SDValue Op1 = Op.getOperand(2);
26994 SDValue Op2 = Op.getOperand(3);
26995 MVT VT = Op2.getSimpleValueType();
26996 unsigned Opc = 0;
26997 switch (IntNo) {
26998 default:
26999 llvm_unreachable("Unknown Intrinsic");
27000 case Intrinsic::x86_aadd32:
27001 case Intrinsic::x86_aadd64:
27002 Opc = X86ISD::AADD;
27003 break;
27004 case Intrinsic::x86_aand32:
27005 case Intrinsic::x86_aand64:
27006 Opc = X86ISD::AAND;
27007 break;
27008 case Intrinsic::x86_aor32:
27009 case Intrinsic::x86_aor64:
27010 Opc = X86ISD::AOR;
27011 break;
27012 case Intrinsic::x86_axor32:
27013 case Intrinsic::x86_axor64:
27014 Opc = X86ISD::AXOR;
27015 break;
27016 }
27017 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
27018 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
27019 {Chain, Op1, Op2}, VT, MMO);
27020 }
27021 case Intrinsic::x86_atomic_add_cc:
27022 case Intrinsic::x86_atomic_sub_cc:
27023 case Intrinsic::x86_atomic_or_cc:
27024 case Intrinsic::x86_atomic_and_cc:
27025 case Intrinsic::x86_atomic_xor_cc: {
27026 SDLoc DL(Op);
27027 SDValue Chain = Op.getOperand(0);
27028 SDValue Op1 = Op.getOperand(2);
27029 SDValue Op2 = Op.getOperand(3);
27030 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
27031 MVT VT = Op2.getSimpleValueType();
27032 unsigned Opc = 0;
27033 switch (IntNo) {
27034 default:
27035 llvm_unreachable("Unknown Intrinsic");
27036 case Intrinsic::x86_atomic_add_cc:
27037 Opc = X86ISD::LADD;
27038 break;
27039 case Intrinsic::x86_atomic_sub_cc:
27040 Opc = X86ISD::LSUB;
27041 break;
27042 case Intrinsic::x86_atomic_or_cc:
27043 Opc = X86ISD::LOR;
27044 break;
27045 case Intrinsic::x86_atomic_and_cc:
27046 Opc = X86ISD::LAND;
27047 break;
27048 case Intrinsic::x86_atomic_xor_cc:
27049 Opc = X86ISD::LXOR;
27050 break;
27051 }
27052 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27053 SDValue LockArith =
27054 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27055 {Chain, Op1, Op2}, VT, MMO);
27056 Chain = LockArith.getValue(1);
27057 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
27058 }
27059 }
27060 return SDValue();
27061 }
27062
27063 SDLoc dl(Op);
27064 switch(IntrData->Type) {
27065 default: llvm_unreachable("Unknown Intrinsic Type");
27066 case RDSEED:
27067 case RDRAND: {
27068 // Emit the node with the right value type.
27069 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27070 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27071
27072 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
27074 // Otherwise return the value from Rand, which is always 0, cast to i32.
27074 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27075 DAG.getConstant(1, dl, Op->getValueType(1)),
27076 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
27077 SDValue(Result.getNode(), 1)};
27078 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27079
27080 // Return { result, isValid, chain }.
27081 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27082 SDValue(Result.getNode(), 2));
27083 }
27084 case GATHER_AVX2: {
27085 SDValue Chain = Op.getOperand(0);
27086 SDValue Src = Op.getOperand(2);
27087 SDValue Base = Op.getOperand(3);
27088 SDValue Index = Op.getOperand(4);
27089 SDValue Mask = Op.getOperand(5);
27090 SDValue Scale = Op.getOperand(6);
27091 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27092 Scale, Chain, Subtarget);
27093 }
27094 case GATHER: {
27096 // gather(v1, mask, index, base, scale);
27096 SDValue Chain = Op.getOperand(0);
27097 SDValue Src = Op.getOperand(2);
27098 SDValue Base = Op.getOperand(3);
27099 SDValue Index = Op.getOperand(4);
27100 SDValue Mask = Op.getOperand(5);
27101 SDValue Scale = Op.getOperand(6);
27102 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
27103 Chain, Subtarget);
27104 }
27105 case SCATTER: {
27107 // scatter(base, mask, index, v1, scale);
27107 SDValue Chain = Op.getOperand(0);
27108 SDValue Base = Op.getOperand(2);
27109 SDValue Mask = Op.getOperand(3);
27110 SDValue Index = Op.getOperand(4);
27111 SDValue Src = Op.getOperand(5);
27112 SDValue Scale = Op.getOperand(6);
27113 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27114 Scale, Chain, Subtarget);
27115 }
27116 case PREFETCH: {
27117 const APInt &HintVal = Op.getConstantOperandAPInt(6);
27118 assert((HintVal == 2 || HintVal == 3) &&
27119 "Wrong prefetch hint in intrinsic: should be 2 or 3");
27120 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27121 SDValue Chain = Op.getOperand(0);
27122 SDValue Mask = Op.getOperand(2);
27123 SDValue Index = Op.getOperand(3);
27124 SDValue Base = Op.getOperand(4);
27125 SDValue Scale = Op.getOperand(5);
27126 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
27127 Subtarget);
27128 }
27129 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
27130 case RDTSC: {
27132 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27133 Results);
27134 return DAG.getMergeValues(Results, dl);
27135 }
27136 // Read Performance Monitoring Counters.
27137 case RDPMC:
27138 // Read Processor Register.
27139 case RDPRU:
27140 // Get Extended Control Register.
27141 case XGETBV: {
27143
27144 // RDPMC uses ECX to select the index of the performance counter to read.
27145 // RDPRU uses ECX to select the processor register to read.
27146 // XGETBV uses ECX to select the index of the XCR register to return.
27147 // The result is stored into registers EDX:EAX.
27148 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27149 Subtarget, Results);
27150 return DAG.getMergeValues(Results, dl);
27151 }
27152 // XTEST intrinsics.
27153 case XTEST: {
27154 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27155 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27156
27157 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
27158 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27159 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27160 Ret, SDValue(InTrans.getNode(), 1));
27161 }
27164 case TRUNCATE_TO_MEM_VI32: {
27165 SDValue Mask = Op.getOperand(4);
27166 SDValue DataToTruncate = Op.getOperand(3);
27167 SDValue Addr = Op.getOperand(2);
27168 SDValue Chain = Op.getOperand(0);
27169
27170 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
27171 assert(MemIntr && "Expected MemIntrinsicSDNode!");
27172
27173 EVT MemVT = MemIntr->getMemoryVT();
27174
27175 uint16_t TruncationOp = IntrData->Opc0;
27176 switch (TruncationOp) {
27177 case X86ISD::VTRUNC: {
27178 if (isAllOnesConstant(Mask)) // return just a truncate store
27179 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
27180 MemIntr->getMemOperand());
27181
27182 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27183 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27184 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
27185
27186 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
27187 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27188 true /* truncating */);
27189 }
27190 case X86ISD::VTRUNCUS:
27191 case X86ISD::VTRUNCS: {
27192 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
27193 if (isAllOnesConstant(Mask))
27194 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
27195 MemIntr->getMemOperand(), DAG);
27196
27197 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27198 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27199
27200 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
27201 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27202 }
27203 default:
27204 llvm_unreachable("Unsupported truncstore intrinsic");
27205 }
27206 }
27207 }
27208}
27209
27210SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
27211 SelectionDAG &DAG) const {
27213 MFI.setReturnAddressIsTaken(true);
27214
27216 return SDValue();
27217
27218 unsigned Depth = Op.getConstantOperandVal(0);
27219 SDLoc dl(Op);
27220 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27221
27222 if (Depth > 0) {
27223 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
27224 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27225 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
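    // For Depth > 0, the return address lives one slot above the
    // corresponding frame pointer, so load from FrameAddr + SlotSize.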
27226 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27227 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
27229 }
27230
27231 // Just load the return address.
27232 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
27233 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
27235}
27236
27237SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
27238 SelectionDAG &DAG) const {
27240 return getReturnAddressFrameIndex(DAG);
27241}
27242
27243SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
27245 MachineFrameInfo &MFI = MF.getFrameInfo();
27247 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27248 EVT VT = Op.getValueType();
27249
27250 MFI.setFrameAddressIsTaken(true);
27251
27252 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27253 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
27254 // is not possible to crawl up the stack without looking at the unwind codes
27255 // simultaneously.
27256 int FrameAddrIndex = FuncInfo->getFAIndex();
27257 if (!FrameAddrIndex) {
27258 // Set up a frame object for the return address.
27259 unsigned SlotSize = RegInfo->getSlotSize();
27260 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
27261 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
27262 FuncInfo->setFAIndex(FrameAddrIndex);
27263 }
27264 return DAG.getFrameIndex(FrameAddrIndex, VT);
27265 }
27266
27267 unsigned FrameReg =
27268 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27269 SDLoc dl(Op); // FIXME probably not meaningful
27270 unsigned Depth = Op.getConstantOperandVal(0);
27271 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
27272 (FrameReg == X86::EBP && VT == MVT::i32)) &&
27273 "Invalid Frame Register!");
27274 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
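  // Each frame stores its caller's frame pointer at offset 0, so walking
  // 'Depth' frames is just 'Depth' dependent loads through EBP/RBP.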
27275 while (Depth--)
27276 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27278 return FrameAddr;
27279}
27280
27281// FIXME? Maybe this could be a TableGen attribute on some registers and
27282// this table could be generated automatically from RegInfo.
27284 const MachineFunction &MF) const {
27285 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27286
27288 .Case("esp", X86::ESP)
27289 .Case("rsp", X86::RSP)
27290 .Case("ebp", X86::EBP)
27291 .Case("rbp", X86::RBP)
27292 .Case("r14", X86::R14)
27293 .Case("r15", X86::R15)
27294 .Default(0);
27295
27296 if (Reg == X86::EBP || Reg == X86::RBP) {
27297 if (!TFI.hasFP(MF))
27298 report_fatal_error("register " + StringRef(RegName) +
27299 " is allocatable: function has no frame pointer");
27300#ifndef NDEBUG
27301 else {
27302 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27303 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27304 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27305 "Invalid Frame Register!");
27306 }
27307#endif
27308 }
27309
27310 if (Reg)
27311 return Reg;
27312
27313 report_fatal_error("Invalid register name global variable");
27314}
27315
27316SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27317 SelectionDAG &DAG) const {
27318 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27319 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27320}
27321
27323 const Constant *PersonalityFn) const {
27324 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27325 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27326
27327 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27328}
27329
27331 const Constant *PersonalityFn) const {
27332 // Funclet personalities don't use selectors (the runtime does the selection).
27334 return X86::NoRegister;
27335 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27336}
27337
27339 return Subtarget.isTargetWin64();
27340}
27341
27342SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
27343 SDValue Chain = Op.getOperand(0);
27344 SDValue Offset = Op.getOperand(1);
27345 SDValue Handler = Op.getOperand(2);
27346 SDLoc dl (Op);
27347
27348 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27349 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27350 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
27351 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
27352 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
27353 "Invalid Frame Register!");
27354 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
27355 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
27356
27357 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
27358 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
27359 dl));
27360 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
27361 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
27362 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
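  // The handler is written one slot above the saved frame pointer (adjusted
  // by Offset) and its address is handed to EH_RETURN in ECX/RCX, so the
  // epilogue can 'return' into the handler.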
27363
27364 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
27365 DAG.getRegister(StoreAddrReg, PtrVT));
27366}
27367
27368SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
27369 SelectionDAG &DAG) const {
27370 SDLoc DL(Op);
27371 // If the subtarget is not 64-bit, we may need the global base register
27372 // after the pseudo is expanded post-isel, i.e., after the CGBR pass has run.
27373 // Therefore, ask for the GlobalBaseReg now, so that the pass
27374 // inserts the code for us in case we need it.
27375 // Otherwise, we would end up referencing a virtual register
27376 // that is never defined!
27377 if (!Subtarget.is64Bit()) {
27378 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27379 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
27380 }
27381 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
27382 DAG.getVTList(MVT::i32, MVT::Other),
27383 Op.getOperand(0), Op.getOperand(1));
27384}
27385
27386SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
27387 SelectionDAG &DAG) const {
27388 SDLoc DL(Op);
27389 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
27390 Op.getOperand(0), Op.getOperand(1));
27391}
27392
27393SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
27394 SelectionDAG &DAG) const {
27395 SDLoc DL(Op);
27396 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
27397 Op.getOperand(0));
27398}
27399
27401 return Op.getOperand(0);
27402}
27403
27404SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
27405 SelectionDAG &DAG) const {
27406 SDValue Root = Op.getOperand(0);
27407 SDValue Trmp = Op.getOperand(1); // trampoline
27408 SDValue FPtr = Op.getOperand(2); // nested function
27409 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
27410 SDLoc dl (Op);
27411
27412 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27413 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27414
27415 if (Subtarget.is64Bit()) {
27416 SDValue OutChains[6];
27417
27418 // Large code-model.
27419 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
27420 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
27421
27422 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
27423 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
27424
27425 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
27426
27427 // Load the pointer to the nested function into R11.
27428 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
27429 SDValue Addr = Trmp;
27430 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27431 Addr, MachinePointerInfo(TrmpAddr));
27432
27433 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27434 DAG.getConstant(2, dl, MVT::i64));
27435 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27436 MachinePointerInfo(TrmpAddr, 2), Align(2));
27437
27438 // Load the 'nest' parameter value into R10.
27439 // R10 is specified in X86CallingConv.td
27440 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27441 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27442 DAG.getConstant(10, dl, MVT::i64));
27443 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27444 Addr, MachinePointerInfo(TrmpAddr, 10));
27445
27446 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27447 DAG.getConstant(12, dl, MVT::i64));
27448 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27449 MachinePointerInfo(TrmpAddr, 12), Align(2));
27450
27451 // Jump to the nested function.
27452 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27453 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27454 DAG.getConstant(20, dl, MVT::i64));
27455 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27456 Addr, MachinePointerInfo(TrmpAddr, 20));
27457
27458 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27459 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27460 DAG.getConstant(22, dl, MVT::i64));
27461 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27462 Addr, MachinePointerInfo(TrmpAddr, 22));
27463
27464 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27465 } else {
27466 const Function *Func =
27467 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27468 CallingConv::ID CC = Func->getCallingConv();
27469 unsigned NestReg;
27470
27471 switch (CC) {
27472 default:
27473 llvm_unreachable("Unsupported calling convention");
27474 case CallingConv::C:
27475 case CallingConv::X86_StdCall: {
27476 // Pass 'nest' parameter in ECX.
27477 // Must be kept in sync with X86CallingConv.td
27478 NestReg = X86::ECX;
27479
27480 // Check that ECX wasn't needed by an 'inreg' parameter.
27481 FunctionType *FTy = Func->getFunctionType();
27482 const AttributeList &Attrs = Func->getAttributes();
27483
27484 if (!Attrs.isEmpty() && !Func->isVarArg()) {
27485 unsigned InRegCount = 0;
27486 unsigned Idx = 0;
27487
27488 for (FunctionType::param_iterator I = FTy->param_begin(),
27489 E = FTy->param_end(); I != E; ++I, ++Idx)
27490 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
27491 const DataLayout &DL = DAG.getDataLayout();
27492 // FIXME: should only count parameters that are lowered to integers.
27493 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
27494 }
27495
27496 if (InRegCount > 2) {
27497 report_fatal_error("Nest register in use - reduce number of inreg"
27498 " parameters!");
27499 }
27500 }
27501 break;
27502 }
27503 case CallingConv::X86_FastCall:
27504 case CallingConv::X86_ThisCall:
27505 case CallingConv::Fast:
27506 case CallingConv::Tail:
27507 case CallingConv::SwiftTail:
27508 // Pass 'nest' parameter in EAX.
27509 // Must be kept in sync with X86CallingConv.td
27510 NestReg = X86::EAX;
27511 break;
27512 }
27513
27514 SDValue OutChains[4];
27515 SDValue Addr, Disp;
27516
27517 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27518 DAG.getConstant(10, dl, MVT::i32));
27519 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
27520
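    // The stores below emit the following 32-bit trampoline (offsets in bytes):
    //   +0:  B8+reg <Nest:4>   movl $Nest, NestReg (%ecx or %eax)
    //   +5:  E9 <rel32:4>      jmp  FPtr, where rel32 = FPtr - (Trmp + 10)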
27521 // This is storing the opcode for MOV32ri.
27522 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
27523 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27524 OutChains[0] =
27525 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
27526 Trmp, MachinePointerInfo(TrmpAddr));
27527
27528 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27529 DAG.getConstant(1, dl, MVT::i32));
27530 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
27531 MachinePointerInfo(TrmpAddr, 1), Align(1));
27532
27533 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
27534 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27535 DAG.getConstant(5, dl, MVT::i32));
27536 OutChains[2] =
27537 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
27538 MachinePointerInfo(TrmpAddr, 5), Align(1));
27539
27540 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27541 DAG.getConstant(6, dl, MVT::i32));
27542 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
27543 MachinePointerInfo(TrmpAddr, 6), Align(1));
27544
27545 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27546 }
27547}
27548
27549SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
27550 SelectionDAG &DAG) const {
27551 /*
27552 The rounding mode is in bits 11:10 of FPSR, and has the following
27553 settings:
27554 00 Round to nearest
27555 01 Round to -inf
27556 10 Round to +inf
27557 11 Round to 0
27558
27559 GET_ROUNDING, on the other hand, expects the following:
27560 -1 Undefined
27561 0 Round to 0
27562 1 Round to nearest
27563 2 Round to +inf
27564 3 Round to -inf
27565
27566 To perform the conversion, we use a packed lookup table of the four 2-bit
27567 values that we can index by FPSR[11:10]
27568 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
27569
27570 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
27571 */
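  // For example, FPSR[11:10] == 01 (round to -inf) gives a shift amount of
  // (0x400 >> 9) == 2, and (0x2d >> 2) & 3 == 3, which is GET_ROUNDING's
  // "round to -inf" value.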
27572
27573 MachineFunction &MF = DAG.getMachineFunction();
27574 MVT VT = Op.getSimpleValueType();
27575 SDLoc DL(Op);
27576
27577 // Save FP Control Word to stack slot
27578 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
27579 SDValue StackSlot =
27580 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
27581
27582 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
27583
27584 SDValue Chain = Op.getOperand(0);
27585 SDValue Ops[] = {Chain, StackSlot};
27586 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
27587 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
27588 MachineMemOperand::MOStore);
27589
27590 // Load FP Control Word from stack slot
27591 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
27592 Chain = CWD.getValue(1);
27593
27594 // Mask and turn the control bits into a shift for the lookup table.
27595 SDValue Shift =
27596 DAG.getNode(ISD::SRL, DL, MVT::i16,
27597 DAG.getNode(ISD::AND, DL, MVT::i16,
27598 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
27599 DAG.getConstant(9, DL, MVT::i8));
27600 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
27601
27602 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27603 SDValue RetVal =
27604 DAG.getNode(ISD::AND, DL, MVT::i32,
27605 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27606 DAG.getConstant(3, DL, MVT::i32));
27607
27608 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27609
27610 return DAG.getMergeValues({RetVal, Chain}, DL);
27611}
27612
27613SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27614 SelectionDAG &DAG) const {
27615 MachineFunction &MF = DAG.getMachineFunction();
27616 SDLoc DL(Op);
27617 SDValue Chain = Op.getNode()->getOperand(0);
27618
27619 // FP control word may be set only from data in memory. So we need to allocate
27620 // stack space to save/load FP control word.
27621 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27622 SDValue StackSlot =
27623 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27624 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27625 MachineMemOperand *MMO =
27626 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27627
27628 // Store FP control word into memory.
27629 SDValue Ops[] = {Chain, StackSlot};
27630 Chain = DAG.getMemIntrinsicNode(
27631 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27632
27633 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27634 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27635 Chain = CWD.getValue(1);
27636 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27637 DAG.getConstant(0xf3ff, DL, MVT::i16));
27638
27639 // Calculate new rounding mode.
27640 SDValue NewRM = Op.getNode()->getOperand(1);
27641 SDValue RMBits;
27642 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27643 uint64_t RM = CVal->getZExtValue();
27644 int FieldVal;
27645 switch (static_cast<RoundingMode>(RM)) {
27646 // clang-format off
27647 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27648 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
27649 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
27650 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
27651 default:
27652 llvm_unreachable("rounding mode is not supported by X86 hardware");
27653 // clang-format on
27654 }
27655 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27656 } else {
27657 // Need to convert argument into bits of control word:
27658 // 0 Round to 0 -> 11
27659 // 1 Round to nearest -> 00
27660 // 2 Round to +inf -> 10
27661 // 3 Round to -inf -> 01
27662 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
27663 // To make the conversion, put all these values into a value 0xc9 and shift
27664 // it left depending on the rounding mode:
27665 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27666 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
27667 // ...
27668 // (0xc9 << (2 * NewRM + 4)) & 0xc00
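    // For example, NewRM == 2 (round to +inf) gives a shift of 8, and
    // (0xc9 << 8) & 0xc00 == 0x800, i.e. X86::rmUpward (bits 11:10 == 10).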
27669 SDValue ShiftValue =
27670 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27671 DAG.getNode(ISD::ADD, DL, MVT::i32,
27672 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27673 DAG.getConstant(1, DL, MVT::i8)),
27674 DAG.getConstant(4, DL, MVT::i32)));
27675 SDValue Shifted =
27676 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27677 ShiftValue);
27678 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27679 DAG.getConstant(0xc00, DL, MVT::i16));
27680 }
27681
27682 // Update rounding mode bits and store the new FP Control Word into stack.
27683 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27684 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
27685
27686 // Load FP control word from the slot.
27687 SDValue OpsLD[] = {Chain, StackSlot};
27688 MachineMemOperand *MMOL =
27689 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27690 Chain = DAG.getMemIntrinsicNode(
27691 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27692
27693 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
27694 // same way but in bits 14:13.
27695 if (Subtarget.hasSSE1()) {
27696 // Store MXCSR into memory.
27697 Chain = DAG.getNode(
27698 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27699 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27700 StackSlot);
27701
27702 // Load MXCSR from stack slot and clear RM field (bits 14:13).
27703 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27704 Chain = CWD.getValue(1);
27705 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27706 DAG.getConstant(0xffff9fff, DL, MVT::i32));
27707
27708 // Shift X87 RM bits from 11:10 to 14:13.
27709 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27710 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27711 DAG.getConstant(3, DL, MVT::i8));
27712
27713 // Update rounding mode bits and store the new FP Control Word into stack.
27714 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27715 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
27716
27717 // Load MXCSR from the slot.
27718 Chain = DAG.getNode(
27719 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27720 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27721 StackSlot);
27722 }
27723
27724 return Chain;
27725}
27726
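// FNSTENV writes a 28-byte x87 environment image; MXCSR is stored in the
// following 4 bytes, giving a 32-byte FP environment block.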
27727const unsigned X87StateSize = 28;
27728const unsigned FPStateSize = 32;
27729[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
27730
27731SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
27732 SelectionDAG &DAG) const {
27733 MachineFunction &MF = DAG.getMachineFunction();
27734 SDLoc DL(Op);
27735 SDValue Chain = Op->getOperand(0);
27736 SDValue Ptr = Op->getOperand(1);
27737 auto *Node = cast<FPStateAccessSDNode>(Op);
27738 EVT MemVT = Node->getMemoryVT();
27739 assert((MemVT.getSizeInBits() == FPStateSizeInBits) && "Unexpected VT");
27740 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27741
27742 // Get the x87 state, if present.
27743 if (Subtarget.hasX87()) {
27744 Chain =
27745 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
27746 {Chain, Ptr}, MemVT, MMO);
27747
27748 // FNSTENV changes the exception mask, so load back the stored environment.
27749 MachineMemOperand::Flags NewFlags =
27750 MachineMemOperand::MOLoad |
27751 (MMO->getFlags() & ~MachineMemOperand::MOStore);
27752 MMO = MF.getMachineMemOperand(MMO, NewFlags);
27753 Chain =
27754 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27755 {Chain, Ptr}, MemVT, MMO);
27756 }
27757
27758 // If target supports SSE, get MXCSR as well.
27759 if (Subtarget.hasSSE1()) {
27760 // Get pointer to the MXCSR location in memory.
27761 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
27762 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27763 DAG.getConstant(X87StateSize, DL, PtrVT));
27764 // Store MXCSR into memory.
27765 Chain = DAG.getNode(
27766 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27767 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27768 MXCSRAddr);
27769 }
27770
27771 return Chain;
27772}
27773
27774static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, SDLoc DL,
27775 EVT MemVT, MachineMemOperand *MMO,
27776 SelectionDAG &DAG,
27777 const X86Subtarget &Subtarget) {
27778 // Set the x87 state, if present.
27779 if (Subtarget.hasX87())
27780 Chain =
27781 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27782 {Chain, Ptr}, MemVT, MMO);
27783 // If target supports SSE, set MXCSR as well.
27784 if (Subtarget.hasSSE1()) {
27785 // Get pointer to the MXCSR location in memory.
27786 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
27787 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27788 DAG.getConstant(X87StateSize, DL, PtrVT));
27789 // Load MXCSR from memory.
27790 Chain = DAG.getNode(
27791 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27792 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27793 MXCSRAddr);
27794 }
27795 return Chain;
27796}
27797
27798SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
27799 SelectionDAG &DAG) const {
27800 SDLoc DL(Op);
27801 SDValue Chain = Op->getOperand(0);
27802 SDValue Ptr = Op->getOperand(1);
27803 auto *Node = cast<FPStateAccessSDNode>(Op);
27804 EVT MemVT = Node->getMemoryVT();
27805 assert((MemVT.getSizeInBits() == FPStateSizeInBits) && "Unexpected VT");
27806 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27807 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
27808}
27809
27810SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
27811 SelectionDAG &DAG) const {
27812 MachineFunction &MF = DAG.getMachineFunction();
27813 SDLoc DL(Op);
27814 SDValue Chain = Op.getNode()->getOperand(0);
27815
27816 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
27817 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
27818 SmallVector<Constant *, 8> FPEnvVals;
27819
27820 // x87 FPU Control Word: mask all floating-point exceptions, set rounding to
27821 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
27822 // for compatibility with glibc.
27823 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
27824 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
27825 Constant *Zero = ConstantInt::get(ItemTy, 0);
27826 for (unsigned I = 0; I < 6; ++I)
27827 FPEnvVals.push_back(Zero);
27828
27829 // MXCSR: mask all floating-point exceptions, set rounding to nearest, clear
27830 // all exception flags, and set DAZ and FTZ to 0.
27831 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
27832 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
27833 MVT PtrVT = getPointerTy(DAG.getDataLayout());
27834 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
27835 MachinePointerInfo MPI =
27836 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
27837 MachineMemOperand *MMO = MF.getMachineMemOperand(
27838 MPI, MachineMemOperand::MOStore, X87StateSize, Align(4));
27839
27840 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
27841}
27842
27843/// Lower a vector CTLZ using native supported vector CTLZ instruction.
27844//
27845// i8/i16 vector implemented using dword LZCNT vector instruction
27846// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
27847 // split the vector, perform the operation on its Lo and Hi parts and
27848// concatenate the results.
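// E.g. for a vXi8 element x, ctlz_i32(zext32(x)) == ctlz_i8(x) + 24, so the
// final sub removes the extra 32 - EltSize leading zero bits.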
27849static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27850 const X86Subtarget &Subtarget) {
27851 assert(Op.getOpcode() == ISD::CTLZ);
27852 SDLoc dl(Op);
27853 MVT VT = Op.getSimpleValueType();
27854 MVT EltVT = VT.getVectorElementType();
27855 unsigned NumElems = VT.getVectorNumElements();
27856
27857 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
27858 "Unsupported element type");
27859
27860 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
27861 if (NumElems > 16 ||
27862 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27863 return splitVectorIntUnary(Op, DAG, dl);
27864
27865 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27866 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
27867 "Unsupported value type for operation");
27868
27869 // Use native supported vector instruction vplzcntd.
27870 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27871 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27872 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
27873 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27874
27875 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27876}
27877
27878// Lower CTLZ using a PSHUFB lookup table implementation.
27879static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27880 const X86Subtarget &Subtarget,
27881 SelectionDAG &DAG) {
27882 MVT VT = Op.getSimpleValueType();
27883 int NumElts = VT.getVectorNumElements();
27884 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27885 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27886
27887 // Per-nibble leading zero PSHUFB lookup table.
27888 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27889 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27890 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27891 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
27892
27893 SmallVector<SDValue, 64> LUTVec;
27894 for (int i = 0; i < NumBytes; ++i)
27895 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27896 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27897
27898 // Begin by bitcasting the input to byte vector, then split those bytes
27899 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27900 // If the hi input nibble is zero then we add both results together, otherwise
27901 // we just take the hi result (by masking the lo result to zero before the
27902 // add).
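  // E.g. for the byte 0x0a the hi nibble is 0 and the lo nibble is 0xa, so the
  // result is LUT[0] + LUT[0xa] = 4 + 0 = 4 leading zeros.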
27903 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27904 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
27905
27906 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
27907 SDValue Lo = Op0;
27908 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
27909 SDValue HiZ;
27910 if (CurrVT.is512BitVector()) {
27911 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27912 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
27913 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27914 } else {
27915 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
27916 }
27917
27918 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
27919 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
27920 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
27921 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
27922
27923 // Merge result back from vXi8 back to VT, working on the lo/hi halves
27924 // of the current vector width in the same way we did for the nibbles.
27925 // If the upper half of the input element is zero then add the halves'
27926 // leading zero counts together, otherwise just use the upper half's.
27927 // Double the width of the result until we are at target width.
27928 while (CurrVT != VT) {
27929 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
27930 int CurrNumElts = CurrVT.getVectorNumElements();
27931 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
27932 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
27933 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
27934
27935 // Check if the upper half of the input element is zero.
27936 if (CurrVT.is512BitVector()) {
27937 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27938 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
27939 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27940 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27941 } else {
27942 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
27943 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27944 }
27945 HiZ = DAG.getBitcast(NextVT, HiZ);
27946
27947 // Move the upper/lower halves to the lower bits as we'll be extending to
27948 // NextVT. Mask the lower result to zero if HiZ is true and add the results
27949 // together.
27950 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
27951 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
27952 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
27953 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
27954 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
27955 CurrVT = NextVT;
27956 }
27957
27958 return Res;
27959}
27960
27961static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
27962 const X86Subtarget &Subtarget,
27963 SelectionDAG &DAG) {
27964 MVT VT = Op.getSimpleValueType();
27965
27966 if (Subtarget.hasCDI() &&
27967 // vXi8 vectors need to be promoted to 512-bits for vXi32.
27968 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
27969 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
27970
27971 // Decompose 256-bit ops into smaller 128-bit ops.
27972 if (VT.is256BitVector() && !Subtarget.hasInt256())
27973 return splitVectorIntUnary(Op, DAG, DL);
27974
27975 // Decompose 512-bit ops into smaller 256-bit ops.
27976 if (VT.is512BitVector() && !Subtarget.hasBWI())
27977 return splitVectorIntUnary(Op, DAG, DL);
27978
27979 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
27980 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
27981}
27982
27983static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
27984 SelectionDAG &DAG) {
27985 MVT VT = Op.getSimpleValueType();
27986 MVT OpVT = VT;
27987 unsigned NumBits = VT.getSizeInBits();
27988 SDLoc dl(Op);
27989 unsigned Opc = Op.getOpcode();
27990
27991 if (VT.isVector())
27992 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
27993
27994 Op = Op.getOperand(0);
27995 if (VT == MVT::i8) {
27996 // Zero extend to i32 since there is not an i8 bsr.
27997 OpVT = MVT::i32;
27998 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
27999 }
28000
28001 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
28002 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
28003 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
28004
28005 if (Opc == ISD::CTLZ) {
28006 // If src is zero (i.e. bsr sets ZF), returns NumBits.
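    // (2 * NumBits - 1) ^ (NumBits - 1) == NumBits, so the CMOV below makes a
    // zero source produce NumBits after the final XOR.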
28007 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28008 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28009 Op.getValue(1)};
28010 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
28011 }
28012
28013 // Finally xor with NumBits-1.
28014 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
28015 DAG.getConstant(NumBits - 1, dl, OpVT));
28016
28017 if (VT == MVT::i8)
28018 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
28019 return Op;
28020}
28021
28022static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
28023 SelectionDAG &DAG) {
28024 MVT VT = Op.getSimpleValueType();
28025 unsigned NumBits = VT.getScalarSizeInBits();
28026 SDValue N0 = Op.getOperand(0);
28027 SDLoc dl(Op);
28028
28029 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
28030 "Only scalar CTTZ requires custom lowering");
28031
28032 // Issue a bsf (scan bits forward) which also sets EFLAGS.
28033 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28034 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
28035
28036 // If src is known never zero we can skip the CMOV.
28037 if (DAG.isKnownNeverZero(N0))
28038 return Op;
28039
28040 // If src is zero (i.e. bsf sets ZF), returns NumBits.
28041 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
28042 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28043 Op.getValue(1)};
28044 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
28045}
28046
28047static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
28048 const X86Subtarget &Subtarget) {
28049 MVT VT = Op.getSimpleValueType();
28050 SDLoc DL(Op);
28051
28052 if (VT == MVT::i16 || VT == MVT::i32)
28053 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
28054
28055 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28056 return splitVectorIntBinary(Op, DAG, DL);
28057
28058 assert(Op.getSimpleValueType().is256BitVector() &&
28059 Op.getSimpleValueType().isInteger() &&
28060 "Only handle AVX 256-bit vector integer operation");
28061 return splitVectorIntBinary(Op, DAG, DL);
28062}
28063
28064static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
28065 const X86Subtarget &Subtarget) {
28066 MVT VT = Op.getSimpleValueType();
28067 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
28068 unsigned Opcode = Op.getOpcode();
28069 SDLoc DL(Op);
28070
28071 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
28072 (VT.is256BitVector() && !Subtarget.hasInt256())) {
28073 assert(Op.getSimpleValueType().isInteger() &&
28074 "Only handle AVX vector integer operation");
28075 return splitVectorIntBinary(Op, DAG, DL);
28076 }
28077
28078 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
28079 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28080 EVT SetCCResultType =
28081 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28082
28083 unsigned BitWidth = VT.getScalarSizeInBits();
28084 if (Opcode == ISD::USUBSAT) {
28085 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
28086 // Handle a special-case with a bit-hack instead of cmp+select:
28087 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28088 // If the target can use VPTERNLOG, DAGToDAG will match this as
28089 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
28090 // "broadcast" constant load.
28091 ConstantSDNode *C = isConstOrConstSplat(Y, true);
28092 if (C && C->getAPIntValue().isSignMask()) {
28093 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28094 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28095 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
28096 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
28097 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
28098 }
28099 }
28100 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
28101 // usubsat X, Y --> (X >u Y) ? X - Y : 0
28102 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
28103 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
28104 // TODO: Move this to DAGCombiner?
28105 if (SetCCResultType == VT &&
28106 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
28107 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
28108 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
28109 }
28110 }
28111
28112 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
28113 (!VT.isVector() || VT == MVT::v2i64)) {
28114 APInt MinVal = APInt::getSignedMinValue(BitWidth);
28115 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
28116 SDValue Zero = DAG.getConstant(0, DL, VT);
28117 SDValue Result =
28118 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
28119 DAG.getVTList(VT, SetCCResultType), X, Y);
28120 SDValue SumDiff = Result.getValue(0);
28121 SDValue Overflow = Result.getValue(1);
28122 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
28123 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
28124 SDValue SumNeg =
28125 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
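    // On overflow the wrapped SumDiff has the opposite sign of the exact
    // result, so a negative SumDiff means we overflowed towards SatMax and a
    // non-negative one means we saturate to SatMin.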
28126 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
28127 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
28128 }
28129
28130 // Use default expansion.
28131 return SDValue();
28132}
28133
28134static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
28135 SelectionDAG &DAG) {
28136 MVT VT = Op.getSimpleValueType();
28137 SDLoc DL(Op);
28138
28139 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
28140 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28141 // 8-bit integer abs to NEG and CMOV.
28142 SDValue N0 = Op.getOperand(0);
28143 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28144 DAG.getConstant(0, DL, VT), N0);
28145 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
28146 SDValue(Neg.getNode(), 1)};
28147 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
28148 }
28149
28150 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28151 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
28152 SDValue Src = Op.getOperand(0);
28153 SDValue Sub =
28154 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
28155 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
28156 }
28157
28158 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
28159 assert(VT.isInteger() &&
28160 "Only handle AVX 256-bit vector integer operation");
28161 return splitVectorIntUnary(Op, DAG, DL);
28162 }
28163
28164 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28165 return splitVectorIntUnary(Op, DAG, DL);
28166
28167 // Default to expand.
28168 return SDValue();
28169}
28170
28171static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
28172 SelectionDAG &DAG) {
28173 MVT VT = Op.getSimpleValueType();
28174 SDLoc DL(Op);
28175
28176 // For AVX1 cases, split to use legal ops.
28177 if (VT.is256BitVector() && !Subtarget.hasInt256())
28178 return splitVectorIntBinary(Op, DAG, DL);
28179
28180 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28181 return splitVectorIntBinary(Op, DAG, DL);
28182
28183 // Default to expand.
28184 return SDValue();
28185}
28186
28187static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
28188 SelectionDAG &DAG) {
28189 MVT VT = Op.getSimpleValueType();
28190 SDLoc DL(Op);
28191
28192 // For AVX1 cases, split to use legal ops.
28193 if (VT.is256BitVector() && !Subtarget.hasInt256())
28194 return splitVectorIntBinary(Op, DAG, DL);
28195
28196 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28197 return splitVectorIntBinary(Op, DAG, DL);
28198
28199 // Default to expand.
28200 return SDValue();
28201}
28202
28203static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
28204 SelectionDAG &DAG) {
28205 assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) &&
28206 "Expected FMAXIMUM or FMINIMUM opcode");
28207 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28208 EVT VT = Op.getValueType();
28209 SDValue X = Op.getOperand(0);
28210 SDValue Y = Op.getOperand(1);
28211 SDLoc DL(Op);
28212 uint64_t SizeInBits = VT.getScalarSizeInBits();
28213 APInt PreferredZero = APInt::getZero(SizeInBits);
28214 APInt OppositeZero = PreferredZero;
28215 EVT IVT = VT.changeTypeToInteger();
28216 X86ISD::NodeType MinMaxOp;
28217 if (Op.getOpcode() == ISD::FMAXIMUM) {
28218 MinMaxOp = X86ISD::FMAX;
28219 OppositeZero.setSignBit();
28220 } else {
28221 PreferredZero.setSignBit();
28222 MinMaxOp = X86ISD::FMIN;
28223 }
28224 EVT SetCCType =
28225 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28226
28227 // The tables below show the expected result of Max in cases of NaN and
28228 // signed zeros.
28229 //
28230 // Y Y
28231 // Num xNaN +0 -0
28232 // --------------- ---------------
28233 // Num | Max | Y | +0 | +0 | +0 |
28234 // X --------------- X ---------------
28235 // xNaN | X | X/Y | -0 | +0 | -0 |
28236 // --------------- ---------------
28237 //
28238 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
28239 // reordering.
28240 //
28241 // We check if any of operands is NaN and return NaN. Then we check if any of
28242 // operands is zero or negative zero (for fmaximum and fminimum respectively)
28243 // to ensure the correct zero is returned.
28244 auto MatchesZero = [](SDValue Op, APInt Zero) {
28245 Op = peekThroughBitcasts(Op);
28246 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
28247 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
28248 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
28249 return CstOp->getAPIntValue() == Zero;
28250 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
28251 Op->getOpcode() == ISD::SPLAT_VECTOR) {
28252 for (const SDValue &OpVal : Op->op_values()) {
28253 if (OpVal.isUndef())
28254 continue;
28255 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
28256 if (!CstOp)
28257 return false;
28258 if (!CstOp->getValueAPF().isZero())
28259 continue;
28260 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
28261 return false;
28262 }
28263 return true;
28264 }
28265 return false;
28266 };
28267
28268 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
28269 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
28270 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
28271 Op->getFlags().hasNoSignedZeros() ||
28272 DAG.isKnownNeverZeroFloat(X) ||
28273 DAG.isKnownNeverZeroFloat(Y);
28274 SDValue NewX, NewY;
28275 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
28276 MatchesZero(X, OppositeZero)) {
28277 // Operands are already in right order or order does not matter.
28278 NewX = X;
28279 NewY = Y;
28280 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
28281 NewX = Y;
28282 NewY = X;
28283 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
28284 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
28285 if (IsXNeverNaN)
28286 std::swap(X, Y);
28287 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
28288 // to an xmm register.
28289 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
28290 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
28291 // Bits of classes:
28292 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
28293 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
28294 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
28295 DL, MVT::i32);
28296 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
28297 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
28298 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
28299 DAG.getIntPtrConstant(0, DL));
28300 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
28301 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
28302 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
28303 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28304 } else {
28305 SDValue IsXSigned;
28306 if (Subtarget.is64Bit() || VT != MVT::f64) {
28307 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
28308 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
28309 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
28310 } else {
28311 assert(VT == MVT::f64);
28312 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
28313 DAG.getConstantFP(0, DL, MVT::v2f64), X,
28314 DAG.getIntPtrConstant(0, DL));
28315 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
28316 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
28317 DAG.getIntPtrConstant(1, DL));
28318 Hi = DAG.getBitcast(MVT::i32, Hi);
28319 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
28320 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
28321 *DAG.getContext(), MVT::i32);
28322 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
28323 }
28324 if (MinMaxOp == X86ISD::FMAX) {
28325 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28326 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28327 } else {
28328 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28329 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28330 }
28331 }
28332
28333 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
28334 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
28335
28336 // If we did not reorder the operands for signed-zero handling, we still need
28337 // to handle NaN, and the second operand is known never to be NaN, put it in
28338 // the first operand so no NaN post-processing is needed after the max/min.
28339 if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
28340 std::swap(NewX, NewY);
28341
28342 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28343
28344 if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
28345 return MinMax;
28346
28347 SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
28348 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
28349}
28350
28351static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
28352 SelectionDAG &DAG) {
28353 MVT VT = Op.getSimpleValueType();
28354 SDLoc dl(Op);
28355
28356 // For AVX1 cases, split to use legal ops.
28357 if (VT.is256BitVector() && !Subtarget.hasInt256())
28358 return splitVectorIntBinary(Op, DAG, dl);
28359
28360 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
28361 return splitVectorIntBinary(Op, DAG, dl);
28362
28363 bool IsSigned = Op.getOpcode() == ISD::ABDS;
28364 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28365
28366 // TODO: Move to TargetLowering expandABD() once we have ABD promotion.
28367 if (VT.isScalarInteger()) {
28368 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
28369 MVT WideVT = MVT::getIntegerVT(WideBits);
28370 if (TLI.isTypeLegal(WideVT)) {
28371 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
28372 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
28373 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28374 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
28375 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
28376 SDValue Diff = DAG.getNode(ISD::SUB, dl, WideVT, LHS, RHS);
28377 SDValue AbsDiff = DAG.getNode(ISD::ABS, dl, WideVT, Diff);
28378 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
28379 }
28380 }
28381
28382 // TODO: Move to TargetLowering expandABD().
28383 if (!Subtarget.hasSSE41() &&
28384 ((IsSigned && VT == MVT::v16i8) || VT == MVT::v4i32)) {
28385 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
28386 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
28387 ISD::CondCode CC = IsSigned ? ISD::SETGT : ISD::SETUGT;
28388 SDValue Cmp = DAG.getSetCC(dl, VT, LHS, RHS, CC);
28389 SDValue Diff0 = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
28390 SDValue Diff1 = DAG.getNode(ISD::SUB, dl, VT, RHS, LHS);
28391 return getBitSelect(dl, VT, Diff0, Diff1, Cmp, DAG);
28392 }
28393
28394 // Default to expand.
28395 return SDValue();
28396}
28397
28398static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
28399 SelectionDAG &DAG) {
28400 SDLoc dl(Op);
28401 MVT VT = Op.getSimpleValueType();
28402
28403 // Decompose 256-bit ops into 128-bit ops.
28404 if (VT.is256BitVector() && !Subtarget.hasInt256())
28405 return splitVectorIntBinary(Op, DAG, dl);
28406
28407 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28408 return splitVectorIntBinary(Op, DAG, dl);
28409
28410 SDValue A = Op.getOperand(0);
28411 SDValue B = Op.getOperand(1);
28412
28413 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
28414 // vector pairs, multiply and truncate.
28415 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
28416 unsigned NumElts = VT.getVectorNumElements();
28417
28418 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28419 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28420 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
28421 return DAG.getNode(
28422 ISD::TRUNCATE, dl, VT,
28423 DAG.getNode(ISD::MUL, dl, ExVT,
28424 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
28425 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
28426 }
28427
28428 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28429
28430 // Extract the lo/hi parts to any extend to i16.
28431 // We're going to mask off the low byte of each result element of the
28432 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
28433 // element.
28434 SDValue Undef = DAG.getUNDEF(VT);
28435 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
28436 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
28437
28438 SDValue BLo, BHi;
28439 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28440 // If the RHS is a constant, manually unpackl/unpackh.
28441 SmallVector<SDValue, 16> LoOps, HiOps;
28442 for (unsigned i = 0; i != NumElts; i += 16) {
28443 for (unsigned j = 0; j != 8; ++j) {
28444 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
28445 MVT::i16));
28446 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
28447 MVT::i16));
28448 }
28449 }
28450
28451 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28452 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28453 } else {
28454 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
28455 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
28456 }
28457
28458 // Multiply, mask the lower 8bits of the lo/hi results and pack.
28459 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
28460 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
28461 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28462 }
28463
28464 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
28465 if (VT == MVT::v4i32) {
28466 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
28467 "Should not custom lower when pmulld is available!");
28468
28469 // Extract the odd parts.
28470 static const int UnpackMask[] = { 1, -1, 3, -1 };
28471 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
28472 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
28473
28474 // Multiply the even parts.
28475 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28476 DAG.getBitcast(MVT::v2i64, A),
28477 DAG.getBitcast(MVT::v2i64, B));
28478 // Now multiply odd parts.
28479 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28480 DAG.getBitcast(MVT::v2i64, Aodds),
28481 DAG.getBitcast(MVT::v2i64, Bodds));
28482
28483 Evens = DAG.getBitcast(VT, Evens);
28484 Odds = DAG.getBitcast(VT, Odds);
28485
28486 // Merge the two vectors back together with a shuffle. This expands into 2
28487 // shuffles.
28488 static const int ShufMask[] = { 0, 4, 2, 6 };
28489 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
28490 }
28491
28492 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
28493 "Only know how to lower V2I64/V4I64/V8I64 multiply");
28494 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
28495
28496 // Ahi = psrlqi(a, 32);
28497 // Bhi = psrlqi(b, 32);
28498 //
28499 // AloBlo = pmuludq(a, b);
28500 // AloBhi = pmuludq(a, Bhi);
28501 // AhiBlo = pmuludq(Ahi, b);
28502 //
28503 // Hi = psllqi(AloBhi + AhiBlo, 32);
28504 // return AloBlo + Hi;
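  // This is the usual 64x64->64 decomposition: with A = Alo + Ahi*2^32 and
  // B = Blo + Bhi*2^32, A*B mod 2^64 == Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32),
  // since the Ahi*Bhi term is shifted out entirely.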
28505 KnownBits AKnown = DAG.computeKnownBits(A);
28506 KnownBits BKnown = DAG.computeKnownBits(B);
28507
28508 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
28509 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
28510 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
28511
28512 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
28513 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
28514 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
28515
28516 SDValue Zero = DAG.getConstant(0, dl, VT);
28517
28518 // Only multiply lo/hi halves that aren't known to be zero.
28519 SDValue AloBlo = Zero;
28520 if (!ALoIsZero && !BLoIsZero)
28521 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
28522
28523 SDValue AloBhi = Zero;
28524 if (!ALoIsZero && !BHiIsZero) {
28525 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
28526 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
28527 }
28528
28529 SDValue AhiBlo = Zero;
28530 if (!AHiIsZero && !BLoIsZero) {
28531 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
28532 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
28533 }
28534
28535 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
28536 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
28537
28538 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
28539}
28540
28541static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
28542 MVT VT, bool IsSigned,
28543 const X86Subtarget &Subtarget,
28544 SelectionDAG &DAG,
28545 SDValue *Low = nullptr) {
28546 unsigned NumElts = VT.getVectorNumElements();
28547
28548 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
28549 // to a vXi16 type. Do the multiplies, shift the results and pack the half
28550 // lane results back together.
28551
28552 // We'll take different approaches for signed and unsigned.
28553 // For unsigned we'll use punpcklbw/punpckhbw to put zero extend the bytes
28554 // and use pmullw to calculate the full 16-bit product.
28555 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
28556 // shift them left into the upper byte of each word. This allows us to use
28557 // pmulhw to calculate the full 16-bit product. This trick means we don't
28558 // need to sign extend the bytes to use pmullw.
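  // i.e. pmulhw((a << 8), (b << 8)) returns bits 31:16 of (a*b) << 16, which is
  // exactly the full signed 16-bit product a*b.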
28559
28560 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28561 SDValue Zero = DAG.getConstant(0, dl, VT);
28562
28563 SDValue ALo, AHi;
28564 if (IsSigned) {
28565 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
28566 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
28567 } else {
28568 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
28569 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
28570 }
28571
28572 SDValue BLo, BHi;
28573 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28574 // If the RHS is a constant, manually unpackl/unpackh and extend.
28575 SmallVector<SDValue, 16> LoOps, HiOps;
28576 for (unsigned i = 0; i != NumElts; i += 16) {
28577 for (unsigned j = 0; j != 8; ++j) {
28578 SDValue LoOp = B.getOperand(i + j);
28579 SDValue HiOp = B.getOperand(i + j + 8);
28580
28581 if (IsSigned) {
28582 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
28583 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
28584 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
28585 DAG.getConstant(8, dl, MVT::i16));
28586 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
28587 DAG.getConstant(8, dl, MVT::i16));
28588 } else {
28589 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
28590 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
28591 }
28592
28593 LoOps.push_back(LoOp);
28594 HiOps.push_back(HiOp);
28595 }
28596 }
28597
28598 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28599 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28600 } else if (IsSigned) {
28601 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
28602 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
28603 } else {
28604 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
28605 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
28606 }
28607
28608 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
28609 // pack back to vXi8.
28610 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
28611 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
28612 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
28613
28614 if (Low)
28615 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28616
28617 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
28618}
28619
28620static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
28621 SelectionDAG &DAG) {
28622 SDLoc dl(Op);
28623 MVT VT = Op.getSimpleValueType();
28624 bool IsSigned = Op->getOpcode() == ISD::MULHS;
28625 unsigned NumElts = VT.getVectorNumElements();
28626 SDValue A = Op.getOperand(0);
28627 SDValue B = Op.getOperand(1);
28628
28629 // Decompose 256-bit ops into 128-bit ops.
28630 if (VT.is256BitVector() && !Subtarget.hasInt256())
28631 return splitVectorIntBinary(Op, DAG, dl);
28632
28633 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28634 return splitVectorIntBinary(Op, DAG, dl);
28635
28636 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
28637 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
28638 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
28639 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
28640
28641 // PMULxD operations multiply each even value (starting at 0) of LHS with
28642 // the related value of RHS and produce a widened result.
28643 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28644 // => <2 x i64> <ae|cg>
28645 //
28646 // In other words, to have all the results, we need to perform two PMULxD:
28647 // 1. one with the even values.
28648 // 2. one with the odd values.
28649 // To achieve #2, we need to place the odd values at an even position.
28650 //
28651 // Place the odd value at an even position (basically, shift all values 1
28652 // step to the left):
28653 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
28654 9, -1, 11, -1, 13, -1, 15, -1};
28655 // <a|b|c|d> => <b|undef|d|undef>
28656 SDValue Odd0 =
28657 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
28658 // <e|f|g|h> => <f|undef|h|undef>
28659 SDValue Odd1 =
28660 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
28661
28662 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
28663 // ints.
28664 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
28665 unsigned Opcode =
28666 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
28667 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28668 // => <2 x i64> <ae|cg>
28669 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28670 DAG.getBitcast(MulVT, A),
28671 DAG.getBitcast(MulVT, B)));
28672 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
28673 // => <2 x i64> <bf|dh>
28674 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28675 DAG.getBitcast(MulVT, Odd0),
28676 DAG.getBitcast(MulVT, Odd1)));
28677
28678 // Shuffle it back into the right order.
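    // E.g. for v4i32 the mask is {1, 5, 3, 7}: take the high 32 bits of each
    // 64-bit product from Mul1 (<ae|cg>) and Mul2 (<bf|dh>), giving
    // <ae_hi, bf_hi, cg_hi, dh_hi>.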
28679 SmallVector<int, 16> ShufMask(NumElts);
28680 for (int i = 0; i != (int)NumElts; ++i)
28681 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
28682
28683 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
28684
28685 // If we have a signed multiply but no PMULDQ fix up the result of an
28686 // unsigned multiply.
28687 if (IsSigned && !Subtarget.hasSSE41()) {
28688 SDValue Zero = DAG.getConstant(0, dl, VT);
28689 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
28690 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
28691 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
28692 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
28693
28694 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
28695 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
28696 }
28697
28698 return Res;
28699 }
28700
28701 // Only i8 vectors should need custom lowering after this.
28702 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28703 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28704 "Unsupported vector type");
28705
28706 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
28707 // logical shift down the upper half and pack back to i8.
28708
28709 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
28710 // and then ashr/lshr the upper bits down to the lower bits before multiply.
28711
28712 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28713 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28714 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28715 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28716 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28717 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28718 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28719 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28720 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28721 }
28722
28723 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
28724}
28725
28726// Custom lowering for SMULO/UMULO.
28727static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
28728 SelectionDAG &DAG) {
28729 MVT VT = Op.getSimpleValueType();
28730
28731 // Scalars defer to LowerXALUO.
28732 if (!VT.isVector())
28733 return LowerXALUO(Op, DAG);
28734
28735 SDLoc dl(Op);
28736 bool IsSigned = Op->getOpcode() == ISD::SMULO;
28737 SDValue A = Op.getOperand(0);
28738 SDValue B = Op.getOperand(1);
28739 EVT OvfVT = Op->getValueType(1);
28740
28741 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
28742 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
28743 // Extract the LHS Lo/Hi vectors
28744 SDValue LHSLo, LHSHi;
28745 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
28746
28747 // Extract the RHS Lo/Hi vectors
28748 SDValue RHSLo, RHSHi;
28749 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
28750
28751 EVT LoOvfVT, HiOvfVT;
28752 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
28753 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
28754 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
28755
28756 // Issue the split operations.
28757 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
28758 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
28759
28760 // Join the separate data results and the overflow results.
28761 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28762 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
28763 Hi.getValue(1));
28764
28765 return DAG.getMergeValues({Res, Ovf}, dl);
28766 }
28767
28768 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28769 EVT SetccVT =
28770 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28771
28772 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28773 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28774 unsigned NumElts = VT.getVectorNumElements();
28775 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28776 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28777 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28778 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28779 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28780
28781 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28782
28783 SDValue Ovf;
28784 if (IsSigned) {
28785 SDValue High, LowSign;
28786 if (OvfVT.getVectorElementType() == MVT::i1 &&
28787 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28788 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28789 // Shift the high down filling with sign bits.
28790 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
28791 // Fill all 16 bits with the sign bit from the low.
28792 LowSign =
28793 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
28794 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
28795 15, DAG);
28796 SetccVT = OvfVT;
28797 if (!Subtarget.hasBWI()) {
28798 // We can't do a vXi16 compare so sign extend to v16i32.
28799 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
28800 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
28801 }
28802 } else {
28803 // Otherwise do the compare at vXi8.
28804 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28805 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28806 LowSign =
28807 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28808 }
28809
28810 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28811 } else {
28812 SDValue High =
28813 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28814 if (OvfVT.getVectorElementType() == MVT::i1 &&
28815 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28816 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28817 SetccVT = OvfVT;
28818 if (!Subtarget.hasBWI()) {
28819 // We can't do a vXi16 compare so sign extend to v16i32.
28820 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
28821 }
28822 } else {
28823 // Otherwise do the compare at vXi8.
28824 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28825 }
28826
28827 Ovf =
28828 DAG.getSetCC(dl, SetccVT, High,
28829 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
28830 }
28831
28832 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28833
28834 return DAG.getMergeValues({Low, Ovf}, dl);
28835 }
28836
28837 SDValue Low;
28838 SDValue High =
28839 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
28840
28841 SDValue Ovf;
28842 if (IsSigned) {
28843 // SMULO overflows if the high bits don't match the sign of the low.
28844 SDValue LowSign =
28845 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28846 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28847 } else {
28848 // UMULO overflows if the high bits are non-zero.
28849 Ovf =
28850 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
28851 }
28852
28853 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28854
28855 return DAG.getMergeValues({Low, Ovf}, dl);
28856}
28857
28858SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
28859 assert(Subtarget.isTargetWin64() && "Unexpected target");
28860 EVT VT = Op.getValueType();
28861 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28862 "Unexpected return type for lowering");
28863
28864 if (isa<ConstantSDNode>(Op->getOperand(1))) {
28865 SmallVector<SDValue> Result;
28866 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
28867 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
28868 }
28869
28870 RTLIB::Libcall LC;
28871 bool isSigned;
28872 switch (Op->getOpcode()) {
28873 // clang-format off
28874 default: llvm_unreachable("Unexpected request for libcall!");
28875 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
28876 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
28877 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
28878 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
28879 // clang-format on
28880 }
28881
28882 SDLoc dl(Op);
28883 SDValue InChain = DAG.getEntryNode();
28884
28885 TargetLowering::ArgListTy Args;
28886 TargetLowering::ArgListEntry Entry;
28887 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
28888 EVT ArgVT = Op->getOperand(i).getValueType();
28889 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28890 "Unexpected argument type for lowering");
28891 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28892 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28893 MachinePointerInfo MPI =
28894 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28895 Entry.Node = StackPtr;
28896 InChain =
28897 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
28898 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
28899 Entry.Ty = PointerType::get(ArgTy,0);
28900 Entry.IsSExt = false;
28901 Entry.IsZExt = false;
28902 Args.push_back(Entry);
28903 }
28904
28905 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
28906 getPointerTy(DAG.getDataLayout()));
28907
28908 TargetLowering::CallLoweringInfo CLI(DAG);
28909 CLI.setDebugLoc(dl)
28910 .setChain(InChain)
28911 .setLibCallee(
28912 getLibcallCallingConv(LC),
28913 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
28914 std::move(Args))
28915 .setInRegister()
28916 .setSExtResult(isSigned)
28917 .setZExtResult(!isSigned);
28918
28919 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
28920 return DAG.getBitcast(VT, CallInfo.first);
28921}
28922
28923SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
28924 SelectionDAG &DAG,
28925 SDValue &Chain) const {
28926 assert(Subtarget.isTargetWin64() && "Unexpected target");
28927 EVT VT = Op.getValueType();
28928 bool IsStrict = Op->isStrictFPOpcode();
28929
28930 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
28931 EVT ArgVT = Arg.getValueType();
28932
28933 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28934 "Unexpected return type for lowering");
28935
28936 RTLIB::Libcall LC;
28937 if (Op->getOpcode() == ISD::FP_TO_SINT ||
28938 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
28939 LC = RTLIB::getFPTOSINT(ArgVT, VT);
28940 else
28941 LC = RTLIB::getFPTOUINT(ArgVT, VT);
28942 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
28943
28944 SDLoc dl(Op);
28945 MakeLibCallOptions CallOptions;
28946 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
28947
28948 SDValue Result;
28949 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
28950 // expected VT (i128).
28951 std::tie(Result, Chain) =
28952 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
28953 Result = DAG.getBitcast(VT, Result);
28954 return Result;
28955}
28956
28957SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
28958 SelectionDAG &DAG) const {
28959 assert(Subtarget.isTargetWin64() && "Unexpected target");
28960 EVT VT = Op.getValueType();
28961 bool IsStrict = Op->isStrictFPOpcode();
28962
28963 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
28964 EVT ArgVT = Arg.getValueType();
28965
28966 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28967 "Unexpected argument type for lowering");
28968
28969 RTLIB::Libcall LC;
28970 if (Op->getOpcode() == ISD::SINT_TO_FP ||
28971 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
28972 LC = RTLIB::getSINTTOFP(ArgVT, VT);
28973 else
28974 LC = RTLIB::getUINTTOFP(ArgVT, VT);
28975 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
28976
28977 SDLoc dl(Op);
28978 MakeLibCallOptions CallOptions;
28979 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
28980
28981 // Pass the i128 argument as an indirect argument on the stack.
28982 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28983 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28984 MachinePointerInfo MPI =
28985 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28986 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
28987
28988 SDValue Result;
28989 std::tie(Result, Chain) =
28990 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
28991 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
28992}
28993
28994// Return true if the required (according to Opcode) shift-imm form is natively
28995// supported by the Subtarget
28996static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
28997 unsigned Opcode) {
28998 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
28999 "Unexpected shift opcode");
29000
29001 if (!VT.isSimple())
29002 return false;
29003
29004 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29005 return false;
29006
29007 if (VT.getScalarSizeInBits() < 16)
29008 return false;
29009
29010 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
29011 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
29012 return true;
29013
29014 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
29015 (VT.is256BitVector() && Subtarget.hasInt256());
29016
29017 bool AShift = LShift && (Subtarget.hasAVX512() ||
29018 (VT != MVT::v2i64 && VT != MVT::v4i64));
29019 return (Opcode == ISD::SRA) ? AShift : LShift;
29020}
29021
29022// The shift amount is a variable, but it is the same for all vector lanes.
29023// These instructions are defined together with shift-immediate.
29024 static
29025bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
29026 unsigned Opcode) {
29027 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
29028}
29029
29030// Return true if the required (according to Opcode) variable-shift form is
29031// natively supported by the Subtarget
29032static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
29033 unsigned Opcode) {
29034 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29035 "Unexpected shift opcode");
29036
29037 if (!VT.isSimple())
29038 return false;
29039
29040 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29041 return false;
29042
29043 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
29044 return false;
29045
29046 // vXi16 supported only on AVX-512, BWI
29047 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
29048 return false;
29049
29050 if (Subtarget.hasAVX512() &&
29051 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
29052 return true;
29053
29054 bool LShift = VT.is128BitVector() || VT.is256BitVector();
29055 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
29056 return (Opcode == ISD::SRA) ? AShift : LShift;
29057}
29058
29059static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
29060 const X86Subtarget &Subtarget) {
29061 MVT VT = Op.getSimpleValueType();
29062 SDLoc dl(Op);
29063 SDValue R = Op.getOperand(0);
29064 SDValue Amt = Op.getOperand(1);
29065 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
29066 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29067
29068 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
29069 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
29070 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
29071 SDValue Ex = DAG.getBitcast(ExVT, R);
29072
29073 // ashr(R, 63) === cmp_slt(R, 0)
29074 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
29075 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
29076 "Unsupported PCMPGT op");
29077 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
29078 }
29079
29080 if (ShiftAmt >= 32) {
29081 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
29082 SDValue Upper =
29083 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
29084 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29085 ShiftAmt - 32, DAG);
29086 if (VT == MVT::v2i64)
29087 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
29088 if (VT == MVT::v4i64)
29089 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29090 {9, 1, 11, 3, 13, 5, 15, 7});
29091 } else {
29092 // SRA upper i32, SRL whole i64 and select lower i32.
29093 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29094 ShiftAmt, DAG);
29095 SDValue Lower =
29096 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
29097 Lower = DAG.getBitcast(ExVT, Lower);
29098 if (VT == MVT::v2i64)
29099 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
29100 if (VT == MVT::v4i64)
29101 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29102 {8, 1, 10, 3, 12, 5, 14, 7});
29103 }
29104 return DAG.getBitcast(VT, Ex);
29105 };
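// Worked example for the lambda above (v2i64, ShiftAmt >= 32 case): shifting
// the element 0x8000000000000000 right arithmetically by 36 takes the upper
// i32 0x80000000, produces the sign splat 0xFFFFFFFF (SRA by 31) and
// 0xF8000000 (SRA by 36 - 32 = 4), and the shuffle recombines these into
// 0xFFFFFFFFF8000000, matching a true 64-bit arithmetic shift by 36.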
29106
29107 // Optimize shl/srl/sra with constant shift amount.
29108 APInt APIntShiftAmt;
29109 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
29110 return SDValue();
29111
29112 // If the shift amount is out of range, return undef.
29113 if (APIntShiftAmt.uge(EltSizeInBits))
29114 return DAG.getUNDEF(VT);
29115
29116 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
29117
29118 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
29119 // Hardware support for vector shifts is sparse which makes us scalarize the
29120 // vector operations in many cases. Also, on sandybridge ADD is faster than
29121 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
29122 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29123 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29124 // must be 0). (add undef, undef) however can be any value. To make this
29125 // safe, we must freeze R to ensure that register allocation uses the same
29126 // register for an undefined value. This ensures that the result will
29127 // still be even and preserves the original semantics.
29128 R = DAG.getFreeze(R);
29129 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29130 }
29131
29132 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
29133 }
29134
29135 // i64 SRA needs to be performed as partial shifts.
29136 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
29137 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
29138 Op.getOpcode() == ISD::SRA)
29139 return ArithmeticShiftRight64(ShiftAmt);
29140
29141 // If we're logical shifting an all-signbits value then we can just perform as
29142 // a mask.
29143 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
29144 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
29145 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
29146 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
29147 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
29148 }
29149
29150 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29151 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
29152 unsigned NumElts = VT.getVectorNumElements();
29153 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29154
29155 // Simple i8 add case
29156 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29157 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29158 // must be 0). (add undef, undef) however can be any value. To make this
29159 // safe, we must freeze R to ensure that register allocation uses the same
29160 // register for an undefined value. This ensures that the result will
29161 // still be even and preserves the original semantics.
29162 R = DAG.getFreeze(R);
29163 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29164 }
29165
29166 // ashr(R, 7) === cmp_slt(R, 0)
29167 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
29168 SDValue Zeros = DAG.getConstant(0, dl, VT);
29169 if (VT.is512BitVector()) {
29170 assert(VT == MVT::v64i8 && "Unexpected element type!");
29171 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
29172 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
29173 }
29174 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
29175 }
29176
29177 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
29178 if (VT == MVT::v16i8 && Subtarget.hasXOP())
29179 return SDValue();
29180
29181 if (Op.getOpcode() == ISD::SHL) {
29182 // Make a large shift.
29183 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
29184 ShiftAmt, DAG);
29185 SHL = DAG.getBitcast(VT, SHL);
29186 // Zero out the rightmost bits.
29187 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
29188 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
29189 }
29190 if (Op.getOpcode() == ISD::SRL) {
29191 // Make a large shift.
29192 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
29193 ShiftAmt, DAG);
29194 SRL = DAG.getBitcast(VT, SRL);
29195 // Zero out the leftmost bits.
29196 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
29197 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
29198 }
29199 if (Op.getOpcode() == ISD::SRA) {
29200 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
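// Worked example (i8, ShiftAmt = 4): lshr(0xF0, 4) = 0x0F and Mask =
// 128 >> 4 = 0x08; xor gives 0x07 and subtracting the mask gives 0xFF = -1,
// which equals the arithmetic shift -16 >>s 4.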
29201 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29202
29203 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
29204 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
29205 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
29206 return Res;
29207 }
29208 llvm_unreachable("Unknown shift opcode.");
29209 }
29210
29211 return SDValue();
29212}
29213
29214static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
29215 const X86Subtarget &Subtarget) {
29216 MVT VT = Op.getSimpleValueType();
29217 SDLoc dl(Op);
29218 SDValue R = Op.getOperand(0);
29219 SDValue Amt = Op.getOperand(1);
29220 unsigned Opcode = Op.getOpcode();
29221 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
29222
29223 int BaseShAmtIdx = -1;
29224 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
29225 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
29226 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
29227 Subtarget, DAG);
29228
29229 // vXi8 shifts - shift as v8i16 + mask result.
29230 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
29231 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
29232 VT == MVT::v64i8) &&
29233 !Subtarget.hasXOP()) {
29234 unsigned NumElts = VT.getVectorNumElements();
29235 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29236 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
29237 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
29238 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
29239
29240 // Create the mask using vXi16 shifts. For shift-rights we need to move
29241 // the upper byte down before splatting the vXi8 mask.
29242 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
29243 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
29244 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
29245 if (Opcode != ISD::SHL)
29246 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
29247 8, DAG);
29248 BitMask = DAG.getBitcast(VT, BitMask);
29249 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
29250 SmallVector<int, 64>(NumElts, 0));
29251
29252 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
29253 DAG.getBitcast(ExtVT, R), BaseShAmt,
29254 BaseShAmtIdx, Subtarget, DAG);
29255 Res = DAG.getBitcast(VT, Res);
29256 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
29257
29258 if (Opcode == ISD::SRA) {
29259 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
29260 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
29261 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
29262 SignMask =
29263 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
29264 BaseShAmtIdx, Subtarget, DAG);
29265 SignMask = DAG.getBitcast(VT, SignMask);
29266 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
29267 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
29268 }
29269 return Res;
29270 }
29271 }
29272 }
29273
29274 return SDValue();
29275}
29276
29277// Convert a shift/rotate left amount to a multiplication scale factor.
29278static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
29279 const X86Subtarget &Subtarget,
29280 SelectionDAG &DAG) {
29281 MVT VT = Amt.getSimpleValueType();
29282 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
29283 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
29284 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
29285 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
29286 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29287 (Subtarget.hasBWI() && VT == MVT::v64i8)))
29288 return SDValue();
29289
29290 MVT SVT = VT.getVectorElementType();
29291 unsigned SVTBits = SVT.getSizeInBits();
29292 unsigned NumElems = VT.getVectorNumElements();
29293
29294 APInt UndefElts;
29295 SmallVector<APInt> EltBits;
29296 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
29297 APInt One(SVTBits, 1);
29298 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
29299 for (unsigned I = 0; I != NumElems; ++I) {
29300 if (UndefElts[I] || EltBits[I].uge(SVTBits))
29301 continue;
29302 uint64_t ShAmt = EltBits[I].getZExtValue();
29303 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
29304 }
29305 return DAG.getBuildVector(VT, dl, Elts);
29306 }
29307
29308 // If the target doesn't support variable shifts, use either FP conversion
29309 // or integer multiplication to avoid shifting each element individually.
29310 if (VT == MVT::v4i32) {
29311 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
29312 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
29313 DAG.getConstant(0x3f800000U, dl, VT));
29314 Amt = DAG.getBitcast(MVT::v4f32, Amt);
29315 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
29316 }
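// The v4i32 path above builds the float 2^Amt directly: (Amt << 23) places the
// amount in the exponent field and adding 0x3f800000 (the bit pattern of 1.0f)
// supplies the bias, so e.g. Amt = 5 yields 0x42000000 = 32.0f and FP_TO_SINT
// returns the scale 1 << 5 = 32.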
29317
29318 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
29319 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
29320 SDValue Z = DAG.getConstant(0, dl, VT);
29321 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
29322 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
29323 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
29324 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
29325 if (Subtarget.hasSSE41())
29326 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29327 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
29328 }
29329
29330 return SDValue();
29331}
29332
29333static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
29334 SelectionDAG &DAG) {
29335 MVT VT = Op.getSimpleValueType();
29336 SDLoc dl(Op);
29337 SDValue R = Op.getOperand(0);
29338 SDValue Amt = Op.getOperand(1);
29339 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29340 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29341
29342 unsigned Opc = Op.getOpcode();
29343 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
29344 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
29345
29346 assert(VT.isVector() && "Custom lowering only for vector shifts!");
29347 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
29348
29349 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
29350 return V;
29351
29352 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
29353 return V;
29354
29355 if (supportedVectorVarShift(VT, Subtarget, Opc))
29356 return Op;
29357
29358 // i64 vector arithmetic shift can be emulated with the transform:
29359 // M = lshr(SIGN_MASK, Amt)
29360 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
29361 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
29362 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
29363 Opc == ISD::SRA) {
29364 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
29365 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
29366 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29367 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
29368 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
29369 return R;
29370 }
29371
29372 // XOP has 128-bit variable logical/arithmetic shifts.
29373 // +ve/-ve Amt = shift left/right.
29374 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
29375 VT == MVT::v8i16 || VT == MVT::v16i8)) {
29376 if (Opc == ISD::SRL || Opc == ISD::SRA) {
29377 SDValue Zero = DAG.getConstant(0, dl, VT);
29378 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
29379 }
29380 if (Opc == ISD::SHL || Opc == ISD::SRL)
29381 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
29382 if (Opc == ISD::SRA)
29383 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
29384 }
29385
29386 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
29387 // shifts per-lane and then shuffle the partial results back together.
29388 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
29389 // Splat the shift amounts so the scalar shifts above will catch it.
29390 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
29391 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
29392 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
29393 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
29394 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
29395 }
29396
29397 // If possible, lower this shift as a sequence of two shifts by
29398 // constant plus a BLENDing shuffle instead of scalarizing it.
29399 // Example:
29400 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
29401 //
29402 // Could be rewritten as:
29403 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
29404 //
29405 // The advantage is that the two shifts from the example would be
29406 // lowered as X86ISD::VSRLI nodes in parallel before blending.
29407 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
29408 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29409 SDValue Amt1, Amt2;
29410 unsigned NumElts = VT.getVectorNumElements();
29411 SmallVector<int, 8> ShuffleMask;
29412 for (unsigned i = 0; i != NumElts; ++i) {
29413 SDValue A = Amt->getOperand(i);
29414 if (A.isUndef()) {
29415 ShuffleMask.push_back(SM_SentinelUndef);
29416 continue;
29417 }
29418 if (!Amt1 || Amt1 == A) {
29419 ShuffleMask.push_back(i);
29420 Amt1 = A;
29421 continue;
29422 }
29423 if (!Amt2 || Amt2 == A) {
29424 ShuffleMask.push_back(i + NumElts);
29425 Amt2 = A;
29426 continue;
29427 }
29428 break;
29429 }
29430
29431 // Only perform this blend if we can perform it without loading a mask.
29432 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
29433 (VT != MVT::v16i16 ||
29434 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
29435 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
29436 canWidenShuffleElements(ShuffleMask))) {
29437 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
29438 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
29439 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
29440 Cst2->getAPIntValue().ult(EltSizeInBits)) {
29441 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29442 Cst1->getZExtValue(), DAG);
29443 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29444 Cst2->getZExtValue(), DAG);
29445 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
29446 }
29447 }
29448 }
29449
29450 // If possible, lower this packed shift into a vector multiply instead of
29451 // expanding it into a sequence of scalar shifts.
29452 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
29453 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
29454 Subtarget.canExtendTo512BW())))
29455 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
29456 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
29457
29458 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
29459 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
29460 if (Opc == ISD::SRL && ConstantAmt &&
29461 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29462 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29463 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29464 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29465 SDValue Zero = DAG.getConstant(0, dl, VT);
29466 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
29467 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
29468 return DAG.getSelect(dl, VT, ZAmt, R, Res);
29469 }
29470 }
29471
29472 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
29473 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
29474 // TODO: Special case handling for shift by 0/1, really we can afford either
29475 // of these cases in pre-SSE41/XOP/AVX512 but not both.
29476 if (Opc == ISD::SRA && ConstantAmt &&
29477 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
29478 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
29479 !Subtarget.hasAVX512()) ||
29480 DAG.isKnownNeverZero(Amt))) {
29481 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29482 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29483 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29484 SDValue Amt0 =
29485 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
29486 SDValue Amt1 =
29487 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
29488 SDValue Sra1 =
29489 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
29490 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
29491 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
29492 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
29493 }
29494 }
29495
29496 // v4i32 Non Uniform Shifts.
29497 // If the shift amount is constant we can shift each lane using the SSE2
29498 // immediate shifts, else we need to zero-extend each lane to the lower i64
29499 // and shift using the SSE2 variable shifts.
29500 // The separate results can then be blended together.
29501 if (VT == MVT::v4i32) {
29502 SDValue Amt0, Amt1, Amt2, Amt3;
29503 if (ConstantAmt) {
29504 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
29505 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
29506 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
29507 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
29508 } else {
29509 // The SSE2 shifts use the lower i64 as the same shift amount for
29510 // all lanes and the upper i64 is ignored. On AVX we're better off
29511 // just zero-extending, but for SSE just duplicating the top 16-bits is
29512 // cheaper and has the same effect for out of range values.
29513 if (Subtarget.hasAVX()) {
29514 SDValue Z = DAG.getConstant(0, dl, VT);
29515 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
29516 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
29517 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
29518 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
29519 } else {
29520 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
29521 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29522 {4, 5, 6, 7, -1, -1, -1, -1});
29523 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
29524 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
29525 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
29526 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
29527 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
29528 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
29529 }
29530 }
29531
29532 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
29533 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
29534 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
29535 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
29536 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
29537
29538 // Merge the shifted lane results optimally with/without PBLENDW.
29539 // TODO - ideally shuffle combining would handle this.
29540 if (Subtarget.hasSSE41()) {
29541 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
29542 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
29543 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
29544 }
29545 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
29546 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
29547 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
29548 }
29549
29550 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
29551 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
29552 // make the existing SSE solution better.
29553 // NOTE: We honor the preferred vector width before promoting to 512-bits.
29554 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
29555 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
29556 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
29557 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
29558 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
29559 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
29560 "Unexpected vector type");
29561 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
29562 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
29563 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29564 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
29565 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
29566 return DAG.getNode(ISD::TRUNCATE, dl, VT,
29567 DAG.getNode(Opc, dl, ExtVT, R, Amt));
29568 }
29569
29570 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
29571 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
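// Worked example (logical shift right by 3): the amount becomes the scale
// 1 << (8 - 3) = 32; a byte value of 200 widened to i16 and multiplied gives
// 200 * 32 = 6400 = 0x1900, whose high byte 0x19 = 25 equals 200 >> 3.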
29572 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
29573 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29574 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29575 !Subtarget.hasXOP()) {
29576 int NumElts = VT.getVectorNumElements();
29577 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
29578
29579 // Extend constant shift amount to vXi16 (it doesn't matter if the type
29580 // isn't legal).
29581 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29582 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
29583 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
29584 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
29585 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
29586 "Constant build vector expected");
29587
29588 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
29589 bool IsSigned = Opc == ISD::SRA;
29590 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
29591 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
29592 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
29593 return DAG.getZExtOrTrunc(R, dl, VT);
29594 }
29595
29596 SmallVector<SDValue, 16> LoAmt, HiAmt;
29597 for (int i = 0; i != NumElts; i += 16) {
29598 for (int j = 0; j != 8; ++j) {
29599 LoAmt.push_back(Amt.getOperand(i + j));
29600 HiAmt.push_back(Amt.getOperand(i + j + 8));
29601 }
29602 }
29603
29604 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
29605 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
29606 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
29607
29608 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
29609 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
29610 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
29611 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
29612 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
29613 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
29614 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
29615 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
29616 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
29617 }
29618
29619 if (VT == MVT::v16i8 ||
29620 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
29621 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
29622 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
29623
29624 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29625 if (VT.is512BitVector()) {
29626 // On AVX512BW targets we make use of the fact that VSELECT lowers
29627 // to a masked blend which selects bytes based just on the sign bit
29628 // extracted to a mask.
29629 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
29630 V0 = DAG.getBitcast(VT, V0);
29631 V1 = DAG.getBitcast(VT, V1);
29632 Sel = DAG.getBitcast(VT, Sel);
29633 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
29634 ISD::SETGT);
29635 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
29636 } else if (Subtarget.hasSSE41()) {
29637 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29638 // on the sign bit.
29639 V0 = DAG.getBitcast(VT, V0);
29640 V1 = DAG.getBitcast(VT, V1);
29641 Sel = DAG.getBitcast(VT, Sel);
29642 return DAG.getBitcast(SelVT,
29643 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
29644 }
29645 // On pre-SSE41 targets we test for the sign bit by comparing to
29646 // zero - a negative value will set all bits of the lanes to true
29647 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29648 SDValue Z = DAG.getConstant(0, dl, SelVT);
29649 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
29650 return DAG.getSelect(dl, SelVT, C, V0, V1);
29651 };
29652
29653 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29654 // We can safely do this using i16 shifts as we're only interested in
29655 // the 3 lower bits of each byte.
29656 Amt = DAG.getBitcast(ExtVT, Amt);
29657 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
29658 Amt = DAG.getBitcast(VT, Amt);
29659
29660 if (Opc == ISD::SHL || Opc == ISD::SRL) {
29661 // r = VSELECT(r, shift(r, 4), a);
29662 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
29663 R = SignBitSelect(VT, Amt, M, R);
29664
29665 // a += a
29666 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29667
29668 // r = VSELECT(r, shift(r, 2), a);
29669 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
29670 R = SignBitSelect(VT, Amt, M, R);
29671
29672 // a += a
29673 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29674
29675 // return VSELECT(r, shift(r, 1), a);
29676 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
29677 R = SignBitSelect(VT, Amt, M, R);
29678 return R;
29679 }
29680
29681 if (Opc == ISD::SRA) {
29682 // For SRA we need to unpack each byte to the higher byte of a i16 vector
29683 // so we can correctly sign extend. We don't care what happens to the
29684 // lower byte.
29685 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29686 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29687 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
29688 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
29689 ALo = DAG.getBitcast(ExtVT, ALo);
29690 AHi = DAG.getBitcast(ExtVT, AHi);
29691 RLo = DAG.getBitcast(ExtVT, RLo);
29692 RHi = DAG.getBitcast(ExtVT, RHi);
29693
29694 // r = VSELECT(r, shift(r, 4), a);
29695 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
29696 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
29697 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29698 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29699
29700 // a += a
29701 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29702 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29703
29704 // r = VSELECT(r, shift(r, 2), a);
29705 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
29706 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
29707 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29708 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29709
29710 // a += a
29711 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29712 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29713
29714 // r = VSELECT(r, shift(r, 1), a);
29715 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
29716 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
29717 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29718 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29719
29720 // Logical shift the result back to the lower byte, leaving a zero upper
29721 // byte meaning that we can safely pack with PACKUSWB.
29722 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
29723 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
29724 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
29725 }
29726 }
29727
29728 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
29729 MVT ExtVT = MVT::v8i32;
29730 SDValue Z = DAG.getConstant(0, dl, VT);
29731 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
29732 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
29733 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
29734 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
29735 ALo = DAG.getBitcast(ExtVT, ALo);
29736 AHi = DAG.getBitcast(ExtVT, AHi);
29737 RLo = DAG.getBitcast(ExtVT, RLo);
29738 RHi = DAG.getBitcast(ExtVT, RHi);
29739 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
29740 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
29741 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
29742 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
29743 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29744 }
29745
29746 if (VT == MVT::v8i16) {
29747 // If we have a constant shift amount, the non-SSE41 path is best as
29748 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
29749 bool UseSSE41 = Subtarget.hasSSE41() &&
29750 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29751
29752 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
29753 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
29754 // the sign bit.
29755 if (UseSSE41) {
29756 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
29757 V0 = DAG.getBitcast(ExtVT, V0);
29758 V1 = DAG.getBitcast(ExtVT, V1);
29759 Sel = DAG.getBitcast(ExtVT, Sel);
29760 return DAG.getBitcast(
29761 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
29762 }
29763 // On pre-SSE41 targets we splat the sign bit - a negative value will
29764 // set all bits of the lanes to true and VSELECT uses that in
29765 // its OR(AND(V0,C),AND(V1,~C)) lowering.
29766 SDValue C =
29767 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
29768 return DAG.getSelect(dl, VT, C, V0, V1);
29769 };
29770
29771 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
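// Worked example: for Amt = 5 (0b0101), the shift below places amount bit 3
// (0) in each lane's sign bit, so the shift-by-8 stage is skipped; each
// "a += a" exposes the next bit, selecting the shift-by-4 and shift-by-1
// stages but not the shift-by-2 stage, for a total shift of 5.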
29772 if (UseSSE41) {
29773 // On SSE41 targets we need to replicate the shift mask in both
29774 // bytes for PBLENDVB.
29775 Amt = DAG.getNode(
29776 ISD::OR, dl, VT,
29777 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
29778 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
29779 } else {
29780 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
29781 }
29782
29783 // r = VSELECT(r, shift(r, 8), a);
29784 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
29785 R = SignBitSelect(Amt, M, R);
29786
29787 // a += a
29788 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29789
29790 // r = VSELECT(r, shift(r, 4), a);
29791 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
29792 R = SignBitSelect(Amt, M, R);
29793
29794 // a += a
29795 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29796
29797 // r = VSELECT(r, shift(r, 2), a);
29798 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
29799 R = SignBitSelect(Amt, M, R);
29800
29801 // a += a
29802 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29803
29804 // return VSELECT(r, shift(r, 1), a);
29805 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
29806 R = SignBitSelect(Amt, M, R);
29807 return R;
29808 }
29809
29810 // Decompose 256-bit shifts into 128-bit shifts.
29811 if (VT.is256BitVector())
29812 return splitVectorIntBinary(Op, DAG, dl);
29813
29814 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29815 return splitVectorIntBinary(Op, DAG, dl);
29816
29817 return SDValue();
29818}
29819
29820static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
29821 SelectionDAG &DAG) {
29822 MVT VT = Op.getSimpleValueType();
29823 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
29824 "Unexpected funnel shift opcode!");
29825
29826 SDLoc DL(Op);
29827 SDValue Op0 = Op.getOperand(0);
29828 SDValue Op1 = Op.getOperand(1);
29829 SDValue Amt = Op.getOperand(2);
29830 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29831 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
29832
29833 if (VT.isVector()) {
29834 APInt APIntShiftAmt;
29835 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
29836 unsigned NumElts = VT.getVectorNumElements();
29837
29838 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
29839 if (IsFSHR)
29840 std::swap(Op0, Op1);
29841
29842 if (IsCstSplat) {
29843 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29844 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
29845 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
29846 {Op0, Op1, Imm}, DAG, Subtarget);
29847 }
29848 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
29849 {Op0, Op1, Amt}, DAG, Subtarget);
29850 }
29851 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
29852 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
29853 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
29854 "Unexpected funnel shift type!");
29855
29856 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
29857 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
29858 if (IsCstSplat) {
29859 // TODO: Can't use generic expansion as UNDEF amt elements can be
29860 // converted to other values when folded to shift amounts, losing the
29861 // splat.
29862 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29863 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
29864 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
29865 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
29866
29867 if (EltSizeInBits == 8 && ShXAmt > 1 &&
29868 (Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT))) {
29869 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
29870 // bit-select - lower using vXi16 shifts and then perform the bitmask at
29871 // the original vector width to handle cases where we split.
29872 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29873 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
29874 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
29875 SDValue ShX =
29876 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
29877 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
29878 SDValue ShY =
29879 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
29880 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
29881 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
29882 DAG.getConstant(MaskX, DL, VT));
29883 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
29884 DAG.getConstant(MaskY, DL, VT));
29885 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
29886 }
29887
29888 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
29889 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
29890 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
29891 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
29892 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
29893 }
29894
29895 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
29896 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
29897 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
29898
29899 // Constant vXi16 funnel shifts can be efficiently handled by default.
29900 if (IsCst && EltSizeInBits == 16)
29901 return SDValue();
29902
29903 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
29904 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
29905 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
29906
29907 // Split 256-bit integers on XOP/pre-AVX2 targets.
29908 // Split 512-bit integers on non 512-bit BWI targets.
29909 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
29910 !Subtarget.hasAVX2())) ||
29911 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
29912 EltSizeInBits < 32)) {
29913 // Pre-mask the amount modulo using the wider vector.
29914 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
29915 return splitVectorOp(Op, DAG, DL);
29916 }
29917
29918 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
29919 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
29920 int ScalarAmtIdx = -1;
29921 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
29922 // Uniform vXi16 funnel shifts can be efficiently handled by default.
29923 if (EltSizeInBits == 16)
29924 return SDValue();
29925
29926 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
29927 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
29928 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
29929 ScalarAmtIdx, Subtarget, DAG);
29930 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
29931 ScalarAmtIdx, Subtarget, DAG);
29932 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
29933 }
29934 }
29935
29936 MVT WideSVT = MVT::getIntegerVT(
29937 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
29938 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
29939
29940 // If per-element shifts are legal, fallback to generic expansion.
29941 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
29942 return SDValue();
29943
29944 // Attempt to fold as:
29945 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
29946 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
29947 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
29948 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
29949 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
29950 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
29951 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
29952 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
29953 EltSizeInBits, DAG);
29954 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
29955 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
29956 if (!IsFSHR)
29957 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
29958 EltSizeInBits, DAG);
29959 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
29960 }
29961
29962 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
29963 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
29964 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
29965 SDValue Z = DAG.getConstant(0, DL, VT);
29966 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
29967 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
29968 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
29969 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
29970 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
29971 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
29972 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
29973 }
29974
29975 // Fallback to generic expansion.
29976 return SDValue();
29977 }
29978 assert(
29979 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
29980 "Unexpected funnel shift type!");
29981
29982 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
29983 bool OptForSize = DAG.shouldOptForSize();
29984 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
29985
29986 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
29987 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
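// Worked example (i8): fshl(0xAB, 0xCD, 3) concatenates to 0xABCD, shifts
// left by 3 to 0x55E68, then shifts right by 8 and truncates to 0x5E, which
// matches (0xAB << 3) | (0xCD >> 5).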
29988 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
29989 !isa<ConstantSDNode>(Amt)) {
29990 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
29991 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
29992 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
29993 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
29994 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
29995 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
29996 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
29997 if (IsFSHR) {
29998 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
29999 } else {
30000 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
30001 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
30002 }
30003 return DAG.getZExtOrTrunc(Res, DL, VT);
30004 }
30005
30006 if (VT == MVT::i8 || ExpandFunnel)
30007 return SDValue();
30008
30009 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
30010 if (VT == MVT::i16) {
30011 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
30012 DAG.getConstant(15, DL, Amt.getValueType()));
30013 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
30014 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
30015 }
30016
30017 return Op;
30018}
30019
30020static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
30021 SelectionDAG &DAG) {
30022 MVT VT = Op.getSimpleValueType();
30023 assert(VT.isVector() && "Custom lowering only for vector rotates!");
30024
30025 SDLoc DL(Op);
30026 SDValue R = Op.getOperand(0);
30027 SDValue Amt = Op.getOperand(1);
30028 unsigned Opcode = Op.getOpcode();
30029 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30030 int NumElts = VT.getVectorNumElements();
30031 bool IsROTL = Opcode == ISD::ROTL;
30032
30033 // Check for constant splat rotation amount.
30034 APInt CstSplatValue;
30035 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
30036
30037 // Check for splat rotate by zero.
30038 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
30039 return R;
30040
30041 // AVX512 implicitly uses modulo rotation amounts.
30042 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
30043 // Attempt to rotate by immediate.
30044 if (IsCstSplat) {
30045 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
30046 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30047 return DAG.getNode(RotOpc, DL, VT, R,
30048 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30049 }
30050
30051 // Else, fall-back on VPROLV/VPRORV.
30052 return Op;
30053 }
30054
30055 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
30056 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
30057 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30058 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30059 }
30060
30061 SDValue Z = DAG.getConstant(0, DL, VT);
30062
30063 if (!IsROTL) {
30064 // If the ISD::ROTR amount is constant, we're always better converting to
30065 // ISD::ROTL.
30066 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
30067 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
30068
30069 // XOP targets always prefer ISD::ROTL.
30070 if (Subtarget.hasXOP())
30071 return DAG.getNode(ISD::ROTL, DL, VT, R,
30072 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
30073 }
30074
30075 // Split 256-bit integers on XOP/pre-AVX2 targets.
30076 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
30077 return splitVectorIntBinary(Op, DAG, DL);
30078
30079 // XOP has 128-bit vector variable + immediate rotates.
30080 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
30081 // XOP implicitly uses modulo rotation amounts.
30082 if (Subtarget.hasXOP()) {
30083 assert(IsROTL && "Only ROTL expected");
30084 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
30085
30086 // Attempt to rotate by immediate.
30087 if (IsCstSplat) {
30088 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30089 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
30090 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30091 }
30092
30093 // Use general rotate by variable (per-element).
30094 return Op;
30095 }
30096
30097 // Rotate by a uniform constant - expand back to shifts.
30098 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
30099 // to other values when folded to shift amounts, losing the splat.
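// Worked example (i8): rotl(0x93, 3) expands to ((0x93 << 3) | (0x93 >> 5))
// & 0xFF = 0x98 | 0x04 = 0x9C.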
30100 if (IsCstSplat) {
30101 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30102 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
30103 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
30104 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
30105 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
30106 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
30107 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
30108 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
30109 }
30110
30111 // Split 512-bit integers on non 512-bit BWI targets.
30112 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
30113 return splitVectorIntBinary(Op, DAG, DL);
30114
30115 assert(
30116 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
30117 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
30118 Subtarget.hasAVX2()) ||
30119 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
30120 "Only vXi32/vXi16/vXi8 vector rotates supported");
30121
30122 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30123 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30124
30125 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30126 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30127
30128 // Attempt to fold as unpack(x,x) << zext(splat(y)):
30129 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30130 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30131 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
30132 int BaseRotAmtIdx = -1;
30133 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
30134 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
30135 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30136 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30137 }
30138 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
30139 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30140 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30141 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
30142 BaseRotAmtIdx, Subtarget, DAG);
30143 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
30144 BaseRotAmtIdx, Subtarget, DAG);
30145 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30146 }
30147 }
30148
30149 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30150 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
30151
30152 // Attempt to fold as unpack(x,x) << zext(y):
30153 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30154 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30155 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
30156 if (!(ConstantAmt && EltSizeInBits != 8) &&
30157 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
30158 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
30159 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30160 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30161 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30162 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30163 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30164 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30165 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30166 }
30167
30168 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
30169 // the amount bit.
30170 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
30171 if (EltSizeInBits == 8) {
30172 MVT WideVT =
30173 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
30174
30175 // Attempt to fold as:
30176 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
30177 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
30178 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30179 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30180 // If we're rotating by constant, just use default promotion.
30181 if (ConstantAmt)
30182 return SDValue();
30183 // See if we can perform this by widening to vXi16 or vXi32.
30184 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
30185 R = DAG.getNode(
30186 ISD::OR, DL, WideVT, R,
30187 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
30188 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30189 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
30190 if (IsROTL)
30191 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
30192 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
30193 }
30194
30195 // We don't need ModuloAmt here as we just peek at individual bits.
30196 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30197 if (Subtarget.hasSSE41()) {
30198 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30199 // on the sign bit.
30200 V0 = DAG.getBitcast(VT, V0);
30201 V1 = DAG.getBitcast(VT, V1);
30202 Sel = DAG.getBitcast(VT, Sel);
30203 return DAG.getBitcast(SelVT,
30204 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
30205 }
30206 // On pre-SSE41 targets we test for the sign bit by comparing to
30207 // zero - a negative value will set all bits of the lanes to true
30208 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30209 SDValue Z = DAG.getConstant(0, DL, SelVT);
30210 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
30211 return DAG.getSelect(DL, SelVT, C, V0, V1);
30212 };
30213
30214 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
30215 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
30216 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30217 IsROTL = true;
30218 }
30219
30220 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
30221 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
30222
30223 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30224 // We can safely do this using i16 shifts as we're only interested in
30225 // the 3 lower bits of each byte.
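// For example, with a byte amount a = 5 (0b101): a << 5 = 0b10100000, so the
// sign bit selects the rot-by-4 stage; after a += a the sign bit is clear and
// rot-by-2 is skipped; after another a += a it is set again and rot-by-1 is
// taken, for a total rotation of 5.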
30226 Amt = DAG.getBitcast(ExtVT, Amt);
30227 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
30228 Amt = DAG.getBitcast(VT, Amt);
30229
30230 // r = VSELECT(r, rot(r, 4), a);
30231 SDValue M;
30232 M = DAG.getNode(
30233 ISD::OR, DL, VT,
30234 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
30235 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
30236 R = SignBitSelect(VT, Amt, M, R);
30237
30238 // a += a
30239 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30240
30241 // r = VSELECT(r, rot(r, 2), a);
30242 M = DAG.getNode(
30243 ISD::OR, DL, VT,
30244 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
30245 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
30246 R = SignBitSelect(VT, Amt, M, R);
30247
30248 // a += a
30249 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30250
30251 // return VSELECT(r, rot(r, 1), a);
30252 M = DAG.getNode(
30253 ISD::OR, DL, VT,
30254 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
30255 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
30256 return SignBitSelect(VT, Amt, M, R);
30257 }
30258
30259 bool IsSplatAmt = DAG.isSplatValue(Amt);
30260 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
30261 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
30262
30263 // Fallback for splats + all supported variable shifts.
30264 // Also fall back for non-constant amounts on AVX2 vXi16.
30265 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
30266 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30267 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
30268 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
30269 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
30270 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
30271 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
30272 }
30273
30274 // Everything below assumes ISD::ROTL.
30275 if (!IsROTL) {
30276 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30277 IsROTL = true;
30278 }
30279
30280 // ISD::ROT* uses modulo rotate amounts.
30281 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30282
30283 assert(IsROTL && "Only ROTL supported");
30284
30285 // As with shifts, attempt to convert the rotation amount to a multiplication
30286 // factor, fallback to general expansion.
30287 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
30288 if (!Scale)
30289 return SDValue();
30290
30291 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
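// For example, rotl16(0x1234, 4) uses Scale = 16: the low half of the product
// is 0x2340, MULHU gives 0x0001, and OR'ing them yields 0x2341.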
30292 if (EltSizeInBits == 16) {
30293 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
30294 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
30295 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
30296 }
30297
30298 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
30299 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
30300 // that can then be OR'd with the lower 32-bits.
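// For example, with x = 0x80000001 and Scale = 2 the 64-bit PMULUDQ product is
// 0x100000002: the low 32 bits are x << 1 and the high 32 bits are x >> 31,
// so OR'ing the interleaved halves gives 0x00000003 == rotl32(x, 1).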
30301 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
30302 static const int OddMask[] = {1, -1, 3, -1};
30303 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
30304 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
30305
30306 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30307 DAG.getBitcast(MVT::v2i64, R),
30308 DAG.getBitcast(MVT::v2i64, Scale));
30309 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30310 DAG.getBitcast(MVT::v2i64, R13),
30311 DAG.getBitcast(MVT::v2i64, Scale13));
30312 Res02 = DAG.getBitcast(VT, Res02);
30313 Res13 = DAG.getBitcast(VT, Res13);
30314
30315 return DAG.getNode(ISD::OR, DL, VT,
30316 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
30317 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
30318}
30319
30320/// Returns true if the operand type is exactly twice the native width, and
30321/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
30322/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
30323/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
30324bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
30325 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
30326
30327 if (OpWidth == 64)
30328 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
30329 if (OpWidth == 128)
30330 return Subtarget.canUseCMPXCHG16B();
30331
30332 return false;
30333}
30334
30335 TargetLowering::AtomicExpansionKind
30336 X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
30337 Type *MemType = SI->getValueOperand()->getType();
30338
30339 bool NoImplicitFloatOps =
30340 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30341 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30342 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30343 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30344 return AtomicExpansionKind::None;
30345
30346 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
30347 : AtomicExpansionKind::None;
30348 }
30349
30350// Note: this turns large loads into lock cmpxchg8b/16b.
30351// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
30352 TargetLowering::AtomicExpansionKind
30353 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
30354 Type *MemType = LI->getType();
30355
30356 // If this is a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
30357 // can use movq to do the load. If we have X87 we can load into an 80-bit
30358 // X87 register and store it to a stack temporary.
30359 bool NoImplicitFloatOps =
30360 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30361 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30362 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30363 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30364 return AtomicExpansionKind::None;
30365
30366 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30367 : AtomicExpansionKind::None;
30368 }
30369
30370 enum BitTestKind : unsigned {
30371 UndefBit,
30372 ConstantBit,
30373 NotConstantBit,
30374 ShiftBit,
30375 NotShiftBit
30376 };
30377
30378static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
30379 using namespace llvm::PatternMatch;
30380 BitTestKind BTK = UndefBit;
30381 auto *C = dyn_cast<ConstantInt>(V);
30382 if (C) {
30383 // Check if V is a power of 2 or NOT power of 2.
30384 if (isPowerOf2_64(C->getZExtValue()))
30385 BTK = ConstantBit;
30386 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
30387 BTK = NotConstantBit;
30388 return {V, BTK};
30389 }
30390
30391 // Check if V is some power of 2 pattern known to be non-zero
30392 auto *I = dyn_cast<Instruction>(V);
30393 if (I) {
30394 bool Not = false;
30395 // Check if we have a NOT
30396 Value *PeekI;
30397 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
30398 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
30399 Not = true;
30400 I = dyn_cast<Instruction>(PeekI);
30401
30402 // If I is constant, it will fold and we can evaluate later. If it's an
30403 // argument or something of that nature, we can't analyze.
30404 if (I == nullptr)
30405 return {nullptr, UndefBit};
30406 }
30407 // We can only use 1 << X without more sophisticated analysis. C << X where
30408 // C is a power of 2 but not 1 can result in zero which cannot be translated
30409 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
30410 if (I->getOpcode() == Instruction::Shl) {
30411 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
30412 // -X` and some other provable power of 2 patterns that we can use CTZ on
30413 // may be profitable.
30414 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
30415 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
30416 // be provably a non-zero power of 2.
30417 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
30418 // transformable to bittest.
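// For example, in i32 (2 << 31) == 0 and (1 >> 1) == 0, so only 1 << X is
// guaranteed to remain a single set bit.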
30419 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
30420 if (!ShiftVal)
30421 return {nullptr, UndefBit};
30422 if (ShiftVal->equalsInt(1))
30423 BTK = Not ? NotShiftBit : ShiftBit;
30424
30425 if (BTK == UndefBit)
30426 return {nullptr, UndefBit};
30427
30428 Value *BitV = I->getOperand(1);
30429
30430 Value *AndOp;
30431 const APInt *AndC;
30432 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
30433 // Read past a shiftmask instruction to find count
30434 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
30435 BitV = AndOp;
30436 }
30437 return {BitV, BTK};
30438 }
30439 }
30440 return {nullptr, UndefBit};
30441}
30442
30443 TargetLowering::AtomicExpansionKind
30444 X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
30445 using namespace llvm::PatternMatch;
30446 // If the atomicrmw's result isn't actually used, we can just add a "lock"
30447 // prefix to a normal instruction for these operations.
30448 if (AI->use_empty())
30449 return AtomicExpansionKind::None;
30450
30451 if (AI->getOperation() == AtomicRMWInst::Xor) {
30452 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
30453 // preferable to both `cmpxchg` and `btc`.
30454 if (match(AI->getOperand(1), m_SignMask()))
30455 return AtomicExpansionKind::None;
30456 }
30457
30458 // If the atomicrmw's result is used by a single bit AND, we may use
30459 // bts/btr/btc instruction for these operations.
30460 // Note: InstCombinePass can cause a de-optimization here. It replaces the
30461 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
30462 // (depending on CC). This pattern can only use bts/btr/btc but we don't
30463 // detect it.
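// A typical shape that maps to a bit-test instruction (illustrative IR):
//   %m   = shl i32 1, %b
//   %old = atomicrmw or ptr %p, i32 %m seq_cst
//   %bit = and i32 %old, %m          ; becomes lock bts plus a test of the old bit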
30464 Instruction *I = AI->user_back();
30465 auto BitChange = FindSingleBitChange(AI->getValOperand());
30466 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
30467 I->getOpcode() != Instruction::And ||
30468 AI->getType()->getPrimitiveSizeInBits() == 8 ||
30469 AI->getParent() != I->getParent())
30470 return AtomicExpansionKind::CmpXChg;
30471
30472 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
30473
30474 // This is a redundant AND; it should get cleaned up elsewhere.
30475 if (AI == I->getOperand(OtherIdx))
30476 return AtomicExpansionKind::CmpXChg;
30477
30478 // The following instruction must be an AND of a single bit.
30479 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
30480 auto *C1 = cast<ConstantInt>(AI->getValOperand());
30481 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
30482 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
30483 return AtomicExpansionKind::CmpXChg;
30484 }
30485 if (AI->getOperation() == AtomicRMWInst::And) {
30486 return ~C1->getValue() == C2->getValue()
30487 ? AtomicExpansionKind::BitTestIntrinsic
30488 : AtomicExpansionKind::CmpXChg;
30489 }
30490 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
30491 : AtomicExpansionKind::CmpXChg;
30492 }
30493
30494 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
30495
30496 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
30497 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
30498 return AtomicExpansionKind::CmpXChg;
30499
30500 assert(BitChange.first != nullptr && BitTested.first != nullptr);
30501
30502 // If shift amounts are not the same we can't use BitTestIntrinsic.
30503 if (BitChange.first != BitTested.first)
30504 return AtomicExpansionKind::CmpXChg;
30505
30506 // For atomic AND, the operand must clear exactly one bit (~(1 << X)) and the
30507 // following AND must test that same, cleared bit.
30508 if (AI->getOperation() == AtomicRMWInst::And)
30509 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
30510 ? AtomicExpansionKind::BitTestIntrinsic
30511 : AtomicExpansionKind::CmpXChg;
30512
30513 // For atomic XOR/OR, the operation must set/toggle and test the same bit.
30514 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
30515 ? AtomicExpansionKind::BitTestIntrinsic
30516 : AtomicExpansionKind::CmpXChg;
30517 }
30518
30519void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
30520 IRBuilder<> Builder(AI);
30521 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30522 Intrinsic::ID IID_C;
30523 Intrinsic::ID IID_I;
30524 switch (AI->getOperation()) {
30525 default:
30526 llvm_unreachable("Unknown atomic operation");
30527 case AtomicRMWInst::Or:
30528 IID_C = Intrinsic::x86_atomic_bts;
30529 IID_I = Intrinsic::x86_atomic_bts_rm;
30530 break;
30531 case AtomicRMWInst::Xor:
30532 IID_C = Intrinsic::x86_atomic_btc;
30533 IID_I = Intrinsic::x86_atomic_btc_rm;
30534 break;
30535 case AtomicRMWInst::And:
30536 IID_C = Intrinsic::x86_atomic_btr;
30537 IID_I = Intrinsic::x86_atomic_btr_rm;
30538 break;
30539 }
30540 Instruction *I = AI->user_back();
30541 LLVMContext &Ctx = AI->getContext();
30542 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30543 PointerType::getUnqual(Ctx));
30544 Function *BitTest = nullptr;
30545 Value *Result = nullptr;
30546 auto BitTested = FindSingleBitChange(AI->getValOperand());
30547 assert(BitTested.first != nullptr);
30548
30549 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
30550 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
30551
30552 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
30553
30554 unsigned Imm = llvm::countr_zero(C->getZExtValue());
30555 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
30556 } else {
30557 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
30558
30559 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
30560
30561 Value *SI = BitTested.first;
30562 assert(SI != nullptr);
30563
30564 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we need
30565 // to mask it.
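// For example, for an i32 access BitPos = SI & 31, so a count of 37 ends up
// testing/updating bit 5.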
30566 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
30567 Value *BitPos =
30568 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
30569 // Todo(1): In many cases it may be provable that SI is less than
30570 // ShiftBits in which case this mask is unnecessary
30571 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
30572 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
30573 // favor of just a raw BT{S|R|C}.
30574
30575 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
30576 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
30577
30578 // If the result is only used for zero/non-zero status then we don't need to
30579 // shift the value back. Otherwise do so.
30580 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
30581 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
30582 if (ICmp->isEquality()) {
30583 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
30584 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
30585 if (C0 || C1) {
30586 assert(C0 == nullptr || C1 == nullptr);
30587 if ((C0 ? C0 : C1)->isZero())
30588 continue;
30589 }
30590 }
30591 }
30592 Result = Builder.CreateShl(Result, BitPos);
30593 break;
30594 }
30595 }
30596
30597 I->replaceAllUsesWith(Result);
30598 I->eraseFromParent();
30599 AI->eraseFromParent();
30600}
30601
30602 static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
30603 using namespace llvm::PatternMatch;
30604 if (!AI->hasOneUse())
30605 return false;
30606
30607 Value *Op = AI->getOperand(1);
30608 ICmpInst::Predicate Pred;
30609 Instruction *I = AI->user_back();
30610 AtomicRMWInst::BinOp Opc = AI->getOperation();
30611 if (Opc == AtomicRMWInst::Add) {
30612 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
30613 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30614 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
30615 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30616 return Pred == CmpInst::ICMP_SLT;
30617 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30618 return Pred == CmpInst::ICMP_SGT;
30619 }
30620 return false;
30621 }
30622 if (Opc == AtomicRMWInst::Sub) {
30623 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30624 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30625 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
30626 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30627 return Pred == CmpInst::ICMP_SLT;
30628 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30629 return Pred == CmpInst::ICMP_SGT;
30630 }
30631 return false;
30632 }
30633 if ((Opc == AtomicRMWInst::Or &&
30634 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
30635 (Opc == AtomicRMWInst::And &&
30636 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
30637 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30638 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
30639 Pred == CmpInst::ICMP_SLT;
30640 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30641 return Pred == CmpInst::ICMP_SGT;
30642 return false;
30643 }
30644 if (Opc == AtomicRMWInst::Xor) {
30645 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30646 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30647 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
30648 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30649 return Pred == CmpInst::ICMP_SLT;
30650 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30651 return Pred == CmpInst::ICMP_SGT;
30652 }
30653 return false;
30654 }
30655
30656 return false;
30657}
30658
30659void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
30660 AtomicRMWInst *AI) const {
30661 IRBuilder<> Builder(AI);
30662 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30663 Instruction *TempI = nullptr;
30664 LLVMContext &Ctx = AI->getContext();
30665 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
30666 if (!ICI) {
30667 TempI = AI->user_back();
30668 assert(TempI->hasOneUse() && "Must have one use");
30669 ICI = cast<ICmpInst>(TempI->user_back());
30670 }
30671 X86::CondCode CC = X86::COND_INVALID;
30672 ICmpInst::Predicate Pred = ICI->getPredicate();
30673 switch (Pred) {
30674 default:
30675 llvm_unreachable("Not supported Pred");
30676 case CmpInst::ICMP_EQ:
30677 CC = X86::COND_E;
30678 break;
30679 case CmpInst::ICMP_NE:
30680 CC = X86::COND_NE;
30681 break;
30682 case CmpInst::ICMP_SLT:
30683 CC = X86::COND_S;
30684 break;
30685 case CmpInst::ICMP_SGT:
30686 CC = X86::COND_NS;
30687 break;
30688 }
30689 Intrinsic::ID IID;
30690 switch (AI->getOperation()) {
30691 default:
30692 llvm_unreachable("Unknown atomic operation");
30693 case AtomicRMWInst::Add:
30694 IID = Intrinsic::x86_atomic_add_cc;
30695 break;
30696 case AtomicRMWInst::Sub:
30697 IID = Intrinsic::x86_atomic_sub_cc;
30698 break;
30699 case AtomicRMWInst::Or:
30700 IID = Intrinsic::x86_atomic_or_cc;
30701 break;
30702 case AtomicRMWInst::And:
30703 IID = Intrinsic::x86_atomic_and_cc;
30704 break;
30705 case AtomicRMWInst::Xor:
30706 IID = Intrinsic::x86_atomic_xor_cc;
30707 break;
30708 }
30709 Function *CmpArith =
30710 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
30711 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30712 PointerType::getUnqual(Ctx));
30713 Value *Call = Builder.CreateCall(
30714 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
30715 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
30716 ICI->replaceAllUsesWith(Result);
30717 ICI->eraseFromParent();
30718 if (TempI)
30719 TempI->eraseFromParent();
30720 AI->eraseFromParent();
30721}
30722
30723 TargetLowering::AtomicExpansionKind
30724 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
30725 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30726 Type *MemType = AI->getType();
30727
30728 // If the operand is too big, we must see if cmpxchg8/16b is available
30729 // and default to library calls otherwise.
30730 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
30731 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30732 : AtomicExpansionKind::None;
30733 }
30734
30735 AtomicRMWInst::BinOp Op = AI->getOperation();
30736 switch (Op) {
30737 case AtomicRMWInst::Xchg:
30738 return AtomicExpansionKind::None;
30739 case AtomicRMWInst::Add:
30740 case AtomicRMWInst::Sub:
30741 if (shouldExpandCmpArithRMWInIR(AI))
30742 return AtomicExpansionKind::CmpArithIntrinsic;
30743 // It's better to use xadd, xsub or xchg for these in other cases.
30744 return AtomicExpansionKind::None;
30745 case AtomicRMWInst::Or:
30746 case AtomicRMWInst::And:
30747 case AtomicRMWInst::Xor:
30748 if (shouldExpandCmpArithRMWInIR(AI))
30749 return AtomicExpansionKind::CmpArithIntrinsic;
30750 return shouldExpandLogicAtomicRMWInIR(AI);
30751 case AtomicRMWInst::Nand:
30752 case AtomicRMWInst::Max:
30753 case AtomicRMWInst::Min:
30754 case AtomicRMWInst::UMax:
30755 case AtomicRMWInst::UMin:
30756 case AtomicRMWInst::FAdd:
30757 case AtomicRMWInst::FSub:
30758 case AtomicRMWInst::FMax:
30759 case AtomicRMWInst::FMin:
30760 case AtomicRMWInst::UIncWrap:
30761 case AtomicRMWInst::UDecWrap:
30762 default:
30763 // These always require a non-trivial set of data operations on x86. We must
30764 // use a cmpxchg loop.
30765 return AtomicExpansionKind::CmpXChg;
30766 }
30767}
30768
30769LoadInst *
30770X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
30771 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30772 Type *MemType = AI->getType();
30773 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
30774 // there is no benefit in turning such RMWs into loads, and it is actually
30775 // harmful as it introduces a mfence.
30776 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
30777 return nullptr;
30778
30779 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
30780 // lowering available in lowerAtomicArith.
30781 // TODO: push more cases through this path.
30782 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
30783 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
30784 AI->use_empty())
30785 return nullptr;
30786
30787 IRBuilder<> Builder(AI);
30788 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30789 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
30790 auto SSID = AI->getSyncScopeID();
30791 // We must restrict the ordering to avoid generating loads with Release or
30792 // ReleaseAcquire orderings.
30793 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
30794
30795 // Before the load we need a fence. Here is an example lifted from
30796 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
30797 // is required:
30798 // Thread 0:
30799 // x.store(1, relaxed);
30800 // r1 = y.fetch_add(0, release);
30801 // Thread 1:
30802 // y.fetch_add(42, acquire);
30803 // r2 = x.load(relaxed);
30804 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
30805 // lowered to just a load without a fence. An mfence flushes the store buffer,
30806 // making the optimization clearly correct.
30807 // FIXME: it is required if isReleaseOrStronger(Order), but it is not clear
30808 // otherwise; we might be able to be more aggressive on relaxed idempotent
30809 // rmw. In practice, they do not look useful, so we don't try to be
30810 // especially clever.
30811 if (SSID == SyncScope::SingleThread)
30812 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
30813 // the IR level, so we must wrap it in an intrinsic.
30814 return nullptr;
30815
30816 if (!Subtarget.hasMFence())
30817 // FIXME: it might make sense to use a locked operation here but on a
30818 // different cache-line to prevent cache-line bouncing. In practice it
30819 // is probably a small win, and x86 processors without mfence are rare
30820 // enough that we do not bother.
30821 return nullptr;
30822
30823 Function *MFence =
30824 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
30825 Builder.CreateCall(MFence, {});
30826
30827 // Finally we can emit the atomic load.
30828 LoadInst *Loaded = Builder.CreateAlignedLoad(
30829 AI->getType(), AI->getPointerOperand(), AI->getAlign());
30830 Loaded->setAtomic(Order, SSID);
30831 AI->replaceAllUsesWith(Loaded);
30832 AI->eraseFromParent();
30833 return Loaded;
30834}
30835
30836/// Emit a locked operation on a stack location which does not change any
30837/// memory location, but does involve a lock prefix. Location is chosen to be
30838/// a) very likely accessed only by a single thread to minimize cache traffic,
30839/// and b) definitely dereferenceable. Returns the new Chain result.
30840 static SDValue emitLockedStackOp(SelectionDAG &DAG,
30841 const X86Subtarget &Subtarget, SDValue Chain,
30842 const SDLoc &DL) {
30843 // Implementation notes:
30844 // 1) LOCK prefix creates a full read/write reordering barrier for memory
30845 // operations issued by the current processor. As such, the location
30846 // referenced is not relevant for the ordering properties of the instruction.
30847 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
30848 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
30849 // 2) Using an immediate operand appears to be the best encoding choice
30850 // here since it doesn't require an extra register.
30851 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
30852 // is small enough it might just be measurement noise.)
30853 // 4) When choosing offsets, there are several contributing factors:
30854 // a) If there's no redzone, we default to TOS. (We could allocate a cache
30855 // line aligned stack object to improve this case.)
30856 // b) To minimize our chances of introducing a false dependence, we prefer
30857 // to offset the stack usage from TOS slightly.
30858 // c) To minimize concerns about cross thread stack usage - in particular,
30859 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
30860 // captures state in the TOS frame and accesses it from many threads -
30861 // we want to use an offset such that the offset is in a distinct cache
30862 // line from the TOS frame.
30863 //
30864 // For a general discussion of the tradeoffs and benchmark results, see:
30865 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
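// The emitted node is roughly `lock orl $0, -64(%rsp)` when a red zone is
// present, and `lock orl $0, (%esp)` / `(%rsp)` otherwise (see SPOffset below).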
30866
30867 auto &MF = DAG.getMachineFunction();
30868 auto &TFL = *Subtarget.getFrameLowering();
30869 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
30870
30871 if (Subtarget.is64Bit()) {
30872 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30873 SDValue Ops[] = {
30874 DAG.getRegister(X86::RSP, MVT::i64), // Base
30875 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30876 DAG.getRegister(0, MVT::i64), // Index
30877 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30878 DAG.getRegister(0, MVT::i16), // Segment.
30879 Zero,
30880 Chain};
30881 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30882 MVT::Other, Ops);
30883 return SDValue(Res, 1);
30884 }
30885
30886 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30887 SDValue Ops[] = {
30888 DAG.getRegister(X86::ESP, MVT::i32), // Base
30889 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30890 DAG.getRegister(0, MVT::i32), // Index
30891 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30892 DAG.getRegister(0, MVT::i16), // Segment.
30893 Zero,
30894 Chain
30895 };
30896 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30897 MVT::Other, Ops);
30898 return SDValue(Res, 1);
30899}
30900
30901 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
30902 SelectionDAG &DAG) {
30903 SDLoc dl(Op);
30904 AtomicOrdering FenceOrdering =
30905 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
30906 SyncScope::ID FenceSSID =
30907 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
30908
30909 // The only fence that needs an instruction is a sequentially-consistent
30910 // cross-thread fence.
30911 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
30912 FenceSSID == SyncScope::System) {
30913 if (Subtarget.hasMFence())
30914 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
30915
30916 SDValue Chain = Op.getOperand(0);
30917 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
30918 }
30919
30920 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
30921 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
30922}
30923
30924 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
30925 SelectionDAG &DAG) {
30926 MVT T = Op.getSimpleValueType();
30927 SDLoc DL(Op);
30928 unsigned Reg = 0;
30929 unsigned size = 0;
30930 switch(T.SimpleTy) {
30931 default: llvm_unreachable("Invalid value type!");
30932 case MVT::i8: Reg = X86::AL; size = 1; break;
30933 case MVT::i16: Reg = X86::AX; size = 2; break;
30934 case MVT::i32: Reg = X86::EAX; size = 4; break;
30935 case MVT::i64:
30936 assert(Subtarget.is64Bit() && "Node not type legal!");
30937 Reg = X86::RAX; size = 8;
30938 break;
30939 }
30940 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
30941 Op.getOperand(2), SDValue());
30942 SDValue Ops[] = { cpIn.getValue(0),
30943 Op.getOperand(1),
30944 Op.getOperand(3),
30945 DAG.getTargetConstant(size, DL, MVT::i8),
30946 cpIn.getValue(1) };
30947 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
30948 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
30949 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
30950 Ops, T, MMO);
30951
30952 SDValue cpOut =
30953 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
30954 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
30955 MVT::i32, cpOut.getValue(2));
30956 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
30957
30958 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
30959 cpOut, Success, EFLAGS.getValue(1));
30960}
30961
30962// Create MOVMSKB, taking into account whether we need to split for AVX1.
30963 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
30964 const X86Subtarget &Subtarget) {
30965 MVT InVT = V.getSimpleValueType();
30966
30967 if (InVT == MVT::v64i8) {
30968 SDValue Lo, Hi;
30969 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30970 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
30971 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
30972 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
30973 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
30974 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
30975 DAG.getConstant(32, DL, MVT::i8));
30976 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
30977 }
30978 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
30979 SDValue Lo, Hi;
30980 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30981 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
30982 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
30983 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
30984 DAG.getConstant(16, DL, MVT::i8));
30985 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
30986 }
30987
30988 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
30989}
30990
30991static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
30992 SelectionDAG &DAG) {
30993 SDValue Src = Op.getOperand(0);
30994 MVT SrcVT = Src.getSimpleValueType();
30995 MVT DstVT = Op.getSimpleValueType();
30996
30997 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
30998 // half to v32i1 and concatenating the result.
30999 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
31000 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31001 assert(Subtarget.hasBWI() && "Expected BWI target");
31002 SDLoc dl(Op);
31003 SDValue Lo, Hi;
31004 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
31005 Lo = DAG.getBitcast(MVT::v32i1, Lo);
31006 Hi = DAG.getBitcast(MVT::v32i1, Hi);
31007 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
31008 }
31009
31010 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
31011 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
31012 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
31013 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
31014 SDLoc DL(Op);
31015 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
31016 V = getPMOVMSKB(DL, V, DAG, Subtarget);
31017 return DAG.getZExtOrTrunc(V, DL, DstVT);
31018 }
31019
31020 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
31021 SrcVT == MVT::i64) && "Unexpected VT!");
31022
31023 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31024 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
31025 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
31026 // This conversion needs to be expanded.
31027 return SDValue();
31028
31029 SDLoc dl(Op);
31030 if (SrcVT.isVector()) {
31031 // Widen the input vector in the case of MVT::v2i32.
31032 // Example: from MVT::v2i32 to MVT::v4i32.
31033 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
31034 SrcVT.getVectorNumElements() * 2);
31035 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
31036 DAG.getUNDEF(SrcVT));
31037 } else {
31038 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
31039 "Unexpected source type in LowerBITCAST");
31040 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
31041 }
31042
31043 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
31044 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
31045
31046 if (DstVT == MVT::x86mmx)
31047 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
31048
31049 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
31050 DAG.getIntPtrConstant(0, dl));
31051}
31052
31053/// Compute the horizontal sum of bytes in V for the elements of VT.
31054///
31055/// Requires V to be a byte vector and VT to be an integer vector type with
31056/// wider elements than V's type. The width of the elements of VT determines
31057/// how many bytes of V are summed horizontally to produce each element of the
31058/// result.
31059 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
31060 const X86Subtarget &Subtarget,
31061 SelectionDAG &DAG) {
31062 SDLoc DL(V);
31063 MVT ByteVecVT = V.getSimpleValueType();
31064 MVT EltVT = VT.getVectorElementType();
31065 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
31066 "Expected value to have byte element type.");
31067 assert(EltVT != MVT::i8 &&
31068 "Horizontal byte sum only makes sense for wider elements!");
31069 unsigned VecSize = VT.getSizeInBits();
31070 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
31071
31072 // The PSADBW instruction horizontally adds all bytes and leaves the result in i64
31073 // chunks, thus directly computes the pop count for v2i64 and v4i64.
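// For example, if one 64-bit chunk holds the per-byte pop counts
// {2,0,1,3,0,1,0,1}, PSADBW against zero leaves 8 in that i64 lane, which is
// the pop count of the original 64 bits.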
31074 if (EltVT == MVT::i64) {
31075 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
31076 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
31077 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
31078 return DAG.getBitcast(VT, V);
31079 }
31080
31081 if (EltVT == MVT::i32) {
31082 // We unpack the low half and high half into i32s interleaved with zeros so
31083 // that we can use PSADBW to horizontally sum them. The most useful part of
31084 // this is that it lines up the results of two PSADBW instructions to be
31085 // two v2i64 vectors which concatenated are the 4 population counts. We can
31086 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
31087 SDValue Zeros = DAG.getConstant(0, DL, VT);
31088 SDValue V32 = DAG.getBitcast(VT, V);
31089 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
31090 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
31091
31092 // Do the horizontal sums into two v2i64s.
31093 Zeros = DAG.getConstant(0, DL, ByteVecVT);
31094 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
31095 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
31096 DAG.getBitcast(ByteVecVT, Low), Zeros);
31097 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
31098 DAG.getBitcast(ByteVecVT, High), Zeros);
31099
31100 // Merge them together.
31101 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
31102 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
31103 DAG.getBitcast(ShortVecVT, Low),
31104 DAG.getBitcast(ShortVecVT, High));
31105
31106 return DAG.getBitcast(VT, V);
31107 }
31108
31109 // The only element type left is i16.
31110 assert(EltVT == MVT::i16 && "Unknown how to handle type");
31111
31112 // To obtain pop count for each i16 element starting from the pop count for
31113 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
31114 // right by 8. It is important to shift as i16s as i8 vector shift isn't
31115 // directly supported.
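// For example, an i16 element whose bytes hold the counts hi = 2 and lo = 3
// (0x0203): the i16 shl by 8 gives 0x0300, the byte-wise add gives 0x0503, and
// the i16 srl by 8 leaves 0x0005 == 2 + 3.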
31116 SDValue ShifterV = DAG.getConstant(8, DL, VT);
31117 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
31118 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
31119 DAG.getBitcast(ByteVecVT, V));
31120 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
31121}
31122
31123 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
31124 const X86Subtarget &Subtarget,
31125 SelectionDAG &DAG) {
31126 MVT VT = Op.getSimpleValueType();
31127 MVT EltVT = VT.getVectorElementType();
31128 int NumElts = VT.getVectorNumElements();
31129 (void)EltVT;
31130 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
31131
31132 // Implement a lookup table in register by using an algorithm based on:
31133 // http://wm.ite.pl/articles/sse-popcount.html
31134 //
31135 // The general idea is that each nibble of every input byte is an
31136 // index into an in-register pre-computed pop count table. We then split up the
31137 // input vector in two new ones: (1) a vector with only the shifted-right
31138 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
31139 // masked out higher ones) for each byte. PSHUFB is used separately with both
31140 // to index the in-register table. Next, both are added and the result is an
31141 // i8 vector where each element contains the pop count for its input byte.
31142 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
31143 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
31144 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
31145 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
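// For example, input byte 0xB7: the high nibble 0xB indexes LUT[11] = 3 and
// the low nibble 0x7 indexes LUT[7] = 3, so the final add produces 6, the pop
// count of 0xB7.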
31146
31147 SmallVector<SDValue, 64> LUTVec;
31148 for (int i = 0; i < NumElts; ++i)
31149 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
31150 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
31151 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
31152
31153 // High nibbles
31154 SDValue FourV = DAG.getConstant(4, DL, VT);
31155 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
31156
31157 // Low nibbles
31158 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
31159
31160 // The input vector is used as the shuffle mask that index elements into the
31161 // LUT. After counting low and high nibbles, add the vector to obtain the
31162 // final pop count per i8 element.
31163 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
31164 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
31165 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
31166}
31167
31168// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
31169// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
31170 static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
31171 const X86Subtarget &Subtarget,
31172 SelectionDAG &DAG) {
31173 MVT VT = Op.getSimpleValueType();
31174 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
31175 "Unknown CTPOP type to handle");
31176 SDValue Op0 = Op.getOperand(0);
31177
31178 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
31179 if (Subtarget.hasVPOPCNTDQ()) {
31180 unsigned NumElems = VT.getVectorNumElements();
31181 assert((VT.getVectorElementType() == MVT::i8 ||
31182 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
31183 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
31184 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
31185 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
31186 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
31187 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
31188 }
31189 }
31190
31191 // Decompose 256-bit ops into smaller 128-bit ops.
31192 if (VT.is256BitVector() && !Subtarget.hasInt256())
31193 return splitVectorIntUnary(Op, DAG, DL);
31194
31195 // Decompose 512-bit ops into smaller 256-bit ops.
31196 if (VT.is512BitVector() && !Subtarget.hasBWI())
31197 return splitVectorIntUnary(Op, DAG, DL);
31198
31199 // For element types greater than i8, do vXi8 pop counts and a bytesum.
31200 if (VT.getScalarType() != MVT::i8) {
31201 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
31202 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
31203 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
31204 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
31205 }
31206
31207 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
31208 if (!Subtarget.hasSSSE3())
31209 return SDValue();
31210
31211 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
31212}
31213
31214static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
31215 SelectionDAG &DAG) {
31216 MVT VT = N.getSimpleValueType();
31217 SDValue Op = N.getOperand(0);
31218 SDLoc DL(N);
31219
31220 if (VT.isScalarInteger()) {
31221 // Compute the lower/upper bounds of the active bits of the value,
31222 // allowing us to shift the active bits down if necessary to fit into the
31223 // special cases below.
31224 KnownBits Known = DAG.computeKnownBits(Op);
31225 unsigned LZ = Known.countMinLeadingZeros();
31226 unsigned TZ = Known.countMinTrailingZeros();
31227 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
31228 unsigned ActiveBits = Known.getBitWidth() - LZ;
31229 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
31230
31231 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
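// For example, ctpop(0b11) = 3 - 1 = 2 and ctpop(0b10) = 2 - 1 = 1.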
31232 if (ShiftedActiveBits <= 2) {
31233 if (ActiveBits > 2)
31234 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31235 DAG.getShiftAmountConstant(TZ, VT, DL));
31236 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31237 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
31238 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31239 DAG.getShiftAmountConstant(1, VT, DL)));
31240 return DAG.getZExtOrTrunc(Op, DL, VT);
31241 }
31242
31243 // i3 CTPOP - perform LUT into i32 integer.
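// The constant 0b1110100110010100 packs ctpop(v) into bits [2v+1:2v] for
// v = 0..7; e.g. for v = 6 the shift extracts 0b10 = 2.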
31244 if (ShiftedActiveBits <= 3) {
31245 if (ActiveBits > 3)
31246 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31247 DAG.getShiftAmountConstant(TZ, VT, DL));
31248 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31249 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
31250 DAG.getShiftAmountConstant(1, VT, DL));
31251 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
31252 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
31253 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
31254 DAG.getConstant(0x3, DL, MVT::i32));
31255 return DAG.getZExtOrTrunc(Op, DL, VT);
31256 }
31257
31258 // i4 CTPOP - perform LUT into i64 integer.
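// The constant 0x4332322132212110 stores ctpop(v) in nibble v for v = 0..15;
// e.g. for v = 0xD the shift by 4*13 extracts 3.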
31259 if (ShiftedActiveBits <= 4 &&
31260 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
31261 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
31262 if (ActiveBits > 4)
31263 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31264 DAG.getShiftAmountConstant(TZ, VT, DL));
31265 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31266 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
31267 DAG.getConstant(4, DL, MVT::i32));
31268 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
31269 DAG.getShiftAmountOperand(MVT::i64, Op));
31270 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
31271 DAG.getConstant(0x7, DL, MVT::i64));
31272 return DAG.getZExtOrTrunc(Op, DL, VT);
31273 }
31274
31275 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
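// The multiply by 0x08040201 places copies of the byte at bit offsets 0, 9,
// 18 and 27; after the >> 3 and the mask each nibble holds one distinct bit of
// the input, and the second multiply by 0x11111111 accumulates all eight
// nibbles into the top nibble (at most 8), which the final >> 28 extracts.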
31276 if (ShiftedActiveBits <= 8) {
31277 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
31278 if (ActiveBits > 8)
31279 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31280 DAG.getShiftAmountConstant(TZ, VT, DL));
31281 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31282 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
31283 DAG.getConstant(0x08040201U, DL, MVT::i32));
31284 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31285 DAG.getShiftAmountConstant(3, MVT::i32, DL));
31286 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
31287 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
31288 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31289 DAG.getShiftAmountConstant(28, MVT::i32, DL));
31290 return DAG.getZExtOrTrunc(Op, DL, VT);
31291 }
31292
31293 return SDValue(); // fallback to generic expansion.
31294 }
31295
31296 assert(VT.isVector() &&
31297 "We only do custom lowering for vector population count.");
31298 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
31299}
31300
31301 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
31302 MVT VT = Op.getSimpleValueType();
31303 SDValue In = Op.getOperand(0);
31304 SDLoc DL(Op);
31305
31306 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
31307 // perform the BITREVERSE.
31308 if (!VT.isVector()) {
31309 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
31310 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
31311 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
31312 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
31313 DAG.getIntPtrConstant(0, DL));
31314 }
31315
31316 int NumElts = VT.getVectorNumElements();
31317 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
31318
31319 // Decompose 256-bit ops into smaller 128-bit ops.
31320 if (VT.is256BitVector())
31321 return splitVectorIntUnary(Op, DAG, DL);
31322
31323 assert(VT.is128BitVector() &&
31324 "Only 128-bit vector bitreverse lowering supported.");
31325
31326 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
31327 // perform the BSWAP in the shuffle.
31328 // It's best to shuffle using the second operand as this will implicitly allow
31329 // memory folding for multiple vectors.
31330 SmallVector<SDValue, 16> MaskElts;
31331 for (int i = 0; i != NumElts; ++i) {
31332 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
31333 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
31334 int PermuteByte = SourceByte | (2 << 5);
31335 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
31336 }
31337 }
31338
31339 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
31340 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
31341 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
31342 Res, Mask);
31343 return DAG.getBitcast(VT, Res);
31344}
31345
31346 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
31347 SelectionDAG &DAG) {
31348 MVT VT = Op.getSimpleValueType();
31349
31350 if (Subtarget.hasXOP() && !VT.is512BitVector())
31351 return LowerBITREVERSE_XOP(Op, DAG);
31352
31353 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
31354
31355 SDValue In = Op.getOperand(0);
31356 SDLoc DL(Op);
31357
31358 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
31359 if (VT.is512BitVector() && !Subtarget.hasBWI())
31360 return splitVectorIntUnary(Op, DAG, DL);
31361
31362 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
31363 if (VT.is256BitVector() && !Subtarget.hasInt256())
31364 return splitVectorIntUnary(Op, DAG, DL);
31365
31366 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
31367 if (!VT.isVector()) {
31368 assert(
31369 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
31370 "Only tested for i8/i16/i32/i64");
31371 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
31372 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
31373 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
31374 DAG.getBitcast(MVT::v16i8, Res));
31375 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
31376 DAG.getBitcast(VecVT, Res), DAG.getIntPtrConstant(0, DL));
31377 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
31378 }
31379
31380 assert(VT.isVector() && VT.getSizeInBits() >= 128);
31381
31382 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
31383 if (VT.getScalarType() != MVT::i8) {
31384 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
31385 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
31386 Res = DAG.getBitcast(ByteVT, Res);
31387 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
31388 return DAG.getBitcast(VT, Res);
31389 }
31390 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
31391 "Only byte vector BITREVERSE supported");
31392
31393 unsigned NumElts = VT.getVectorNumElements();
31394
31395 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
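// The constant 0x8040201008040201 is the 8x8 bit matrix with ones on the
// anti-diagonal, so the affine transform with a zero immediate maps bit i of
// each byte to bit 7-i.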
31396 if (Subtarget.hasGFNI()) {
31397 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
31398 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
31399 Matrix = DAG.getBitcast(VT, Matrix);
31400 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
31401 DAG.getTargetConstant(0, DL, MVT::i8));
31402 }
31403
31404 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
31405 // two nibbles and a PSHUFB lookup to find the bitreverse of each
31406 // 0-15 value (moved to the other nibble).
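// For example, input byte 0x1E: LoLUT[0xE] = 0x70 and HiLUT[0x1] = 0x08, so
// OR'ing the two lookups gives 0x78, the bit-reverse of 0x1E.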
31407 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
31408 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
31409 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
31410
31411 const int LoLUT[16] = {
31412 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
31413 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
31414 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
31415 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
31416 const int HiLUT[16] = {
31417 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
31418 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
31419 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
31420 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
31421
31422 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
31423 for (unsigned i = 0; i < NumElts; ++i) {
31424 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
31425 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
31426 }
31427
31428 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
31429 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
31430 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
31431 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
31432 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31433}
31434
31435static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
31436 SelectionDAG &DAG) {
31437 SDLoc DL(Op);
31438 SDValue X = Op.getOperand(0);
31439 MVT VT = Op.getSimpleValueType();
31440
31441 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
31442 if (VT == MVT::i8 ||
31443 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
31444 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31445 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
31446 DAG.getConstant(0, DL, MVT::i8));
31447 // Copy the inverse of the parity flag into a register with setcc.
31448 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31449 // Extend to the original type.
31450 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31451 }
31452
31453 // If we have POPCNT, use the default expansion.
31454 if (Subtarget.hasPOPCNT())
31455 return SDValue();
31456
31457 if (VT == MVT::i64) {
31458 // Xor the high and low 32-bit halves together using a 32-bit operation.
31459 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
31460 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
31461 DAG.getConstant(32, DL, MVT::i8)));
31462 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
31463 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
31464 }
31465
31466 if (VT != MVT::i16) {
31467 // Xor the high and low 16-bits together using a 32-bit operation.
31468 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
31469 DAG.getConstant(16, DL, MVT::i8));
31470 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
31471 } else {
31472 // If the input is 16-bits, we need to extend to use an i32 shift below.
31473 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
31474 }
31475
31476 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
31477 // This should allow an h-reg to be used to save a shift.
31478 SDValue Hi = DAG.getNode(
31479 ISD::TRUNCATE, DL, MVT::i8,
31480 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
31481 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31482 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
31483 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
31484
31485 // Copy the inverse of the parity flag into a register with setcc.
31486 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31487 // Extend to the original type.
31488 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31489}
31490
31491 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
31492 const X86Subtarget &Subtarget) {
31493 unsigned NewOpc = 0;
31494 switch (N->getOpcode()) {
31495 case ISD::ATOMIC_LOAD_ADD:
31496 NewOpc = X86ISD::LADD;
31497 break;
31498 case ISD::ATOMIC_LOAD_SUB:
31499 NewOpc = X86ISD::LSUB;
31500 break;
31501 case ISD::ATOMIC_LOAD_OR:
31502 NewOpc = X86ISD::LOR;
31503 break;
31504 case ISD::ATOMIC_LOAD_XOR:
31505 NewOpc = X86ISD::LXOR;
31506 break;
31507 case ISD::ATOMIC_LOAD_AND:
31508 NewOpc = X86ISD::LAND;
31509 break;
31510 default:
31511 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
31512 }
31513
31514 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
31515
31516 return DAG.getMemIntrinsicNode(
31517 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
31518 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
31519 /*MemVT=*/N->getSimpleValueType(0), MMO);
31520}
31521
31522/// Lower atomic_load_ops into LOCK-prefixed operations.
31523 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
31524 const X86Subtarget &Subtarget) {
31525 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
31526 SDValue Chain = N->getOperand(0);
31527 SDValue LHS = N->getOperand(1);
31528 SDValue RHS = N->getOperand(2);
31529 unsigned Opc = N->getOpcode();
31530 MVT VT = N->getSimpleValueType(0);
31531 SDLoc DL(N);
31532
31533 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
31534 // can only be lowered when the result is unused. They should have already
31535 // been transformed into a cmpxchg loop in AtomicExpand.
31536 if (N->hasAnyUseOfValue(0)) {
31537 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
31538 // select LXADD if LOCK_SUB can't be selected.
31539 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
31540 // can use LXADD as opposed to cmpxchg.
31541 if (Opc == ISD::ATOMIC_LOAD_SUB ||
31542 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS))) {
31543 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
31544 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS,
31545 AN->getMemOperand());
31546 }
31547 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
31548 "Used AtomicRMW ops other than Add should have been expanded!");
31549 return N;
31550 }
31551
31552 // Specialized lowering for the canonical form of an idempotent atomicrmw.
31553 // The core idea here is that since the memory location isn't actually
31554 // changing, all we need is a lowering for the *ordering* impacts of the
31555 // atomicrmw. As such, we can choose a different operation and memory
31556 // location to minimize impact on other code.
31557 // The above holds unless the node is marked volatile in which
31558 // case it needs to be preserved according to the langref.
31559 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
31560 // On X86, the only ordering which actually requires an instruction is
31561 // seq_cst that isn't SingleThread; everything else just needs to be preserved
31562 // during codegen and then dropped. Note that we expect (but don't assume)
31563 // that orderings other than seq_cst and acq_rel have been canonicalized to
31564 // a store or load.
31565 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
31566 AN->getSyncScopeID() == SyncScope::System) {
31567 // Prefer a locked operation against a stack location to minimize cache
31568 // traffic. This assumes that stack locations are very likely to be
31569 // accessed only by the owning thread.
31570 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
31571 assert(!N->hasAnyUseOfValue(0));
31572 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31573 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31574 DAG.getUNDEF(VT), NewChain);
31575 }
31576 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31577 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
31578 assert(!N->hasAnyUseOfValue(0));
31579 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31580 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31581 DAG.getUNDEF(VT), NewChain);
31582 }
31583
31584 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
31585 // RAUW the chain, but don't worry about the result, as it's unused.
31586 assert(!N->hasAnyUseOfValue(0));
31587 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31588 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31589 DAG.getUNDEF(VT), LockOp.getValue(1));
31590}
31591
31592 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
31593 const X86Subtarget &Subtarget) {
31594 auto *Node = cast<AtomicSDNode>(Op.getNode());
31595 SDLoc dl(Node);
31596 EVT VT = Node->getMemoryVT();
31597
31598 bool IsSeqCst =
31599 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
31600 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
31601
31602 // If this store is not sequentially consistent and the type is legal
31603 // we can just keep it.
31604 if (!IsSeqCst && IsTypeLegal)
31605 return Op;
31606
31607 if (VT == MVT::i64 && !IsTypeLegal) {
31608 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
31609 // is enabled.
31610 bool NoImplicitFloatOps =
31611 DAG.getMachineFunction().getFunction().hasFnAttribute(
31612 Attribute::NoImplicitFloat);
31613 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31614 SDValue Chain;
31615 if (Subtarget.hasSSE1()) {
31616 SDValue SclToVec =
31617 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
31618 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31619 SclToVec = DAG.getBitcast(StVT, SclToVec);
31620 SDVTList Tys = DAG.getVTList(MVT::Other);
31621 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
31622 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
31623 MVT::i64, Node->getMemOperand());
31624 } else if (Subtarget.hasX87()) {
31625 // First load this into an 80-bit X87 register using a stack temporary.
31626 // This will put the whole integer into the significand.
31627 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31628 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31629 MachinePointerInfo MPI =
31630 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31631 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
31632 MPI, MaybeAlign(), MachineMemOperand::MOStore);
31633 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31634 SDValue LdOps[] = {Chain, StackPtr};
31635 SDValue Value = DAG.getMemIntrinsicNode(
31636 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
31637 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
31638 Chain = Value.getValue(1);
31639
31640 // Now use an FIST to do the atomic store.
31641 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
31642 Chain =
31643 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
31644 StoreOps, MVT::i64, Node->getMemOperand());
31645 }
31646
31647 if (Chain) {
31648 // If this is a sequentially consistent store, also emit an appropriate
31649 // barrier.
31650 if (IsSeqCst)
31651 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
31652
31653 return Chain;
31654 }
31655 }
31656 }
31657
31658 // Convert seq_cst store -> xchg
31659 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
31660 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
31661 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
31662 Node->getOperand(0), Node->getOperand(2),
31663 Node->getOperand(1), Node->getMemOperand());
31664 return Swap.getValue(1);
31665}
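// Rough examples of the paths above (exact instructions depend on subtarget):
//   * legal, non-seq_cst store          -> plain mov (kept as-is)
//   * seq_cst i32 store                 -> lowered to ATOMIC_SWAP, i.e. xchg
//   * i64 store on a 32-bit SSE target  -> movq through an XMM register
//                                          (X86ISD::VEXTRACT_STORE)
//   * i64 store on a 32-bit x87 target  -> fild/fistp pair through an f80 temp
// with a locked stack op appended when the store is sequentially consistent.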
31666
31667 static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
31668 SDNode *N = Op.getNode();
31669 MVT VT = N->getSimpleValueType(0);
31670 unsigned Opc = Op.getOpcode();
31671
31672 // Let legalize expand this if it isn't a legal type yet.
31673 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31674 return SDValue();
31675
31676 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
31677 SDLoc DL(N);
31678
31679 // Set the carry flag.
31680 SDValue Carry = Op.getOperand(2);
31681 EVT CarryVT = Carry.getValueType();
31682 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
31683 Carry, DAG.getAllOnesConstant(DL, CarryVT));
31684
31685 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
31686 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
31687 Op.getOperand(0), Op.getOperand(1),
31688 Carry.getValue(1));
31689
31690 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
31691 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
31692 Sum.getValue(1), DL, DAG);
31693 if (N->getValueType(1) == MVT::i1)
31694 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
31695
31696 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
31697}
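// Sketch of the lowering above for (uaddo_carry a, b, cin) on i32, assuming a
// boolean-like carry input:
//   X86ISD::ADD  cin, -1       ; adding all-ones sets CF iff cin != 0
//   X86ISD::ADC  a, b, EFLAGS  ; consumes CF, produces the sum and new EFLAGS
//   setb / seto                ; COND_B for unsigned carry, COND_O for signed
//                              ; overflow, truncated to i1 when required.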
31698
31699static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
31700 SelectionDAG &DAG) {
31701 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
31702
31703 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
31704 // which returns the values as { float, float } (returned in XMM0) or
31705 // { double, double } (returned in XMM0, XMM1).
31706 SDLoc dl(Op);
31707 SDValue Arg = Op.getOperand(0);
31708 EVT ArgVT = Arg.getValueType();
31709 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
31710
31711 TargetLowering::ArgListTy Args;
31712 TargetLowering::ArgListEntry Entry;
31713
31714 Entry.Node = Arg;
31715 Entry.Ty = ArgTy;
31716 Entry.IsSExt = false;
31717 Entry.IsZExt = false;
31718 Args.push_back(Entry);
31719
31720 bool isF64 = ArgVT == MVT::f64;
31721 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
31722 // the small struct {f32, f32} is returned in (eax, edx). For f64,
31723 // the results are returned via SRet in memory.
31724 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31725 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
31726 const char *LibcallName = TLI.getLibcallName(LC);
31727 SDValue Callee =
31728 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
31729
31730 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
31731 : (Type *)FixedVectorType::get(ArgTy, 4);
31732
31733 TargetLowering::CallLoweringInfo CLI(DAG);
31734 CLI.setDebugLoc(dl)
31735 .setChain(DAG.getEntryNode())
31736 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
31737
31738 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
31739
31740 if (isF64)
31741 // Returned in xmm0 and xmm1.
31742 return CallResult.first;
31743
31744 // Returned in bits 0:31 and 32:63 of xmm0.
31745 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31746 CallResult.first, DAG.getIntPtrConstant(0, dl));
31747 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31748 CallResult.first, DAG.getIntPtrConstant(1, dl));
31749 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
31750 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
31751}
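// Example of the resulting call (hypothetical value names):
//   f32:  %sc = call <4 x float> @__sincos_stret(float %x)
//         sin is lane 0 of xmm0, cos is lane 1; both are extracted and merged.
//   f64:  the { double, double } aggregate comes back in xmm0/xmm1 and the
//         call result is returned directly.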
31752
31753/// Widen a vector input to a vector of NVT. The
31754/// input vector must have the same element type as NVT.
31755 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
31756 bool FillWithZeroes = false) {
31757 // Check if InOp already has the right width.
31758 MVT InVT = InOp.getSimpleValueType();
31759 if (InVT == NVT)
31760 return InOp;
31761
31762 if (InOp.isUndef())
31763 return DAG.getUNDEF(NVT);
31764
31766 "input and widen element type must match");
31767
31768 unsigned InNumElts = InVT.getVectorNumElements();
31769 unsigned WidenNumElts = NVT.getVectorNumElements();
31770 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
31771 "Unexpected request for vector widening");
31772
31773 SDLoc dl(InOp);
31774 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
31775 InOp.getNumOperands() == 2) {
31776 SDValue N1 = InOp.getOperand(1);
31777 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
31778 N1.isUndef()) {
31779 InOp = InOp.getOperand(0);
31780 InVT = InOp.getSimpleValueType();
31781 InNumElts = InVT.getVectorNumElements();
31782 }
31783 }
31784 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
31785 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
31786 SmallVector<SDValue, 16> Ops;
31787 for (unsigned i = 0; i < InNumElts; ++i)
31788 Ops.push_back(InOp.getOperand(i));
31789
31790 EVT EltVT = InOp.getOperand(0).getValueType();
31791
31792 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
31793 DAG.getUNDEF(EltVT);
31794 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
31795 Ops.push_back(FillVal);
31796 return DAG.getBuildVector(NVT, dl, Ops);
31797 }
31798 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
31799 DAG.getUNDEF(NVT);
31800 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
31801 InOp, DAG.getIntPtrConstant(0, dl));
31802}
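// Example of the widening helper above: extending a v2i32 value to v8i32 with
// FillWithZeroes=true yields roughly
//   (insert_subvector (build_vector 0,0,0,0,0,0,0,0), %src, 0)
// i.e. the original lanes in elements 0-1 and zeros above, while
// FillWithZeroes=false leaves the upper lanes undef (zeroing matters for
// masks, where the extra lanes must be provably inactive).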
31803
31804 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
31805 SelectionDAG &DAG) {
31806 assert(Subtarget.hasAVX512() &&
31807 "MGATHER/MSCATTER are supported on AVX-512 arch only");
31808
31809 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
31810 SDValue Src = N->getValue();
31811 MVT VT = Src.getSimpleValueType();
31812 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
31813 SDLoc dl(Op);
31814
31815 SDValue Scale = N->getScale();
31816 SDValue Index = N->getIndex();
31817 SDValue Mask = N->getMask();
31818 SDValue Chain = N->getChain();
31819 SDValue BasePtr = N->getBasePtr();
31820
31821 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
31822 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31823 // If the index is v2i64 and we have VLX we can use xmm for data and index.
31824 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
31825 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31826 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
31827 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
31828 SDVTList VTs = DAG.getVTList(MVT::Other);
31829 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31830 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31831 N->getMemoryVT(), N->getMemOperand());
31832 }
31833 return SDValue();
31834 }
31835
31836 MVT IndexVT = Index.getSimpleValueType();
31837
31838 // If the index is v2i32, we're being called by type legalization and we
31839 // should just let the default handling take care of it.
31840 if (IndexVT == MVT::v2i32)
31841 return SDValue();
31842
31843 // If we don't have VLX and neither the data nor the index is 512 bits, we
31844 // need to widen until one is.
31845 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
31846 !Index.getSimpleValueType().is512BitVector()) {
31847 // Determine how much we need to widen by to get a 512-bit type.
31848 unsigned Factor = std::min(512/VT.getSizeInBits(),
31849 512/IndexVT.getSizeInBits());
31850 unsigned NumElts = VT.getVectorNumElements() * Factor;
31851
31852 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31853 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31854 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31855
31856 Src = ExtendToType(Src, VT, DAG);
31857 Index = ExtendToType(Index, IndexVT, DAG);
31858 Mask = ExtendToType(Mask, MaskVT, DAG, true);
31859 }
31860
31861 SDVTList VTs = DAG.getVTList(MVT::Other);
31862 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31863 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31864 N->getMemoryVT(), N->getMemOperand());
31865}
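// Example of the non-VLX widening path above (assumed types): a scatter of
// v4f32 data with a v4i32 index is widened by Factor = min(512/128, 512/128)
// = 4 to v16f32 data, a v16i32 index and a v16i1 mask whose upper lanes are
// zeroed, so the widened X86ISD::MSCATTER performs no extra stores.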
31866
31867static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
31868 SelectionDAG &DAG) {
31869
31870 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
31871 MVT VT = Op.getSimpleValueType();
31872 MVT ScalarVT = VT.getScalarType();
31873 SDValue Mask = N->getMask();
31874 MVT MaskVT = Mask.getSimpleValueType();
31875 SDValue PassThru = N->getPassThru();
31876 SDLoc dl(Op);
31877
31878 // Handle AVX masked loads which don't support passthru other than 0.
31879 if (MaskVT.getVectorElementType() != MVT::i1) {
31880 // We also allow undef in the isel pattern.
31881 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
31882 return Op;
31883
31884 SDValue NewLoad = DAG.getMaskedLoad(
31885 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31886 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
31887 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
31888 N->isExpandingLoad());
31889 // Emit a blend.
31890 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
31891 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
31892 }
31893
31894 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
31895 "Expanding masked load is supported on AVX-512 target only!");
31896
31897 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
31898 "Expanding masked load is supported for 32 and 64-bit types only!");
31899
31900 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31901 "Cannot lower masked load op.");
31902
31903 assert((ScalarVT.getSizeInBits() >= 32 ||
31904 (Subtarget.hasBWI() &&
31905 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31906 "Unsupported masked load op.");
31907
31908 // This operation is legal for targets with VLX, but without
31909 // VLX the vector should be widened to 512 bits.
31910 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
31911 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31912 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
31913
31914 // Mask element has to be i1.
31915 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31916 "Unexpected mask type");
31917
31918 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31919
31920 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31921 SDValue NewLoad = DAG.getMaskedLoad(
31922 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31923 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
31924 N->getExtensionType(), N->isExpandingLoad());
31925
31926 SDValue Extract =
31927 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
31928 DAG.getIntPtrConstant(0, dl));
31929 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
31930 return DAG.getMergeValues(RetOps, dl);
31931}
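// Example for the AVX (non-i1 mask) path above, with a non-zero passthru:
//   x = masked.load(p, mask, passthru)
// is rewritten as
//   tmp = masked.load(p, mask, zeroinitializer)  ; what vmaskmov provides
//   x   = select mask, tmp, passthru             ; blend the passthru back in
// since the AVX vmaskmov forms always zero the masked-off lanes.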
31932
31933static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
31934 SelectionDAG &DAG) {
31935 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
31936 SDValue DataToStore = N->getValue();
31937 MVT VT = DataToStore.getSimpleValueType();
31938 MVT ScalarVT = VT.getScalarType();
31939 SDValue Mask = N->getMask();
31940 SDLoc dl(Op);
31941
31942 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
31943 "Expanding masked load is supported on AVX-512 target only!");
31944
31945 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
31946 "Expanding masked load is supported for 32 and 64-bit types only!");
31947
31948 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31949 "Cannot lower masked store op.");
31950
31951 assert((ScalarVT.getSizeInBits() >= 32 ||
31952 (Subtarget.hasBWI() &&
31953 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31954 "Unsupported masked store op.");
31955
31956 // This operation is legal for targets with VLX, but without
31957 // VLX the vector should be widened to 512 bits.
31958 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
31959 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31960
31961 // Mask element has to be i1.
31962 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31963 "Unexpected mask type");
31964
31965 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31966
31967 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
31968 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31969 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
31970 N->getOffset(), Mask, N->getMemoryVT(),
31971 N->getMemOperand(), N->getAddressingMode(),
31972 N->isTruncatingStore(), N->isCompressingStore());
31973}
31974
31975static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
31976 SelectionDAG &DAG) {
31977 assert(Subtarget.hasAVX2() &&
31978 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
31979
31980 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
31981 SDLoc dl(Op);
31982 MVT VT = Op.getSimpleValueType();
31983 SDValue Index = N->getIndex();
31984 SDValue Mask = N->getMask();
31985 SDValue PassThru = N->getPassThru();
31986 MVT IndexVT = Index.getSimpleValueType();
31987
31988 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
31989
31990 // If the index is v2i32, we're being called by type legalization.
31991 if (IndexVT == MVT::v2i32)
31992 return SDValue();
31993
31994 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
31995 // need to widen until one is.
31996 MVT OrigVT = VT;
31997 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31998 !IndexVT.is512BitVector()) {
31999 // Determine how much we need to widen by to get a 512-bit type.
32000 unsigned Factor = std::min(512/VT.getSizeInBits(),
32001 512/IndexVT.getSizeInBits());
32002
32003 unsigned NumElts = VT.getVectorNumElements() * Factor;
32004
32005 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
32006 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
32007 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
32008
32009 PassThru = ExtendToType(PassThru, VT, DAG);
32010 Index = ExtendToType(Index, IndexVT, DAG);
32011 Mask = ExtendToType(Mask, MaskVT, DAG, true);
32012 }
32013
32014 // Break dependency on the data register.
32015 if (PassThru.isUndef())
32016 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
32017
32018 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
32019 N->getScale() };
32020 SDValue NewGather = DAG.getMemIntrinsicNode(
32021 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
32022 N->getMemOperand());
32023 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
32024 NewGather, DAG.getIntPtrConstant(0, dl));
32025 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
32026}
32027
32028 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
32029 SDLoc dl(Op);
32030 SDValue Src = Op.getOperand(0);
32031 MVT DstVT = Op.getSimpleValueType();
32032
32033 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
32034 unsigned SrcAS = N->getSrcAddressSpace();
32035
32036 assert(SrcAS != N->getDestAddressSpace() &&
32037 "addrspacecast must be between different address spaces");
32038
32039 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
32040 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
32041 } else if (DstVT == MVT::i64) {
32042 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
32043 } else if (DstVT == MVT::i32) {
32044 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
32045 } else {
32046 report_fatal_error("Bad address space in addrspacecast");
32047 }
32048 return Op;
32049}
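// Example of the mixed-pointer casts handled above (address-space numbers per
// X86AS, stated here as an assumption): casting a 32-bit unsigned pointer
// (ptr addrspace(271), PTR32_UPTR) to a 64-bit pointer zero-extends, a 32-bit
// signed pointer (addrspace(270)) sign-extends, and a 64-bit to 32-bit cast
// simply truncates.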
32050
32051SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
32052 SelectionDAG &DAG) const {
32053 // TODO: Eventually, the lowering of these nodes should be informed by or
32054 // deferred to the GC strategy for the function in which they appear. For
32055 // now, however, they must be lowered to something. Since they are logically
32056 // no-ops in the case of a null GC strategy (or a GC strategy which does not
32057 // require special handling for these nodes), lower them as literal NOOPs for
32058 // the time being.
32059 SmallVector<SDValue, 2> Ops;
32060 Ops.push_back(Op.getOperand(0));
32061 if (Op->getGluedNode())
32062 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
32063
32064 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
32065 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
32066}
32067
32068// Custom split CVTPS2PH with wide types.
32069 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
32070 SDLoc dl(Op);
32071 EVT VT = Op.getValueType();
32072 SDValue Lo, Hi;
32073 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
32074 EVT LoVT, HiVT;
32075 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32076 SDValue RC = Op.getOperand(1);
32077 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
32078 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
32079 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32080}
32081
32082 static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
32083 SelectionDAG &DAG) {
32084 unsigned IsData = Op.getConstantOperandVal(4);
32085
32086 // We don't support non-data prefetch without PREFETCHI.
32087 // Just preserve the chain.
32088 if (!IsData && !Subtarget.hasPREFETCHI())
32089 return Op.getOperand(0);
32090
32091 return Op;
32092}
32093
32094 static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
32095 unsigned OpNo) {
32096 const APInt Operand(32, OpNo);
32097 std::string OpNoStr = llvm::toString(Operand, 10, false);
32098 std::string Str(" $");
32099
32100 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
32101 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
32102
32103 auto I = StringRef::npos;
32104 for (auto &AsmStr : AsmStrs) {
32105 // Match the OpNo string exactly; a partial match is not enough, since
32106 // e.g. "$12" contains "$1" as a sub-string.
32107 if (AsmStr.ends_with(OpNoStr1))
32108 I = AsmStr.size() - OpNoStr1.size();
32109
32110 // Get the index of operand in AsmStr.
32111 if (I == StringRef::npos)
32112 I = AsmStr.find(OpNoStr1 + ",");
32113 if (I == StringRef::npos)
32114 I = AsmStr.find(OpNoStr2);
32115
32116 if (I == StringRef::npos)
32117 continue;
32118
32119 assert(I > 0 && "Unexpected inline asm string!");
32120 // Remove the operand string and label (if it exists).
32121 // For example:
32122 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
32123 // ==>
32124 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
32125 // ==>
32126 // "call dword ptr "
32127 auto TmpStr = AsmStr.substr(0, I);
32128 I = TmpStr.rfind(':');
32129 if (I != StringRef::npos)
32130 TmpStr = TmpStr.substr(I + 1);
32131 return TmpStr.take_while(llvm::isAlpha);
32132 }
32133
32134 return StringRef();
32135}
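// Example walk-through (hypothetical inline-asm string): with OpNo == 0 and
//   ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
// the " ${0:" pattern is located, the text before it is kept, everything up to
// the last ':' label separator is stripped, and the leading alphabetic run
// "call" is returned as the mnemonic.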
32136
32137 bool X86TargetLowering::isInlineAsmTargetBranch(
32138 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
32139 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
32140 // changed from indirect TargetLowering::C_Memory to direct
32141 // TargetLowering::C_Address.
32142 // We don't need to special case LOOP* and Jcc, which cannot target a memory
32143 // location.
32144 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
32145 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
32146}
32147
32148/// Provide custom lowering hooks for some operations.
32149 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
32150 switch (Op.getOpcode()) {
32151 // clang-format off
32152 default: llvm_unreachable("Should not custom lower this!");
32153 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
32154 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
32155 return LowerCMP_SWAP(Op, Subtarget, DAG);
32156 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
32157 case ISD::ATOMIC_LOAD_ADD:
32158 case ISD::ATOMIC_LOAD_SUB:
32159 case ISD::ATOMIC_LOAD_OR:
32160 case ISD::ATOMIC_LOAD_XOR:
32161 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
32162 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
32163 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
32164 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
32165 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
32166 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
32167 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
32168 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
32169 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
32170 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
32171 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
32172 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
32173 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
32174 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
32175 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
32176 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
32177 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
32178 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
32179 case ISD::SHL_PARTS:
32180 case ISD::SRA_PARTS:
32181 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
32182 case ISD::FSHL:
32183 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
32184 case ISD::STRICT_SINT_TO_FP:
32185 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
32186 case ISD::STRICT_UINT_TO_FP:
32187 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
32188 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
32189 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
32190 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
32191 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
32194 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
32195 case ISD::FP_TO_SINT:
32196 case ISD::STRICT_FP_TO_SINT:
32197 case ISD::FP_TO_UINT:
32198 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
32199 case ISD::FP_TO_SINT_SAT:
32200 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
32201 case ISD::FP_EXTEND:
32202 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
32203 case ISD::FP_ROUND:
32204 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
32205 case ISD::FP16_TO_FP:
32206 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
32207 case ISD::FP_TO_FP16:
32208 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
32209 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
32210 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
32211 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
32212 case ISD::FADD:
32213 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
32214 case ISD::FROUND: return LowerFROUND(Op, DAG);
32215 case ISD::FABS:
32216 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
32217 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
32218 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
32219 case ISD::LRINT:
32220 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
32221 case ISD::SETCC:
32222 case ISD::STRICT_FSETCC:
32223 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
32224 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
32225 case ISD::SELECT: return LowerSELECT(Op, DAG);
32226 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
32227 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
32228 case ISD::VASTART: return LowerVASTART(Op, DAG);
32229 case ISD::VAARG: return LowerVAARG(Op, DAG);
32230 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
32231 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
32232 case ISD::INTRINSIC_VOID:
32233 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
32234 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
32235 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
32236 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
32237 case ISD::FRAME_TO_ARGS_OFFSET:
32238 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
32239 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
32240 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
32241 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
32242 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
32243 case ISD::EH_SJLJ_SETUP_DISPATCH:
32244 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
32245 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
32246 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
32247 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
32248 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
32249 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
32250 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
32251 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
32252 case ISD::CTLZ:
32253 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
32254 case ISD::CTTZ:
32255 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
32256 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
32257 case ISD::MULHS:
32258 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
32259 case ISD::ROTL:
32260 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
32261 case ISD::SRA:
32262 case ISD::SRL:
32263 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
32264 case ISD::SADDO:
32265 case ISD::UADDO:
32266 case ISD::SSUBO:
32267 case ISD::USUBO: return LowerXALUO(Op, DAG);
32268 case ISD::SMULO:
32269 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
32270 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
32271 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
32272 case ISD::SADDO_CARRY:
32273 case ISD::SSUBO_CARRY:
32274 case ISD::UADDO_CARRY:
32275 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
32276 case ISD::ADD:
32277 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
32278 case ISD::UADDSAT:
32279 case ISD::SADDSAT:
32280 case ISD::USUBSAT:
32281 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
32282 case ISD::SMAX:
32283 case ISD::SMIN:
32284 case ISD::UMAX:
32285 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
32286 case ISD::FMINIMUM:
32287 case ISD::FMAXIMUM:
32288 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
32289 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
32290 case ISD::ABDS:
32291 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
32292 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
32293 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
32294 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
32295 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
32296 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
32297 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
32298 case ISD::GC_TRANSITION_START:
32299 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
32300 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
32301 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
32302 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
32303 // clang-format on
32304 }
32305}
32306
32307/// Replace a node with an illegal result type with a new node built out of
32308/// custom code.
32309 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
32310 SmallVectorImpl<SDValue> &Results,
32311 SelectionDAG &DAG) const {
32312 SDLoc dl(N);
32313 switch (N->getOpcode()) {
32314 default:
32315#ifndef NDEBUG
32316 dbgs() << "ReplaceNodeResults: ";
32317 N->dump(&DAG);
32318#endif
32319 llvm_unreachable("Do not know how to custom type legalize this operation!");
32320 case X86ISD::CVTPH2PS: {
32321 EVT VT = N->getValueType(0);
32322 SDValue Lo, Hi;
32323 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
32324 EVT LoVT, HiVT;
32325 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32326 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
32327 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
32328 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32329 Results.push_back(Res);
32330 return;
32331 }
32332 case X86ISD::STRICT_CVTPH2PS: {
32333 EVT VT = N->getValueType(0);
32334 SDValue Lo, Hi;
32335 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
32336 EVT LoVT, HiVT;
32337 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32338 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
32339 {N->getOperand(0), Lo});
32340 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
32341 {N->getOperand(0), Hi});
32342 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32343 Lo.getValue(1), Hi.getValue(1));
32344 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32345 Results.push_back(Res);
32346 Results.push_back(Chain);
32347 return;
32348 }
32349 case X86ISD::CVTPS2PH:
32350 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
32351 return;
32352 case ISD::CTPOP: {
32353 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32354 // If we have at most 32 active bits, then perform as i32 CTPOP.
32355 // TODO: Perform this in generic legalizer?
32356 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
32357 unsigned LZ = Known.countMinLeadingZeros();
32358 unsigned TZ = Known.countMinTrailingZeros();
32359 if ((LZ + TZ) >= 32) {
32360 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
32361 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
32362 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
32363 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
32364 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
32365 Results.push_back(Op);
32366 return;
32367 }
32368 // Use a v2i64 if possible.
32369 bool NoImplicitFloatOps =
32370 DAG.getMachineFunction().getFunction().hasFnAttribute(
32371 Attribute::NoImplicitFloat);
32372 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
32373 SDValue Wide =
32374 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
32375 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
32376 // Bit count should fit in 32-bits, extract it as that and then zero
32377 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
32378 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
32379 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
32380 DAG.getIntPtrConstant(0, dl));
32381 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
32382 Results.push_back(Wide);
32383 }
32384 return;
32385 }
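// Example for the known-bits shortcut above: an i64 value with at most 32
// active bits (LZ + TZ >= 32) is shifted right by TZ, truncated, and counted as
//   zext i64 (ctpop i32 (trunc (srl x, TZ)))
// so a single 32-bit popcount suffices instead of expanding a 64-bit CTPOP on
// a 32-bit target; otherwise the value is counted as a v2i64 vector and the
// low 32 bits of the result are extracted.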
32386 case ISD::MUL: {
32387 EVT VT = N->getValueType(0);
32388 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32389 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
32390 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
32391 // elements are needed.
32392 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
32393 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
32394 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
32395 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
32396 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32397 unsigned NumConcats = 16 / VT.getVectorNumElements();
32398 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32399 ConcatOps[0] = Res;
32400 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
32401 Results.push_back(Res);
32402 return;
32403 }
32404 case ISD::SMULO:
32405 case ISD::UMULO: {
32406 EVT VT = N->getValueType(0);
32408 VT == MVT::v2i32 && "Unexpected VT!");
32409 bool IsSigned = N->getOpcode() == ISD::SMULO;
32410 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
32411 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
32412 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
32413 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
32414 // Extract the high 32 bits from each result using PSHUFD.
32415 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
32416 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
32417 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
32418 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
32419 DAG.getIntPtrConstant(0, dl));
32420
32421 // Truncate the low bits of the result. This will become PSHUFD.
32422 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32423
32424 SDValue HiCmp;
32425 if (IsSigned) {
32426 // SMULO overflows if the high bits don't match the sign of the low.
32427 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
32428 } else {
32429 // UMULO overflows if the high bits are non-zero.
32430 HiCmp = DAG.getConstant(0, dl, VT);
32431 }
32432 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
32433
32434 // Widen the result by padding with undef.
32435 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32436 DAG.getUNDEF(VT));
32437 Results.push_back(Res);
32438 Results.push_back(Ovf);
32439 return;
32440 }
32441 case X86ISD::VPMADDWD: {
32442 // Legalize types for X86ISD::VPMADDWD by widening.
32443 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32444
32445 EVT VT = N->getValueType(0);
32446 EVT InVT = N->getOperand(0).getValueType();
32447 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
32448 "Expected a VT that divides into 128 bits.");
32450 "Unexpected type action!");
32451 unsigned NumConcat = 128 / InVT.getSizeInBits();
32452
32453 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
32454 InVT.getVectorElementType(),
32455 NumConcat * InVT.getVectorNumElements());
32456 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
32457 VT.getVectorElementType(),
32458 NumConcat * VT.getVectorNumElements());
32459
32460 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
32461 Ops[0] = N->getOperand(0);
32462 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32463 Ops[0] = N->getOperand(1);
32464 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32465
32466 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
32467 Results.push_back(Res);
32468 return;
32469 }
32470 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
32471 case X86ISD::FMINC:
32472 case X86ISD::FMIN:
32473 case X86ISD::FMAXC:
32474 case X86ISD::FMAX: {
32475 EVT VT = N->getValueType(0);
32476 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
32477 SDValue UNDEF = DAG.getUNDEF(VT);
32478 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32479 N->getOperand(0), UNDEF);
32480 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32481 N->getOperand(1), UNDEF);
32482 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
32483 return;
32484 }
32485 case ISD::SDIV:
32486 case ISD::UDIV:
32487 case ISD::SREM:
32488 case ISD::UREM: {
32489 EVT VT = N->getValueType(0);
32490 if (VT.isVector()) {
32492 "Unexpected type action!");
32493 // If the RHS is a constant splat vector, we can widen this and let the
32494 // division/remainder-by-constant optimization handle it.
32495 // TODO: Can we do something for non-splat?
32496 APInt SplatVal;
32497 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
32498 unsigned NumConcats = 128 / VT.getSizeInBits();
32499 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
32500 Ops0[0] = N->getOperand(0);
32501 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
32502 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
32503 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
32504 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
32505 Results.push_back(Res);
32506 }
32507 return;
32508 }
32509
32510 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
32511 Results.push_back(V);
32512 return;
32513 }
32514 case ISD::TRUNCATE: {
32515 MVT VT = N->getSimpleValueType(0);
32516 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
32517 return;
32518
32519 // The generic legalizer will try to widen the input type to the same
32520 // number of elements as the widened result type. But this isn't always
32521 // the best thing so do some custom legalization to avoid some cases.
32522 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
32523 SDValue In = N->getOperand(0);
32524 EVT InVT = In.getValueType();
32525 EVT InEltVT = InVT.getVectorElementType();
32526 EVT EltVT = VT.getVectorElementType();
32527 unsigned MinElts = VT.getVectorNumElements();
32528 unsigned WidenNumElts = WidenVT.getVectorNumElements();
32529 unsigned InBits = InVT.getSizeInBits();
32530
32531 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
32532 unsigned PackOpcode;
32533 if (SDValue Src =
32534 matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG, Subtarget)) {
32535 if (SDValue Res = truncateVectorWithPACK(PackOpcode, VT, Src,
32536 dl, DAG, Subtarget)) {
32537 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
32538 Results.push_back(Res);
32539 return;
32540 }
32541 }
32542
32543 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
32544 // 128 bit and smaller inputs should avoid truncate altogether and
32545 // use a shuffle.
32546 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
32547 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
32548 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
32549 for (unsigned I = 0; I < MinElts; ++I)
32550 TruncMask[I] = Scale * I;
32551 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
32552 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
32553 "Illegal vector type in truncation");
32554 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
32555 Results.push_back(
32556 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
32557 return;
32558 }
32559 }
32560
32561 // With AVX512 there are some cases that can use a target specific
32562 // truncate node to go from 256/512 to less than 128 with zeros in the
32563 // upper elements of the 128 bit result.
32564 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
32565 // We can use VTRUNC directly for 256 bits with VLX or for any 512-bit input.
32566 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
32567 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32568 return;
32569 }
32570 // There's one case we can widen to 512 bits and use VTRUNC.
32571 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
32572 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
32573 DAG.getUNDEF(MVT::v4i64));
32574 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32575 return;
32576 }
32577 }
32578 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
32579 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
32580 isTypeLegal(MVT::v4i64)) {
32581 // Input needs to be split and output needs to be widened. Let's use two
32582 // VTRUNCs, and shuffle their results together into the wider type.
32583 SDValue Lo, Hi;
32584 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
32585
32586 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
32587 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
32588 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
32589 { 0, 1, 2, 3, 16, 17, 18, 19,
32590 -1, -1, -1, -1, -1, -1, -1, -1 });
32591 Results.push_back(Res);
32592 return;
32593 }
32594
32595 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
32596 // this via type legalization.
32597 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
32598 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
32599 (!Subtarget.hasSSSE3() ||
32600 (!isTypeLegal(InVT) &&
32601 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
32602 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
32603 InEltVT.getSizeInBits() * WidenNumElts);
32604 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
32605 return;
32606 }
32607
32608 return;
32609 }
32610 case ISD::ANY_EXTEND:
32611 // Right now, only MVT::v8i8 has Custom action for an illegal type.
32612 // It's intended to custom handle the input type.
32613 assert(N->getValueType(0) == MVT::v8i8 &&
32614 "Do not know how to legalize this Node");
32615 return;
32616 case ISD::SIGN_EXTEND:
32617 case ISD::ZERO_EXTEND: {
32618 EVT VT = N->getValueType(0);
32619 SDValue In = N->getOperand(0);
32620 EVT InVT = In.getValueType();
32621 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
32622 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
32624 "Unexpected type action!");
32625 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
32626 // Custom split this so we can extend i8/i16->i32 in a vector. This is better
32627 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using sra,
32628 // then an extend from i32 to i64 using pcmpgt. By custom splitting we allow
32629 // the sra from the extend to i32 to be shared by the split.
32630 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
32631
32632 // Fill a vector with sign bits for each element.
32633 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
32634 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
32635
32636 // Create an unpackl and unpackh to interleave the sign bits then bitcast
32637 // to v2i64.
32638 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32639 {0, 4, 1, 5});
32640 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
32641 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32642 {2, 6, 3, 7});
32643 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
32644
32645 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32646 Results.push_back(Res);
32647 return;
32648 }
32649
32650 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
32651 if (!InVT.is128BitVector()) {
32652 // Not a 128 bit vector, but maybe type legalization will promote
32653 // it to 128 bits.
32654 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
32655 return;
32656 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
32657 if (!InVT.is128BitVector())
32658 return;
32659
32660 // Promote the input to 128 bits. Type legalization will turn this into
32661 // zext_inreg/sext_inreg.
32662 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
32663 }
32664
32665 // Perform custom splitting instead of the two stage extend we would get
32666 // by default.
32667 EVT LoVT, HiVT;
32668 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
32669 assert(isTypeLegal(LoVT) && "Split VT not legal?");
32670
32671 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
32672
32673 // We need to shift the input over by half the number of elements.
32674 unsigned NumElts = InVT.getVectorNumElements();
32675 unsigned HalfNumElts = NumElts / 2;
32676 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
32677 for (unsigned i = 0; i != HalfNumElts; ++i)
32678 ShufMask[i] = i + HalfNumElts;
32679
32680 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
32681 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
32682
32683 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32684 Results.push_back(Res);
32685 }
32686 return;
32687 }
32688 case ISD::FP_TO_SINT:
32689 case ISD::STRICT_FP_TO_SINT:
32690 case ISD::FP_TO_UINT:
32691 case ISD::STRICT_FP_TO_UINT: {
32692 bool IsStrict = N->isStrictFPOpcode();
32693 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
32694 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
32695 EVT VT = N->getValueType(0);
32696 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32697 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
32698 EVT SrcVT = Src.getValueType();
32699
32700 SDValue Res;
32701 if (isSoftF16(SrcVT, Subtarget)) {
32702 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
32703 if (IsStrict) {
32704 Res =
32705 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
32706 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
32707 {NVT, MVT::Other}, {Chain, Src})});
32708 Chain = Res.getValue(1);
32709 } else {
32710 Res = DAG.getNode(N->getOpcode(), dl, VT,
32711 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
32712 }
32713 Results.push_back(Res);
32714 if (IsStrict)
32715 Results.push_back(Chain);
32716
32717 return;
32718 }
32719
32720 if (VT.isVector() && Subtarget.hasFP16() &&
32721 SrcVT.getVectorElementType() == MVT::f16) {
32722 EVT EleVT = VT.getVectorElementType();
32723 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
32724
32725 if (SrcVT != MVT::v8f16) {
32726 SDValue Tmp =
32727 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
32728 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
32729 Ops[0] = Src;
32730 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
32731 }
32732
32733 if (IsStrict) {
32734 unsigned Opc =
32735 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32736 Res =
32737 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
32738 Chain = Res.getValue(1);
32739 } else {
32740 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32741 Res = DAG.getNode(Opc, dl, ResVT, Src);
32742 }
32743
32744 // TODO: Need to add exception check code for strict FP.
32745 if (EleVT.getSizeInBits() < 16) {
32746 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
32747 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
32748
32749 // Now widen to 128 bits.
32750 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
32751 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
32752 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
32753 ConcatOps[0] = Res;
32754 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32755 }
32756
32757 Results.push_back(Res);
32758 if (IsStrict)
32759 Results.push_back(Chain);
32760
32761 return;
32762 }
32763
32764 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
32766 "Unexpected type action!");
32767
32768 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
32769 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
32770 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
32771 VT.getVectorNumElements());
32772 SDValue Res;
32773 SDValue Chain;
32774 if (IsStrict) {
32775 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
32776 {N->getOperand(0), Src});
32777 Chain = Res.getValue(1);
32778 } else
32779 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
32780
32781 // Preserve what we know about the size of the original result. If the
32782 // result is v2i32, we have to manually widen the assert.
32783 if (PromoteVT == MVT::v2i32)
32784 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32785 DAG.getUNDEF(MVT::v2i32));
32786
32787 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
32788 Res.getValueType(), Res,
32789 DAG.getValueType(VT.getVectorElementType()));
32790
32791 if (PromoteVT == MVT::v2i32)
32792 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
32793 DAG.getIntPtrConstant(0, dl));
32794
32795 // Truncate back to the original width.
32796 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32797
32798 // Now widen to 128 bits.
32799 unsigned NumConcats = 128 / VT.getSizeInBits();
32800 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
32801 VT.getVectorNumElements() * NumConcats);
32802 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32803 ConcatOps[0] = Res;
32804 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32805 Results.push_back(Res);
32806 if (IsStrict)
32807 Results.push_back(Chain);
32808 return;
32809 }
32810
32811
32812 if (VT == MVT::v2i32) {
32813 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
32814 "Strict unsigned conversion requires AVX512");
32815 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32817 "Unexpected type action!");
32818 if (Src.getValueType() == MVT::v2f64) {
32819 if (!IsSigned && !Subtarget.hasAVX512()) {
32820 SDValue Res =
32821 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
32822 Results.push_back(Res);
32823 return;
32824 }
32825
32826 unsigned Opc;
32827 if (IsStrict)
32828 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32829 else
32830 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32831
32832 // If we have VLX we can emit a target specific FP_TO_UINT node.
32833 if (!IsSigned && !Subtarget.hasVLX()) {
32834 // Otherwise we can defer to the generic legalizer which will widen
32835 // the input as well. This will be further widened during op
32836 // legalization to v8i32<-v8f64.
32837 // For strict nodes we'll need to widen ourselves.
32838 // FIXME: Fix the type legalizer to safely widen strict nodes?
32839 if (!IsStrict)
32840 return;
32841 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
32842 DAG.getConstantFP(0.0, dl, MVT::v2f64));
32843 Opc = N->getOpcode();
32844 }
32845 SDValue Res;
32846 SDValue Chain;
32847 if (IsStrict) {
32848 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
32849 {N->getOperand(0), Src});
32850 Chain = Res.getValue(1);
32851 } else {
32852 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
32853 }
32854 Results.push_back(Res);
32855 if (IsStrict)
32856 Results.push_back(Chain);
32857 return;
32858 }
32859
32860 // Custom widen strict v2f32->v2i32 by padding with zeros.
32861 // FIXME: Should generic type legalizer do this?
32862 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
32863 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
32864 DAG.getConstantFP(0.0, dl, MVT::v2f32));
32865 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
32866 {N->getOperand(0), Src});
32867 Results.push_back(Res);
32868 Results.push_back(Res.getValue(1));
32869 return;
32870 }
32871
32872 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
32873 // so early out here.
32874 return;
32875 }
32876
32877 assert(!VT.isVector() && "Vectors should have been handled above!");
32878
32879 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
32880 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
32881 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
32882 assert(!Subtarget.is64Bit() && "i64 should be legal");
32883 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
32884 // If we use a 128-bit result we might need to use a target specific node.
32885 unsigned SrcElts =
32886 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
32887 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
32888 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
32889 unsigned Opc = N->getOpcode();
32890 if (NumElts != SrcElts) {
32891 if (IsStrict)
32892 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32893 else
32894 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32895 }
32896
32897 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
32898 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
32899 DAG.getConstantFP(0.0, dl, VecInVT), Src,
32900 ZeroIdx);
32901 SDValue Chain;
32902 if (IsStrict) {
32903 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
32904 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
32905 Chain = Res.getValue(1);
32906 } else
32907 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
32908 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
32909 Results.push_back(Res);
32910 if (IsStrict)
32911 Results.push_back(Chain);
32912 return;
32913 }
32914
32915 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
32916 SDValue Chain;
32917 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
32918 Results.push_back(V);
32919 if (IsStrict)
32920 Results.push_back(Chain);
32921 return;
32922 }
32923
32924 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
32925 Results.push_back(V);
32926 if (IsStrict)
32927 Results.push_back(Chain);
32928 }
32929 return;
32930 }
32931 case ISD::LRINT:
32932 case ISD::LLRINT: {
32933 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
32934 Results.push_back(V);
32935 return;
32936 }
32937
32938 case ISD::SINT_TO_FP:
32939 case ISD::STRICT_SINT_TO_FP:
32940 case ISD::UINT_TO_FP:
32941 case ISD::STRICT_UINT_TO_FP: {
32942 bool IsStrict = N->isStrictFPOpcode();
32943 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
32944 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
32945 EVT VT = N->getValueType(0);
32946 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32947 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
32948 Subtarget.hasVLX()) {
32949 if (Src.getValueType().getVectorElementType() == MVT::i16)
32950 return;
32951
32952 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
32953 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
32954 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
32955 : DAG.getUNDEF(MVT::v2i32));
32956 if (IsStrict) {
32957 unsigned Opc =
32958 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
32959 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
32960 {N->getOperand(0), Src});
32961 Results.push_back(Res);
32962 Results.push_back(Res.getValue(1));
32963 } else {
32964 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32965 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
32966 }
32967 return;
32968 }
32969 if (VT != MVT::v2f32)
32970 return;
32971 EVT SrcVT = Src.getValueType();
32972 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
32973 if (IsStrict) {
32974 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
32975 : X86ISD::STRICT_CVTUI2P;
32976 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
32977 {N->getOperand(0), Src});
32978 Results.push_back(Res);
32979 Results.push_back(Res.getValue(1));
32980 } else {
32981 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32982 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
32983 }
32984 return;
32985 }
32986 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
32987 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
32988 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
32989 SDValue One = DAG.getConstant(1, dl, SrcVT);
32990 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
32991 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
32992 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
32993 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
32994 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
32995 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
32996 for (int i = 0; i != 2; ++i) {
32997 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
32998 SignSrc, DAG.getIntPtrConstant(i, dl));
32999 if (IsStrict)
33000 SignCvts[i] =
33001 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
33002 {N->getOperand(0), Elt});
33003 else
33004 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
33005 };
33006 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
33007 SDValue Slow, Chain;
33008 if (IsStrict) {
33009 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33010 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
33011 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
33012 {Chain, SignCvt, SignCvt});
33013 Chain = Slow.getValue(1);
33014 } else {
33015 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
33016 }
33017 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
33018 IsNeg =
33019 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
33020 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
33021 Results.push_back(Cvt);
33022 if (IsStrict)
33023 Results.push_back(Chain);
33024 return;
33025 }
33026
33027 if (SrcVT != MVT::v2i32)
33028 return;
33029
33030 if (IsSigned || Subtarget.hasAVX512()) {
33031 if (!IsStrict)
33032 return;
33033
33034 // Custom widen strict v2i32->v2f32 to avoid scalarization.
33035 // FIXME: Should generic type legalizer do this?
33036 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
33037 DAG.getConstant(0, dl, MVT::v2i32));
33038 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
33039 {N->getOperand(0), Src});
33040 Results.push_back(Res);
33041 Results.push_back(Res.getValue(1));
33042 return;
33043 }
33044
33045 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33046 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
33047 SDValue VBias = DAG.getConstantFP(
33048 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
33049 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
33050 DAG.getBitcast(MVT::v2i64, VBias));
33051 Or = DAG.getBitcast(MVT::v2f64, Or);
33052 if (IsStrict) {
33053 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
33054 {N->getOperand(0), Or, VBias});
33055 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
33056 {MVT::v4f32, MVT::Other},
33057 {Sub.getValue(1), Sub});
33058 Results.push_back(Res);
33059 Results.push_back(Res.getValue(1));
33060 } else {
33061 // TODO: Are there any fast-math-flags to propagate here?
33062 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
33063 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
33064 }
33065 return;
33066 }
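// Note on the SSE2 fallback just above: unsigned v2i32 -> v2f32 uses the
// classic exponent-bias trick. The i32 lanes are zero-extended to i64, OR'ed
// with 0x4330000000000000 (the bit pattern of 2^52 as a double), reinterpreted
// as f64, and then the same bias is subtracted, leaving the exact integer
// value, which is finally rounded to f32 via X86ISD::VFPROUND.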
33067 case ISD::STRICT_FP_ROUND:
33068 case ISD::FP_ROUND: {
33069 bool IsStrict = N->isStrictFPOpcode();
33070 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
33071 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33072 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
33073 EVT SrcVT = Src.getValueType();
33074 EVT VT = N->getValueType(0);
33075 SDValue V;
33076 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
33077 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
33078 : DAG.getUNDEF(MVT::v2f32);
33079 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
33080 }
33081 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
33082 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
33083 if (SrcVT.getVectorElementType() != MVT::f32)
33084 return;
33085
33086 if (IsStrict)
33087 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
33088 {Chain, Src, Rnd});
33089 else
33090 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
33091
33092 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
33093 if (IsStrict)
33094 Results.push_back(V.getValue(1));
33095 return;
33096 }
33097 if (!isTypeLegal(Src.getValueType()))
33098 return;
33099 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
33100 if (IsStrict)
33101 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
33102 {Chain, Src});
33103 else
33104 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
33105 Results.push_back(V);
33106 if (IsStrict)
33107 Results.push_back(V.getValue(1));
33108 return;
33109 }
33110 case ISD::FP_EXTEND:
33111 case ISD::STRICT_FP_EXTEND: {
33112 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
33113 // No other ValueType for FP_EXTEND should reach this point.
33114 assert(N->getValueType(0) == MVT::v2f32 &&
33115 "Do not know how to legalize this Node");
33116 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
33117 return;
33118 bool IsStrict = N->isStrictFPOpcode();
33119 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33120 if (Src.getValueType().getVectorElementType() != MVT::f16)
33121 return;
33122 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
33123 : DAG.getUNDEF(MVT::v2f16);
33124 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
33125 if (IsStrict)
33126 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
33127 {N->getOperand(0), V});
33128 else
33129 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
33130 Results.push_back(V);
33131 if (IsStrict)
33132 Results.push_back(V.getValue(1));
33133 return;
33134 }
33135 case ISD::INTRINSIC_W_CHAIN: {
33136 unsigned IntNo = N->getConstantOperandVal(1);
33137 switch (IntNo) {
33138 default : llvm_unreachable("Do not know how to custom type "
33139 "legalize this intrinsic operation!");
33140 case Intrinsic::x86_rdtsc:
33141 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
33142 Results);
33143 case Intrinsic::x86_rdtscp:
33144 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
33145 Results);
33146 case Intrinsic::x86_rdpmc:
33147 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
33148 Results);
33149 return;
33150 case Intrinsic::x86_rdpru:
33151 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
33152 Results);
33153 return;
33154 case Intrinsic::x86_xgetbv:
33155 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
33156 Results);
33157 return;
33158 }
33159 }
33160 case ISD::READCYCLECOUNTER: {
33161 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
33162 }
33163 case ISD::ATOMIC_CMP_SWAP: {
33164 EVT T = N->getValueType(0);
33165 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
33166 bool Regs64bit = T == MVT::i128;
33167 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
33168 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
33169 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
33170 SDValue cpInL, cpInH;
33171 std::tie(cpInL, cpInH) =
33172 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
33173 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
33174 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
33175 cpInH =
33176 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
33177 cpInH, cpInL.getValue(1));
33178 SDValue swapInL, swapInH;
33179 std::tie(swapInL, swapInH) =
33180 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
33181 swapInH =
33182 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
33183 swapInH, cpInH.getValue(1));
33184
33185 // In 64-bit mode we might need the base pointer in RBX, but we can't know
33186 // until later. So we keep the RBX input in a vreg and use a custom
33187 // inserter.
33188 // Since RBX will be a reserved register the register allocator will not
33189 // make sure its value will be properly saved and restored around this
33190 // live-range.
33191 SDValue Result;
33192 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
33193 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
33194 if (Regs64bit) {
33195 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
33196 swapInH.getValue(1)};
33197 Result =
33198 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
33199 } else {
33200 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
33201 swapInH.getValue(1));
33202 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
33203 swapInL.getValue(1)};
33204 Result =
33205 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
33206 }
33207
33208 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
33209 Regs64bit ? X86::RAX : X86::EAX,
33210 HalfT, Result.getValue(1));
33211 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
33212 Regs64bit ? X86::RDX : X86::EDX,
33213 HalfT, cpOutL.getValue(2));
33214 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
33215
33216 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
33217 MVT::i32, cpOutH.getValue(2));
33218 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
33219 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
33220
33221 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
33222 Results.push_back(Success);
33223 Results.push_back(EFLAGS.getValue(1));
33224 return;
33225 }
33226 case ISD::ATOMIC_LOAD: {
33227 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33228 bool NoImplicitFloatOps =
33229 DAG.getMachineFunction().getFunction().hasFnAttribute(
33230 Attribute::NoImplicitFloat);
33231 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
33232 auto *Node = cast<AtomicSDNode>(N);
33233 if (Subtarget.hasSSE1()) {
33234 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
33235 // Then extract the lower 64-bits.
33236 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33237 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
33238 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
33239 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
33240 MVT::i64, Node->getMemOperand());
33241 if (Subtarget.hasSSE2()) {
33242 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
33243 DAG.getIntPtrConstant(0, dl));
33244 Results.push_back(Res);
33245 Results.push_back(Ld.getValue(1));
33246 return;
33247 }
33248 // We use an alternative sequence for SSE1 that extracts as v2f32 and
33249 // then casts to i64. This avoids a 128-bit stack temporary being
33250 // created by type legalization if we were to cast v4f32->v2i64.
33251 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
33252 DAG.getIntPtrConstant(0, dl));
33253 Res = DAG.getBitcast(MVT::i64, Res);
33254 Results.push_back(Res);
33255 Results.push_back(Ld.getValue(1));
33256 return;
33257 }
33258 if (Subtarget.hasX87()) {
33259 // First load this into an 80-bit X87 register. This will put the whole
33260 // integer into the significand.
33261 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33262 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
33263 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
33264 dl, Tys, Ops, MVT::i64,
33265 Node->getMemOperand());
33266 SDValue Chain = Result.getValue(1);
33267
33268 // Now store the X87 register to a stack temporary and convert to i64.
33269 // This store is not atomic and doesn't need to be.
33270 // FIXME: We don't need a stack temporary if the result of the load
33271 // is already being stored. We could just directly store there.
33272 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33273 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33274 MachinePointerInfo MPI =
33275 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33276 SDValue StoreOps[] = { Chain, Result, StackPtr };
33277 Chain = DAG.getMemIntrinsicNode(
33278 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
33279 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
33280
33281 // Finally load the value back from the stack temporary and return it.
33282 // This load is not atomic and doesn't need to be.
33283 // This load will be further type legalized.
33284 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
33285 Results.push_back(Result);
33286 Results.push_back(Result.getValue(1));
33287 return;
33288 }
33289 }
33290 // TODO: Use MOVLPS when SSE1 is available?
33291 // Delegate to generic TypeLegalization. Situations we can really handle
33292 // should have already been dealt with by AtomicExpandPass.cpp.
33293 break;
33294 }
33295 case ISD::ATOMIC_SWAP:
33296 case ISD::ATOMIC_LOAD_ADD:
33297 case ISD::ATOMIC_LOAD_SUB:
33298 case ISD::ATOMIC_LOAD_AND:
33299 case ISD::ATOMIC_LOAD_OR:
33300 case ISD::ATOMIC_LOAD_XOR:
33301 case ISD::ATOMIC_LOAD_NAND:
33302 case ISD::ATOMIC_LOAD_MIN:
33303 case ISD::ATOMIC_LOAD_MAX:
33304 case ISD::ATOMIC_LOAD_UMIN:
33305 case ISD::ATOMIC_LOAD_UMAX:
33306 // Delegate to generic TypeLegalization. Situations we can really handle
33307 // should have already been dealt with by AtomicExpandPass.cpp.
33308 break;
33309
33310 case ISD::BITCAST: {
33311 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33312 EVT DstVT = N->getValueType(0);
33313 EVT SrcVT = N->getOperand(0).getValueType();
33314
33315 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
33316 // we can split using the k-register rather than memory.
33317 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
33318 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
33319 SDValue Lo, Hi;
33320 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33321 Lo = DAG.getBitcast(MVT::i32, Lo);
33322 Hi = DAG.getBitcast(MVT::i32, Hi);
33323 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
33324 Results.push_back(Res);
33325 return;
33326 }
33327
33328 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
33329 // FIXME: Use v4f32 for SSE1?
33330 assert(Subtarget.hasSSE2() && "Requires SSE2");
33331 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
33332 "Unexpected type action!");
33333 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
33334 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
33335 N->getOperand(0));
33336 Res = DAG.getBitcast(WideVT, Res);
33337 Results.push_back(Res);
33338 return;
33339 }
33340
33341 return;
33342 }
33343 case ISD::MGATHER: {
33344 EVT VT = N->getValueType(0);
33345 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
33346 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
33347 auto *Gather = cast<MaskedGatherSDNode>(N);
33348 SDValue Index = Gather->getIndex();
33349 if (Index.getValueType() != MVT::v2i64)
33350 return;
33351 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33352 "Unexpected type action!");
33353 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
33354 SDValue Mask = Gather->getMask();
33355 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33356 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
33357 Gather->getPassThru(),
33358 DAG.getUNDEF(VT));
33359 if (!Subtarget.hasVLX()) {
33360 // We need to widen the mask, but the instruction will only use 2
33361 // of its elements. So we can use undef.
33362 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
33363 DAG.getUNDEF(MVT::v2i1));
33364 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
33365 }
33366 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
33367 Gather->getBasePtr(), Index, Gather->getScale() };
33368 SDValue Res = DAG.getMemIntrinsicNode(
33369 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
33370 Gather->getMemoryVT(), Gather->getMemOperand());
33371 Results.push_back(Res);
33372 Results.push_back(Res.getValue(1));
33373 return;
33374 }
33375 return;
33376 }
33377 case ISD::LOAD: {
33378 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
33379 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
33380 // cast since type legalization will try to use an i64 load.
33381 MVT VT = N->getSimpleValueType(0);
33382 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
33383 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33384 "Unexpected type action!");
33385 if (!ISD::isNON_EXTLoad(N))
33386 return;
33387 auto *Ld = cast<LoadSDNode>(N);
33388 if (Subtarget.hasSSE2()) {
33389 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
33390 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
33391 Ld->getPointerInfo(), Ld->getOriginalAlign(),
33392 Ld->getMemOperand()->getFlags());
33393 SDValue Chain = Res.getValue(1);
33394 MVT VecVT = MVT::getVectorVT(LdVT, 2);
33395 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
33396 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
33397 Res = DAG.getBitcast(WideVT, Res);
33398 Results.push_back(Res);
33399 Results.push_back(Chain);
33400 return;
33401 }
33402 assert(Subtarget.hasSSE1() && "Expected SSE");
33403 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
33404 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
33405 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
33406 MVT::i64, Ld->getMemOperand());
33407 Results.push_back(Res);
33408 Results.push_back(Res.getValue(1));
33409 return;
33410 }
33411 case ISD::ADDRSPACECAST: {
33412 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
33413 Results.push_back(V);
33414 return;
33415 }
33416 case ISD::BITREVERSE: {
33417 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33418 assert(Subtarget.hasXOP() && "Expected XOP");
33419 // We can use VPPERM by copying to a vector register and back. We'll need
33420 // to move the scalar in two i32 pieces.
33421 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
33422 return;
33423 }
33424 case ISD::EXTRACT_VECTOR_ELT: {
33425 // f16 = extract vXf16 %vec, i64 %idx
33426 assert(N->getSimpleValueType(0) == MVT::f16 &&
33427 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
33428 assert(Subtarget.hasFP16() && "Expected FP16");
33429 SDValue VecOp = N->getOperand(0);
33430 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
33431 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
33432 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
33433 N->getOperand(1));
33434 Split = DAG.getBitcast(MVT::f16, Split);
33435 Results.push_back(Split);
33436 return;
33437 }
33438 }
33439}
33440
33441const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
33442 switch ((X86ISD::NodeType)Opcode) {
33443 case X86ISD::FIRST_NUMBER: break;
33444#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
33445 NODE_NAME_CASE(BSF)
33446 NODE_NAME_CASE(BSR)
33447 NODE_NAME_CASE(FSHL)
33448 NODE_NAME_CASE(FSHR)
33449 NODE_NAME_CASE(FAND)
33450 NODE_NAME_CASE(FANDN)
33451 NODE_NAME_CASE(FOR)
33452 NODE_NAME_CASE(FXOR)
33453 NODE_NAME_CASE(FILD)
33454 NODE_NAME_CASE(FIST)
33455 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
33456 NODE_NAME_CASE(FLD)
33457 NODE_NAME_CASE(FST)
33458 NODE_NAME_CASE(CALL)
33459 NODE_NAME_CASE(CALL_RVMARKER)
33460 NODE_NAME_CASE(BT)
33461 NODE_NAME_CASE(CMP)
33462 NODE_NAME_CASE(FCMP)
33463 NODE_NAME_CASE(STRICT_FCMP)
33464 NODE_NAME_CASE(STRICT_FCMPS)
33465 NODE_NAME_CASE(COMI)
33466 NODE_NAME_CASE(UCOMI)
33467 NODE_NAME_CASE(CMPM)
33468 NODE_NAME_CASE(CMPMM)
33469 NODE_NAME_CASE(STRICT_CMPM)
33470 NODE_NAME_CASE(CMPMM_SAE)
33471 NODE_NAME_CASE(SETCC)
33472 NODE_NAME_CASE(SETCC_CARRY)
33473 NODE_NAME_CASE(FSETCC)
33474 NODE_NAME_CASE(FSETCCM)
33475 NODE_NAME_CASE(FSETCCM_SAE)
33476 NODE_NAME_CASE(CMOV)
33477 NODE_NAME_CASE(BRCOND)
33478 NODE_NAME_CASE(RET_GLUE)
33479 NODE_NAME_CASE(IRET)
33480 NODE_NAME_CASE(REP_STOS)
33481 NODE_NAME_CASE(REP_MOVS)
33482 NODE_NAME_CASE(GlobalBaseReg)
33483 NODE_NAME_CASE(Wrapper)
33484 NODE_NAME_CASE(WrapperRIP)
33485 NODE_NAME_CASE(MOVQ2DQ)
33486 NODE_NAME_CASE(MOVDQ2Q)
33487 NODE_NAME_CASE(MMX_MOVD2W)
33488 NODE_NAME_CASE(MMX_MOVW2D)
33489 NODE_NAME_CASE(PEXTRB)
33490 NODE_NAME_CASE(PEXTRW)
33491 NODE_NAME_CASE(INSERTPS)
33492 NODE_NAME_CASE(PINSRB)
33493 NODE_NAME_CASE(PINSRW)
33494 NODE_NAME_CASE(PSHUFB)
33495 NODE_NAME_CASE(ANDNP)
33496 NODE_NAME_CASE(BLENDI)
33497 NODE_NAME_CASE(BLENDV)
33498 NODE_NAME_CASE(HADD)
33499 NODE_NAME_CASE(HSUB)
33500 NODE_NAME_CASE(FHADD)
33501 NODE_NAME_CASE(FHSUB)
33502 NODE_NAME_CASE(CONFLICT)
33503 NODE_NAME_CASE(FMAX)
33504 NODE_NAME_CASE(FMAXS)
33505 NODE_NAME_CASE(FMAX_SAE)
33506 NODE_NAME_CASE(FMAXS_SAE)
33507 NODE_NAME_CASE(FMIN)
33508 NODE_NAME_CASE(FMINS)
33509 NODE_NAME_CASE(FMIN_SAE)
33510 NODE_NAME_CASE(FMINS_SAE)
33511 NODE_NAME_CASE(FMAXC)
33512 NODE_NAME_CASE(FMINC)
33513 NODE_NAME_CASE(FRSQRT)
33514 NODE_NAME_CASE(FRCP)
33515 NODE_NAME_CASE(EXTRQI)
33516 NODE_NAME_CASE(INSERTQI)
33517 NODE_NAME_CASE(TLSADDR)
33518 NODE_NAME_CASE(TLSBASEADDR)
33519 NODE_NAME_CASE(TLSCALL)
33520 NODE_NAME_CASE(TLSDESC)
33521 NODE_NAME_CASE(EH_SJLJ_SETJMP)
33522 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
33523 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
33524 NODE_NAME_CASE(EH_RETURN)
33525 NODE_NAME_CASE(TC_RETURN)
33526 NODE_NAME_CASE(FNSTCW16m)
33527 NODE_NAME_CASE(FLDCW16m)
33528 NODE_NAME_CASE(FNSTENVm)
33529 NODE_NAME_CASE(FLDENVm)
33530 NODE_NAME_CASE(LCMPXCHG_DAG)
33531 NODE_NAME_CASE(LCMPXCHG8_DAG)
33532 NODE_NAME_CASE(LCMPXCHG16_DAG)
33533 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
33534 NODE_NAME_CASE(LADD)
33535 NODE_NAME_CASE(LSUB)
33536 NODE_NAME_CASE(LOR)
33537 NODE_NAME_CASE(LXOR)
33538 NODE_NAME_CASE(LAND)
33539 NODE_NAME_CASE(LBTS)
33540 NODE_NAME_CASE(LBTC)
33541 NODE_NAME_CASE(LBTR)
33542 NODE_NAME_CASE(LBTS_RM)
33543 NODE_NAME_CASE(LBTC_RM)
33544 NODE_NAME_CASE(LBTR_RM)
33545 NODE_NAME_CASE(AADD)
33546 NODE_NAME_CASE(AOR)
33547 NODE_NAME_CASE(AXOR)
33548 NODE_NAME_CASE(AAND)
33549 NODE_NAME_CASE(VZEXT_MOVL)
33550 NODE_NAME_CASE(VZEXT_LOAD)
33551 NODE_NAME_CASE(VEXTRACT_STORE)
33552 NODE_NAME_CASE(VTRUNC)
33553 NODE_NAME_CASE(VTRUNCS)
33554 NODE_NAME_CASE(VTRUNCUS)
33555 NODE_NAME_CASE(VMTRUNC)
33556 NODE_NAME_CASE(VMTRUNCS)
33557 NODE_NAME_CASE(VMTRUNCUS)
33558 NODE_NAME_CASE(VTRUNCSTORES)
33559 NODE_NAME_CASE(VTRUNCSTOREUS)
33560 NODE_NAME_CASE(VMTRUNCSTORES)
33561 NODE_NAME_CASE(VMTRUNCSTOREUS)
33562 NODE_NAME_CASE(VFPEXT)
33563 NODE_NAME_CASE(STRICT_VFPEXT)
33564 NODE_NAME_CASE(VFPEXT_SAE)
33565 NODE_NAME_CASE(VFPEXTS)
33566 NODE_NAME_CASE(VFPEXTS_SAE)
33567 NODE_NAME_CASE(VFPROUND)
33568 NODE_NAME_CASE(STRICT_VFPROUND)
33569 NODE_NAME_CASE(VMFPROUND)
33570 NODE_NAME_CASE(VFPROUND_RND)
33571 NODE_NAME_CASE(VFPROUNDS)
33572 NODE_NAME_CASE(VFPROUNDS_RND)
33573 NODE_NAME_CASE(VSHLDQ)
33574 NODE_NAME_CASE(VSRLDQ)
33575 NODE_NAME_CASE(VSHL)
33576 NODE_NAME_CASE(VSRL)
33577 NODE_NAME_CASE(VSRA)
33578 NODE_NAME_CASE(VSHLI)
33579 NODE_NAME_CASE(VSRLI)
33580 NODE_NAME_CASE(VSRAI)
33581 NODE_NAME_CASE(VSHLV)
33582 NODE_NAME_CASE(VSRLV)
33583 NODE_NAME_CASE(VSRAV)
33584 NODE_NAME_CASE(VROTLI)
33585 NODE_NAME_CASE(VROTRI)
33586 NODE_NAME_CASE(VPPERM)
33587 NODE_NAME_CASE(CMPP)
33588 NODE_NAME_CASE(STRICT_CMPP)
33589 NODE_NAME_CASE(PCMPEQ)
33590 NODE_NAME_CASE(PCMPGT)
33591 NODE_NAME_CASE(PHMINPOS)
33592 NODE_NAME_CASE(ADD)
33593 NODE_NAME_CASE(SUB)
33594 NODE_NAME_CASE(ADC)
33595 NODE_NAME_CASE(SBB)
33596 NODE_NAME_CASE(SMUL)
33597 NODE_NAME_CASE(UMUL)
33598 NODE_NAME_CASE(OR)
33599 NODE_NAME_CASE(XOR)
33600 NODE_NAME_CASE(AND)
33601 NODE_NAME_CASE(BEXTR)
33602 NODE_NAME_CASE(BEXTRI)
33603 NODE_NAME_CASE(BZHI)
33604 NODE_NAME_CASE(PDEP)
33605 NODE_NAME_CASE(PEXT)
33606 NODE_NAME_CASE(MUL_IMM)
33607 NODE_NAME_CASE(MOVMSK)
33608 NODE_NAME_CASE(PTEST)
33609 NODE_NAME_CASE(TESTP)
33610 NODE_NAME_CASE(KORTEST)
33611 NODE_NAME_CASE(KTEST)
33612 NODE_NAME_CASE(KADD)
33613 NODE_NAME_CASE(KSHIFTL)
33614 NODE_NAME_CASE(KSHIFTR)
33615 NODE_NAME_CASE(PACKSS)
33616 NODE_NAME_CASE(PACKUS)
33617 NODE_NAME_CASE(PALIGNR)
33618 NODE_NAME_CASE(VALIGN)
33619 NODE_NAME_CASE(VSHLD)
33620 NODE_NAME_CASE(VSHRD)
33621 NODE_NAME_CASE(VSHLDV)
33622 NODE_NAME_CASE(VSHRDV)
33623 NODE_NAME_CASE(PSHUFD)
33624 NODE_NAME_CASE(PSHUFHW)
33625 NODE_NAME_CASE(PSHUFLW)
33626 NODE_NAME_CASE(SHUFP)
33627 NODE_NAME_CASE(SHUF128)
33628 NODE_NAME_CASE(MOVLHPS)
33629 NODE_NAME_CASE(MOVHLPS)
33630 NODE_NAME_CASE(MOVDDUP)
33631 NODE_NAME_CASE(MOVSHDUP)
33632 NODE_NAME_CASE(MOVSLDUP)
33633 NODE_NAME_CASE(MOVSD)
33634 NODE_NAME_CASE(MOVSS)
33635 NODE_NAME_CASE(MOVSH)
33636 NODE_NAME_CASE(UNPCKL)
33637 NODE_NAME_CASE(UNPCKH)
33638 NODE_NAME_CASE(VBROADCAST)
33639 NODE_NAME_CASE(VBROADCAST_LOAD)
33640 NODE_NAME_CASE(VBROADCASTM)
33641 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
33642 NODE_NAME_CASE(VPERMILPV)
33643 NODE_NAME_CASE(VPERMILPI)
33644 NODE_NAME_CASE(VPERM2X128)
33645 NODE_NAME_CASE(VPERMV)
33646 NODE_NAME_CASE(VPERMV3)
33647 NODE_NAME_CASE(VPERMI)
33648 NODE_NAME_CASE(VPTERNLOG)
33649 NODE_NAME_CASE(VFIXUPIMM)
33650 NODE_NAME_CASE(VFIXUPIMM_SAE)
33651 NODE_NAME_CASE(VFIXUPIMMS)
33652 NODE_NAME_CASE(VFIXUPIMMS_SAE)
33653 NODE_NAME_CASE(VRANGE)
33654 NODE_NAME_CASE(VRANGE_SAE)
33655 NODE_NAME_CASE(VRANGES)
33656 NODE_NAME_CASE(VRANGES_SAE)
33657 NODE_NAME_CASE(PMULUDQ)
33658 NODE_NAME_CASE(PMULDQ)
33659 NODE_NAME_CASE(PSADBW)
33660 NODE_NAME_CASE(DBPSADBW)
33661 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
33662 NODE_NAME_CASE(VAARG_64)
33663 NODE_NAME_CASE(VAARG_X32)
33664 NODE_NAME_CASE(DYN_ALLOCA)
33665 NODE_NAME_CASE(MFENCE)
33666 NODE_NAME_CASE(SEG_ALLOCA)
33667 NODE_NAME_CASE(PROBED_ALLOCA)
33668 NODE_NAME_CASE(RDRAND)
33669 NODE_NAME_CASE(RDSEED)
33670 NODE_NAME_CASE(RDPKRU)
33671 NODE_NAME_CASE(WRPKRU)
33672 NODE_NAME_CASE(VPMADDUBSW)
33673 NODE_NAME_CASE(VPMADDWD)
33674 NODE_NAME_CASE(VPSHA)
33675 NODE_NAME_CASE(VPSHL)
33676 NODE_NAME_CASE(VPCOM)
33677 NODE_NAME_CASE(VPCOMU)
33678 NODE_NAME_CASE(VPERMIL2)
33679 NODE_NAME_CASE(FMSUB)
33680 NODE_NAME_CASE(STRICT_FMSUB)
33681 NODE_NAME_CASE(FNMADD)
33682 NODE_NAME_CASE(STRICT_FNMADD)
33683 NODE_NAME_CASE(FNMSUB)
33684 NODE_NAME_CASE(STRICT_FNMSUB)
33685 NODE_NAME_CASE(FMADDSUB)
33686 NODE_NAME_CASE(FMSUBADD)
33687 NODE_NAME_CASE(FMADD_RND)
33688 NODE_NAME_CASE(FNMADD_RND)
33689 NODE_NAME_CASE(FMSUB_RND)
33690 NODE_NAME_CASE(FNMSUB_RND)
33691 NODE_NAME_CASE(FMADDSUB_RND)
33692 NODE_NAME_CASE(FMSUBADD_RND)
33693 NODE_NAME_CASE(VFMADDC)
33694 NODE_NAME_CASE(VFMADDC_RND)
33695 NODE_NAME_CASE(VFCMADDC)
33696 NODE_NAME_CASE(VFCMADDC_RND)
33697 NODE_NAME_CASE(VFMULC)
33698 NODE_NAME_CASE(VFMULC_RND)
33699 NODE_NAME_CASE(VFCMULC)
33700 NODE_NAME_CASE(VFCMULC_RND)
33701 NODE_NAME_CASE(VFMULCSH)
33702 NODE_NAME_CASE(VFMULCSH_RND)
33703 NODE_NAME_CASE(VFCMULCSH)
33704 NODE_NAME_CASE(VFCMULCSH_RND)
33705 NODE_NAME_CASE(VFMADDCSH)
33706 NODE_NAME_CASE(VFMADDCSH_RND)
33707 NODE_NAME_CASE(VFCMADDCSH)
33708 NODE_NAME_CASE(VFCMADDCSH_RND)
33709 NODE_NAME_CASE(VPMADD52H)
33710 NODE_NAME_CASE(VPMADD52L)
33711 NODE_NAME_CASE(VRNDSCALE)
33712 NODE_NAME_CASE(STRICT_VRNDSCALE)
33713 NODE_NAME_CASE(VRNDSCALE_SAE)
33714 NODE_NAME_CASE(VRNDSCALES)
33715 NODE_NAME_CASE(VRNDSCALES_SAE)
33716 NODE_NAME_CASE(VREDUCE)
33717 NODE_NAME_CASE(VREDUCE_SAE)
33718 NODE_NAME_CASE(VREDUCES)
33719 NODE_NAME_CASE(VREDUCES_SAE)
33720 NODE_NAME_CASE(VGETMANT)
33721 NODE_NAME_CASE(VGETMANT_SAE)
33722 NODE_NAME_CASE(VGETMANTS)
33723 NODE_NAME_CASE(VGETMANTS_SAE)
33724 NODE_NAME_CASE(PCMPESTR)
33725 NODE_NAME_CASE(PCMPISTR)
33726 NODE_NAME_CASE(XTEST)
33727 NODE_NAME_CASE(COMPRESS)
33728 NODE_NAME_CASE(EXPAND)
33729 NODE_NAME_CASE(SELECTS)
33730 NODE_NAME_CASE(ADDSUB)
33731 NODE_NAME_CASE(RCP14)
33732 NODE_NAME_CASE(RCP14S)
33733 NODE_NAME_CASE(RCP28)
33734 NODE_NAME_CASE(RCP28_SAE)
33735 NODE_NAME_CASE(RCP28S)
33736 NODE_NAME_CASE(RCP28S_SAE)
33737 NODE_NAME_CASE(EXP2)
33738 NODE_NAME_CASE(EXP2_SAE)
33739 NODE_NAME_CASE(RSQRT14)
33740 NODE_NAME_CASE(RSQRT14S)
33741 NODE_NAME_CASE(RSQRT28)
33742 NODE_NAME_CASE(RSQRT28_SAE)
33743 NODE_NAME_CASE(RSQRT28S)
33744 NODE_NAME_CASE(RSQRT28S_SAE)
33745 NODE_NAME_CASE(FADD_RND)
33746 NODE_NAME_CASE(FADDS)
33747 NODE_NAME_CASE(FADDS_RND)
33748 NODE_NAME_CASE(FSUB_RND)
33749 NODE_NAME_CASE(FSUBS)
33750 NODE_NAME_CASE(FSUBS_RND)
33751 NODE_NAME_CASE(FMUL_RND)
33752 NODE_NAME_CASE(FMULS)
33753 NODE_NAME_CASE(FMULS_RND)
33754 NODE_NAME_CASE(FDIV_RND)
33755 NODE_NAME_CASE(FDIVS)
33756 NODE_NAME_CASE(FDIVS_RND)
33757 NODE_NAME_CASE(FSQRT_RND)
33758 NODE_NAME_CASE(FSQRTS)
33759 NODE_NAME_CASE(FSQRTS_RND)
33760 NODE_NAME_CASE(FGETEXP)
33761 NODE_NAME_CASE(FGETEXP_SAE)
33762 NODE_NAME_CASE(FGETEXPS)
33763 NODE_NAME_CASE(FGETEXPS_SAE)
33764 NODE_NAME_CASE(SCALEF)
33765 NODE_NAME_CASE(SCALEF_RND)
33766 NODE_NAME_CASE(SCALEFS)
33767 NODE_NAME_CASE(SCALEFS_RND)
33768 NODE_NAME_CASE(MULHRS)
33769 NODE_NAME_CASE(SINT_TO_FP_RND)
33770 NODE_NAME_CASE(UINT_TO_FP_RND)
33771 NODE_NAME_CASE(CVTTP2SI)
33772 NODE_NAME_CASE(CVTTP2UI)
33773 NODE_NAME_CASE(STRICT_CVTTP2SI)
33774 NODE_NAME_CASE(STRICT_CVTTP2UI)
33775 NODE_NAME_CASE(MCVTTP2SI)
33776 NODE_NAME_CASE(MCVTTP2UI)
33777 NODE_NAME_CASE(CVTTP2SI_SAE)
33778 NODE_NAME_CASE(CVTTP2UI_SAE)
33779 NODE_NAME_CASE(CVTTS2SI)
33780 NODE_NAME_CASE(CVTTS2UI)
33781 NODE_NAME_CASE(CVTTS2SI_SAE)
33782 NODE_NAME_CASE(CVTTS2UI_SAE)
33783 NODE_NAME_CASE(CVTSI2P)
33784 NODE_NAME_CASE(CVTUI2P)
33785 NODE_NAME_CASE(STRICT_CVTSI2P)
33786 NODE_NAME_CASE(STRICT_CVTUI2P)
33787 NODE_NAME_CASE(MCVTSI2P)
33788 NODE_NAME_CASE(MCVTUI2P)
33789 NODE_NAME_CASE(VFPCLASS)
33790 NODE_NAME_CASE(VFPCLASSS)
33791 NODE_NAME_CASE(MULTISHIFT)
33792 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
33793 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
33794 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
33795 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
33796 NODE_NAME_CASE(CVTPS2PH)
33797 NODE_NAME_CASE(STRICT_CVTPS2PH)
33798 NODE_NAME_CASE(CVTPS2PH_SAE)
33799 NODE_NAME_CASE(MCVTPS2PH)
33800 NODE_NAME_CASE(MCVTPS2PH_SAE)
33801 NODE_NAME_CASE(CVTPH2PS)
33802 NODE_NAME_CASE(STRICT_CVTPH2PS)
33803 NODE_NAME_CASE(CVTPH2PS_SAE)
33804 NODE_NAME_CASE(CVTP2SI)
33805 NODE_NAME_CASE(CVTP2UI)
33806 NODE_NAME_CASE(MCVTP2SI)
33807 NODE_NAME_CASE(MCVTP2UI)
33808 NODE_NAME_CASE(CVTP2SI_RND)
33809 NODE_NAME_CASE(CVTP2UI_RND)
33810 NODE_NAME_CASE(CVTS2SI)
33811 NODE_NAME_CASE(CVTS2UI)
33812 NODE_NAME_CASE(CVTS2SI_RND)
33813 NODE_NAME_CASE(CVTS2UI_RND)
33814 NODE_NAME_CASE(CVTNE2PS2BF16)
33815 NODE_NAME_CASE(CVTNEPS2BF16)
33816 NODE_NAME_CASE(MCVTNEPS2BF16)
33817 NODE_NAME_CASE(DPBF16PS)
33818 NODE_NAME_CASE(LWPINS)
33819 NODE_NAME_CASE(MGATHER)
33820 NODE_NAME_CASE(MSCATTER)
33821 NODE_NAME_CASE(VPDPBUSD)
33822 NODE_NAME_CASE(VPDPBUSDS)
33823 NODE_NAME_CASE(VPDPWSSD)
33824 NODE_NAME_CASE(VPDPWSSDS)
33825 NODE_NAME_CASE(VPSHUFBITQMB)
33826 NODE_NAME_CASE(GF2P8MULB)
33827 NODE_NAME_CASE(GF2P8AFFINEQB)
33828 NODE_NAME_CASE(GF2P8AFFINEINVQB)
33829 NODE_NAME_CASE(NT_CALL)
33830 NODE_NAME_CASE(NT_BRIND)
33831 NODE_NAME_CASE(UMWAIT)
33832 NODE_NAME_CASE(TPAUSE)
33833 NODE_NAME_CASE(ENQCMD)
33834 NODE_NAME_CASE(ENQCMDS)
33835 NODE_NAME_CASE(VP2INTERSECT)
33836 NODE_NAME_CASE(VPDPBSUD)
33837 NODE_NAME_CASE(VPDPBSUDS)
33838 NODE_NAME_CASE(VPDPBUUD)
33839 NODE_NAME_CASE(VPDPBUUDS)
33840 NODE_NAME_CASE(VPDPBSSD)
33841 NODE_NAME_CASE(VPDPBSSDS)
33842 NODE_NAME_CASE(AESENC128KL)
33843 NODE_NAME_CASE(AESDEC128KL)
33844 NODE_NAME_CASE(AESENC256KL)
33845 NODE_NAME_CASE(AESDEC256KL)
33846 NODE_NAME_CASE(AESENCWIDE128KL)
33847 NODE_NAME_CASE(AESDECWIDE128KL)
33848 NODE_NAME_CASE(AESENCWIDE256KL)
33849 NODE_NAME_CASE(AESDECWIDE256KL)
33850 NODE_NAME_CASE(CMPCCXADD)
33851 NODE_NAME_CASE(TESTUI)
33852 NODE_NAME_CASE(FP80_ADD)
33853 NODE_NAME_CASE(STRICT_FP80_ADD)
33854 }
33855 return nullptr;
33856#undef NODE_NAME_CASE
33857}
33858
33859/// Return true if the addressing mode represented by AM is legal for this
33860/// target, for a load/store of the specified type.
33861 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
33862 const AddrMode &AM, Type *Ty,
33863 unsigned AS,
33864 Instruction *I) const {
33865 // X86 supports extremely general addressing modes.
33866 CodeModel::Model M = getTargetMachine().getCodeModel();
33867
33868 // X86 allows a sign-extended 32-bit immediate field as a displacement.
33869 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
33870 return false;
33871
33872 if (AM.BaseGV) {
33873 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
33874
33875 // If a reference to this global requires an extra load, we can't fold it.
33876 if (isGlobalStubReference(GVFlags))
33877 return false;
33878
33879 // If BaseGV requires a register for the PIC base, we cannot also have a
33880 // BaseReg specified.
33881 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
33882 return false;
33883
33884 // If lower 4G is not available, then we must use rip-relative addressing.
33885 if ((M != CodeModel::Small || isPositionIndependent()) &&
33886 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
33887 return false;
33888 }
33889
33890 switch (AM.Scale) {
33891 case 0:
33892 case 1:
33893 case 2:
33894 case 4:
33895 case 8:
33896 // These scales always work.
33897 break;
33898 case 3:
33899 case 5:
33900 case 9:
33901 // These scales are formed with basereg+scalereg. Only accept if there is
33902 // no basereg yet.
33903 if (AM.HasBaseReg)
33904 return false;
33905 break;
33906 default: // Other stuff never works.
33907 return false;
33908 }
33909
33910 return true;
33911}
33912
33913 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
33914 unsigned Bits = Ty->getScalarSizeInBits();
33915
33916 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
33917 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
33918 if (Subtarget.hasXOP() &&
33919 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
33920 return false;
33921
33922 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
33923 // shifts just as cheap as scalar ones.
33924 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
33925 return false;
33926
33927 // AVX512BW has shifts such as vpsllvw.
33928 if (Subtarget.hasBWI() && Bits == 16)
33929 return false;
33930
33931 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
33932 // fully general vector.
33933 return true;
33934}
33935
33936bool X86TargetLowering::isBinOp(unsigned Opcode) const {
33937 switch (Opcode) {
33938 // These are non-commutative binops.
33939 // TODO: Add more X86ISD opcodes once we have test coverage.
33940 case X86ISD::ANDNP:
33941 case X86ISD::PCMPGT:
33942 case X86ISD::FMAX:
33943 case X86ISD::FMIN:
33944 case X86ISD::FANDN:
33945 case X86ISD::VPSHA:
33946 case X86ISD::VPSHL:
33947 case X86ISD::VSHLV:
33948 case X86ISD::VSRLV:
33949 case X86ISD::VSRAV:
33950 return true;
33951 }
33952
33953 return TargetLoweringBase::isBinOp(Opcode);
33954}
33955
33956bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
33957 switch (Opcode) {
33958 // TODO: Add more X86ISD opcodes once we have test coverage.
33959 case X86ISD::PCMPEQ:
33960 case X86ISD::PMULDQ:
33961 case X86ISD::PMULUDQ:
33962 case X86ISD::FMAXC:
33963 case X86ISD::FMINC:
33964 case X86ISD::FAND:
33965 case X86ISD::FOR:
33966 case X86ISD::FXOR:
33967 return true;
33968 }
33969
33970 return TargetLoweringBase::isCommutativeBinOp(Opcode);
33971}
33972
33973 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
33974 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33975 return false;
33976 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
33977 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
33978 return NumBits1 > NumBits2;
33979}
33980
33981 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
33982 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33983 return false;
33984
33985 if (!isTypeLegal(EVT::getEVT(Ty1)))
33986 return false;
33987
33988 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
33989
33990 // Assuming the caller doesn't have a zeroext or signext return parameter,
33991 // truncation all the way down to i1 is valid.
33992 return true;
33993}
33994
33995 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
33996 return isInt<32>(Imm);
33997}
33998
33999 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
34000 // Can also use sub to handle negated immediates.
34001 return isInt<32>(Imm);
34002}
34003
34004 bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
34005 return isInt<32>(Imm);
34006}
34007
34008 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
34009 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
34010 return false;
34011 unsigned NumBits1 = VT1.getSizeInBits();
34012 unsigned NumBits2 = VT2.getSizeInBits();
34013 return NumBits1 > NumBits2;
34014}
34015
34016 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
34017 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34018 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
34019}
34020
34021 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
34022 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34023 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
34024}
34025
34026 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
34027 EVT VT1 = Val.getValueType();
34028 if (isZExtFree(VT1, VT2))
34029 return true;
34030
34031 if (Val.getOpcode() != ISD::LOAD)
34032 return false;
34033
34034 if (!VT1.isSimple() || !VT1.isInteger() ||
34035 !VT2.isSimple() || !VT2.isInteger())
34036 return false;
34037
34038 switch (VT1.getSimpleVT().SimpleTy) {
34039 default: break;
34040 case MVT::i8:
34041 case MVT::i16:
34042 case MVT::i32:
34043 // X86 has 8, 16, and 32-bit zero-extending loads.
34044 return true;
34045 }
34046
34047 return false;
34048}
34049
34050 bool X86TargetLowering::shouldSinkOperands(Instruction *I,
34051 SmallVectorImpl<Use *> &Ops) const {
34052 using namespace llvm::PatternMatch;
34053
34054 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
34055 if (!VTy)
34056 return false;
34057
34058 if (I->getOpcode() == Instruction::Mul &&
34059 VTy->getElementType()->isIntegerTy(64)) {
34060 for (auto &Op : I->operands()) {
34061 // Make sure we are not already sinking this operand
34062 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
34063 continue;
34064
34065 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
34066 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
34067 if (Subtarget.hasSSE41() &&
34068 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
34069 m_SpecificInt(32)))) {
34070 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
34071 Ops.push_back(&Op);
34072 } else if (Subtarget.hasSSE2() &&
34073 match(Op.get(),
34074 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
34075 Ops.push_back(&Op);
34076 }
34077 }
34078
34079 return !Ops.empty();
34080 }
34081
34082 // A uniform shift amount in a vector shift or funnel shift may be much
34083 // cheaper than a generic variable vector shift, so make that pattern visible
34084 // to SDAG by sinking the shuffle instruction next to the shift.
34085 int ShiftAmountOpNum = -1;
34086 if (I->isShift())
34087 ShiftAmountOpNum = 1;
34088 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
34089 if (II->getIntrinsicID() == Intrinsic::fshl ||
34090 II->getIntrinsicID() == Intrinsic::fshr)
34091 ShiftAmountOpNum = 2;
34092 }
34093
34094 if (ShiftAmountOpNum == -1)
34095 return false;
34096
34097 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
34098 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
34099 isVectorShiftByScalarCheap(I->getType())) {
34100 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
34101 return true;
34102 }
34103
34104 return false;
34105}
34106
34107 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
34108 if (!Subtarget.is64Bit())
34109 return false;
34110 return TargetLowering::shouldConvertPhiType(From, To);
34111}
34112
34113 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
34114 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
34115 return false;
34116
34117 EVT SrcVT = ExtVal.getOperand(0).getValueType();
34118
34119 // There is no extending load for vXi1.
34120 if (SrcVT.getScalarType() == MVT::i1)
34121 return false;
34122
34123 return true;
34124}
34125
34126 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
34127 EVT VT) const {
34128 if (!Subtarget.hasAnyFMA())
34129 return false;
34130
34131 VT = VT.getScalarType();
34132
34133 if (!VT.isSimple())
34134 return false;
34135
34136 switch (VT.getSimpleVT().SimpleTy) {
34137 case MVT::f16:
34138 return Subtarget.hasFP16();
34139 case MVT::f32:
34140 case MVT::f64:
34141 return true;
34142 default:
34143 break;
34144 }
34145
34146 return false;
34147}
34148
34149 bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
34150 // i16 instructions are longer (0x66 prefix) and potentially slower.
34151 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
34152}
34153
34154 bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
34155 EVT VT) const {
34156 // TODO: This is too general. There are cases where pre-AVX512 codegen would
34157 // benefit. The transform may also be profitable for scalar code.
34158 if (!Subtarget.hasAVX512())
34159 return false;
34160 if (!Subtarget.hasVLX() && !VT.is512BitVector())
34161 return false;
34162 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
34163 return false;
34164
34165 return true;
34166}
34167
34168/// Targets can use this to indicate that they only support *some*
34169/// VECTOR_SHUFFLE operations, those with specific masks.
34170/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
34171/// are assumed to be legal.
34172 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
34173 if (!VT.isSimple())
34174 return false;
34175
34176 // Not for i1 vectors
34177 if (VT.getSimpleVT().getScalarType() == MVT::i1)
34178 return false;
34179
34180 // Very little shuffling can be done for 64-bit vectors right now.
34181 if (VT.getSimpleVT().getSizeInBits() == 64)
34182 return false;
34183
34184 // We only care that the types being shuffled are legal. The lowering can
34185 // handle any possible shuffle mask that results.
34186 return isTypeLegal(VT.getSimpleVT());
34187}
34188
34189 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
34190 EVT VT) const {
34191 // Don't convert an 'and' into a shuffle that we don't directly support.
34192 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
34193 if (!Subtarget.hasAVX2())
34194 if (VT == MVT::v32i8 || VT == MVT::v16i16)
34195 return false;
34196
34197 // Just delegate to the generic legality, clear masks aren't special.
34198 return isShuffleMaskLegal(Mask, VT);
34199}
34200
34201 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
34202 // If the subtarget is using thunks, we need to not generate jump tables.
34203 if (Subtarget.useIndirectThunkBranches())
34204 return false;
34205
34206 // Otherwise, fallback on the generic logic.
34207 return TargetLowering::areJTsAllowed(Fn);
34208}
34209
34210 MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
34211 EVT ConditionVT) const {
34212 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
34213 // zero-extensions.
34214 if (ConditionVT.getSizeInBits() < 32)
34215 return MVT::i32;
34216 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
34217 ConditionVT);
34218}
34219
34220//===----------------------------------------------------------------------===//
34221// X86 Scheduler Hooks
34222//===----------------------------------------------------------------------===//
34223
34224// Returns true if EFLAG is consumed after this iterator in the rest of the
34225// basic block or any successors of the basic block.
34226 static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
34227 MachineBasicBlock *BB) {
34228 // Scan forward through BB for a use/def of EFLAGS.
34229 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
34230 if (mi.readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
34231 return true;
34232 // If we found a def, we can stop searching.
34233 if (mi.definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
34234 return false;
34235 }
34236
34237 // If we hit the end of the block, check whether EFLAGS is live into a
34238 // successor.
34239 for (MachineBasicBlock *Succ : BB->successors())
34240 if (Succ->isLiveIn(X86::EFLAGS))
34241 return true;
34242
34243 return false;
34244}
34245
34246/// Utility function to emit xbegin specifying the start of an RTM region.
34247 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
34248 const TargetInstrInfo *TII) {
34249 const MIMetadata MIMD(MI);
34250
34251 const BasicBlock *BB = MBB->getBasicBlock();
34252 MachineFunction::iterator I = ++MBB->getIterator();
34253
34254 // For the v = xbegin(), we generate
34255 //
34256 // thisMBB:
34257 // xbegin sinkMBB
34258 //
34259 // mainMBB:
34260 // s0 = -1
34261 //
34262 // fallBB:
34263 // eax = # XABORT_DEF
34264 // s1 = eax
34265 //
34266 // sinkMBB:
34267 // v = phi(s0/mainBB, s1/fallBB)
34268
34269 MachineBasicBlock *thisMBB = MBB;
34270 MachineFunction *MF = MBB->getParent();
34271 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
34272 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
34273 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
34274 MF->insert(I, mainMBB);
34275 MF->insert(I, fallMBB);
34276 MF->insert(I, sinkMBB);
34277
34278 if (isEFLAGSLiveAfter(MI, MBB)) {
34279 mainMBB->addLiveIn(X86::EFLAGS);
34280 fallMBB->addLiveIn(X86::EFLAGS);
34281 sinkMBB->addLiveIn(X86::EFLAGS);
34282 }
34283
34284 // Transfer the remainder of BB and its successor edges to sinkMBB.
34285 sinkMBB->splice(sinkMBB->begin(), MBB,
34286 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34287 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
34288
34289 MachineRegisterInfo &MRI = MF->getRegInfo();
34290 Register DstReg = MI.getOperand(0).getReg();
34291 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
34292 Register mainDstReg = MRI.createVirtualRegister(RC);
34293 Register fallDstReg = MRI.createVirtualRegister(RC);
34294
34295 // thisMBB:
34296 // xbegin fallMBB
34297 // # fallthrough to mainMBB
34298 // # abortion to fallMBB
34299 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
34300 thisMBB->addSuccessor(mainMBB);
34301 thisMBB->addSuccessor(fallMBB);
34302
34303 // mainMBB:
34304 // mainDstReg := -1
34305 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
34306 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
34307 mainMBB->addSuccessor(sinkMBB);
34308
34309 // fallMBB:
34310 // ; pseudo instruction to model hardware's definition from XABORT
34311 // EAX := XABORT_DEF
34312 // fallDstReg := EAX
34313 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
34314 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
34315 .addReg(X86::EAX);
34316 fallMBB->addSuccessor(sinkMBB);
34317
34318 // sinkMBB:
34319 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
34320 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
34321 .addReg(mainDstReg).addMBB(mainMBB)
34322 .addReg(fallDstReg).addMBB(fallMBB);
34323
34324 MI.eraseFromParent();
34325 return sinkMBB;
34326}
34327
34328 MachineBasicBlock *
34329 X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
34330 MachineBasicBlock *MBB) const {
34331 // Emit va_arg instruction on X86-64.
34332
34333 // Operands to this pseudo-instruction:
34334 // 0 ) Output : destination address (reg)
34335 // 1-5) Input : va_list address (addr, i64mem)
34336 // 6 ) ArgSize : Size (in bytes) of vararg type
34337 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
34338 // 8 ) Align : Alignment of type
34339 // 9 ) EFLAGS (implicit-def)
34340
34341 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
34342 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
34343
34344 Register DestReg = MI.getOperand(0).getReg();
34345 MachineOperand &Base = MI.getOperand(1);
34346 MachineOperand &Scale = MI.getOperand(2);
34347 MachineOperand &Index = MI.getOperand(3);
34348 MachineOperand &Disp = MI.getOperand(4);
34349 MachineOperand &Segment = MI.getOperand(5);
34350 unsigned ArgSize = MI.getOperand(6).getImm();
34351 unsigned ArgMode = MI.getOperand(7).getImm();
34352 Align Alignment = Align(MI.getOperand(8).getImm());
34353
34354 MachineFunction *MF = MBB->getParent();
34355
34356 // Memory Reference
34357 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
34358
34359 MachineMemOperand *OldMMO = MI.memoperands().front();
34360
34361 // Clone the MMO into two separate MMOs for loading and storing
34362 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
34363 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
34364 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
34365 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
34366
34367 // Machine Information
34368 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34369 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
34370 const TargetRegisterClass *AddrRegClass =
34371 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
34372 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
34373 const MIMetadata MIMD(MI);
34374
34375 // struct va_list {
34376 // i32 gp_offset
34377 // i32 fp_offset
34378 // i64 overflow_area (address)
34379 // i64 reg_save_area (address)
34380 // }
34381 // sizeof(va_list) = 24
34382 // alignment(va_list) = 8
34383
34384 unsigned TotalNumIntRegs = 6;
34385 unsigned TotalNumXMMRegs = 8;
34386 bool UseGPOffset = (ArgMode == 1);
34387 bool UseFPOffset = (ArgMode == 2);
34388 unsigned MaxOffset = TotalNumIntRegs * 8 +
34389 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
34390
34391 /* Align ArgSize to a multiple of 8 */
34392 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
34393 bool NeedsAlign = (Alignment > 8);
34394
34395 MachineBasicBlock *thisMBB = MBB;
34396 MachineBasicBlock *overflowMBB;
34397 MachineBasicBlock *offsetMBB;
34398 MachineBasicBlock *endMBB;
34399
34400 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
34401 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
34402 unsigned OffsetReg = 0;
34403
34404 if (!UseGPOffset && !UseFPOffset) {
34405 // If we only pull from the overflow region, we don't create a branch.
34406 // We don't need to alter control flow.
34407 OffsetDestReg = 0; // unused
34408 OverflowDestReg = DestReg;
34409
34410 offsetMBB = nullptr;
34411 overflowMBB = thisMBB;
34412 endMBB = thisMBB;
34413 } else {
34414 // First emit code to check if gp_offset (or fp_offset) is below the bound.
34415 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
34416 // If not, pull from overflow_area. (branch to overflowMBB)
34417 //
34418 // thisMBB
34419 // | .
34420 // | .
34421 // offsetMBB overflowMBB
34422 // | .
34423 // | .
34424 // endMBB
34425
34426 // Registers for the PHI in endMBB
34427 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
34428 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
34429
34430 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
34431 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34432 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34433 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34434
34435 MachineFunction::iterator MBBIter = ++MBB->getIterator();
34436
34437 // Insert the new basic blocks
34438 MF->insert(MBBIter, offsetMBB);
34439 MF->insert(MBBIter, overflowMBB);
34440 MF->insert(MBBIter, endMBB);
34441
34442 // Transfer the remainder of MBB and its successor edges to endMBB.
34443 endMBB->splice(endMBB->begin(), thisMBB,
34444 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
34445 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
34446
34447 // Make offsetMBB and overflowMBB successors of thisMBB
34448 thisMBB->addSuccessor(offsetMBB);
34449 thisMBB->addSuccessor(overflowMBB);
34450
34451 // endMBB is a successor of both offsetMBB and overflowMBB
34452 offsetMBB->addSuccessor(endMBB);
34453 overflowMBB->addSuccessor(endMBB);
34454
34455 // Load the offset value into a register
34456 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34457 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
34458 .add(Base)
34459 .add(Scale)
34460 .add(Index)
34461 .addDisp(Disp, UseFPOffset ? 4 : 0)
34462 .add(Segment)
34463 .setMemRefs(LoadOnlyMMO);
34464
34465 // Check if there is enough room left to pull this argument.
34466 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
34467 .addReg(OffsetReg)
34468 .addImm(MaxOffset + 8 - ArgSizeA8);
34469
34470 // Branch to "overflowMBB" if offset >= max
34471 // Fall through to "offsetMBB" otherwise
34472 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
34473 .addMBB(overflowMBB).addImm(X86::COND_AE);
34474 }
34475
34476 // In offsetMBB, emit code to use the reg_save_area.
34477 if (offsetMBB) {
34478 assert(OffsetReg != 0);
34479
34480 // Read the reg_save_area address.
34481 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
34482 BuildMI(
34483 offsetMBB, MIMD,
34484 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34485 RegSaveReg)
34486 .add(Base)
34487 .add(Scale)
34488 .add(Index)
34489 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
34490 .add(Segment)
34491 .setMemRefs(LoadOnlyMMO);
34492
34493 if (Subtarget.isTarget64BitLP64()) {
34494 // Zero-extend the offset
34495 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
34496 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
34497 .addImm(0)
34498 .addReg(OffsetReg)
34499 .addImm(X86::sub_32bit);
34500
34501 // Add the offset to the reg_save_area to get the final address.
34502 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
34503 .addReg(OffsetReg64)
34504 .addReg(RegSaveReg);
34505 } else {
34506 // Add the offset to the reg_save_area to get the final address.
34507 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
34508 .addReg(OffsetReg)
34509 .addReg(RegSaveReg);
34510 }
34511
34512 // Compute the offset for the next argument
34513 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34514 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
34515 .addReg(OffsetReg)
34516 .addImm(UseFPOffset ? 16 : 8);
34517
34518 // Store it back into the va_list.
34519 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
34520 .add(Base)
34521 .add(Scale)
34522 .add(Index)
34523 .addDisp(Disp, UseFPOffset ? 4 : 0)
34524 .add(Segment)
34525 .addReg(NextOffsetReg)
34526 .setMemRefs(StoreOnlyMMO);
34527
34528 // Jump to endMBB
34529 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
34530 .addMBB(endMBB);
34531 }
34532
34533 //
34534 // Emit code to use overflow area
34535 //
34536
34537 // Load the overflow_area address into a register.
34538 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
34539 BuildMI(overflowMBB, MIMD,
34540 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34541 OverflowAddrReg)
34542 .add(Base)
34543 .add(Scale)
34544 .add(Index)
34545 .addDisp(Disp, 8)
34546 .add(Segment)
34547 .setMemRefs(LoadOnlyMMO);
34548
34549 // If we need to align it, do so. Otherwise, just copy the address
34550 // to OverflowDestReg.
34551 if (NeedsAlign) {
34552 // Align the overflow address
34553 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
34554
34555 // aligned_addr = (addr + (align-1)) & ~(align-1)
34556 BuildMI(
34557 overflowMBB, MIMD,
34558 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34559 TmpReg)
34560 .addReg(OverflowAddrReg)
34561 .addImm(Alignment.value() - 1);
34562
34563 BuildMI(
34564 overflowMBB, MIMD,
34565 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
34566 OverflowDestReg)
34567 .addReg(TmpReg)
34568 .addImm(~(uint64_t)(Alignment.value() - 1));
34569 } else {
34570 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
34571 .addReg(OverflowAddrReg);
34572 }
34573
34574 // Compute the next overflow address after this argument.
34575 // (the overflow address should be kept 8-byte aligned)
34576 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
34577 BuildMI(
34578 overflowMBB, MIMD,
34579 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34580 NextAddrReg)
34581 .addReg(OverflowDestReg)
34582 .addImm(ArgSizeA8);
34583
34584 // Store the new overflow address.
34585 BuildMI(overflowMBB, MIMD,
34586 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
34587 .add(Base)
34588 .add(Scale)
34589 .add(Index)
34590 .addDisp(Disp, 8)
34591 .add(Segment)
34592 .addReg(NextAddrReg)
34593 .setMemRefs(StoreOnlyMMO);
34594
34595 // If we branched, emit the PHI to the front of endMBB.
34596 if (offsetMBB) {
34597 BuildMI(*endMBB, endMBB->begin(), MIMD,
34598 TII->get(X86::PHI), DestReg)
34599 .addReg(OffsetDestReg).addMBB(offsetMBB)
34600 .addReg(OverflowDestReg).addMBB(overflowMBB);
34601 }
34602
34603 // Erase the pseudo instruction
34604 MI.eraseFromParent();
34605
34606 return endMBB;
34607}
34608
34609// The EFLAGS operand of SelectItr might be missing a kill marker
34610// because there were multiple uses of EFLAGS, and ISel didn't know
34611// which to mark. Figure out whether SelectItr should have had a
34612// kill marker, and set it if it should. Returns the correct kill
34613// marker value.
34614 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
34615 MachineBasicBlock* BB,
34616 const TargetRegisterInfo* TRI) {
34617 if (isEFLAGSLiveAfter(SelectItr, BB))
34618 return false;
34619
34620 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
34621 // out. SelectMI should have a kill flag on EFLAGS.
34622 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
34623 return true;
34624}
34625
34626// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
34627// together with other CMOV pseudo-opcodes into a single basic-block with
34628// conditional jump around it.
34629 static bool isCMOVPseudo(MachineInstr &MI) {
34630 switch (MI.getOpcode()) {
34631 case X86::CMOV_FR16:
34632 case X86::CMOV_FR16X:
34633 case X86::CMOV_FR32:
34634 case X86::CMOV_FR32X:
34635 case X86::CMOV_FR64:
34636 case X86::CMOV_FR64X:
34637 case X86::CMOV_GR8:
34638 case X86::CMOV_GR16:
34639 case X86::CMOV_GR32:
34640 case X86::CMOV_RFP32:
34641 case X86::CMOV_RFP64:
34642 case X86::CMOV_RFP80:
34643 case X86::CMOV_VR64:
34644 case X86::CMOV_VR128:
34645 case X86::CMOV_VR128X:
34646 case X86::CMOV_VR256:
34647 case X86::CMOV_VR256X:
34648 case X86::CMOV_VR512:
34649 case X86::CMOV_VK1:
34650 case X86::CMOV_VK2:
34651 case X86::CMOV_VK4:
34652 case X86::CMOV_VK8:
34653 case X86::CMOV_VK16:
34654 case X86::CMOV_VK32:
34655 case X86::CMOV_VK64:
34656 return true;
34657
34658 default:
34659 return false;
34660 }
34661}
34662
34663// Helper function, which inserts PHI functions into SinkMBB:
34664// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
34665// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
34666// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
34667// the last PHI function inserted.
34670 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
34671 MachineBasicBlock *SinkMBB) {
34671 MachineBasicBlock *SinkMBB) {
34672 MachineFunction *MF = TrueMBB->getParent();
34673 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
34674 const MIMetadata MIMD(*MIItBegin);
34675
34676 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
34677 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34678
34679 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
34680
34681 // As we are creating the PHIs, we have to be careful if there is more than
34682 // one. Later CMOVs may reference the results of earlier CMOVs, but later
34683 // PHIs have to reference the individual true/false inputs from earlier PHIs.
34684 // That also means that PHI construction must work forward from earlier to
34685 // later, and that the code must maintain a mapping from earlier PHI's
34686 // destination registers, and the registers that went into the PHI.
34687 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
34688 MachineInstrBuilder MIB;
34689
34690 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
34691 Register DestReg = MIIt->getOperand(0).getReg();
34692 Register Op1Reg = MIIt->getOperand(1).getReg();
34693 Register Op2Reg = MIIt->getOperand(2).getReg();
34694
34695 // If this CMOV we are generating is the opposite condition from
34696 // the jump we generated, then we have to swap the operands for the
34697 // PHI that is going to be generated.
34698 if (MIIt->getOperand(3).getImm() == OppCC)
34699 std::swap(Op1Reg, Op2Reg);
34700
34701 if (RegRewriteTable.contains(Op1Reg))
34702 Op1Reg = RegRewriteTable[Op1Reg].first;
34703
34704 if (RegRewriteTable.contains(Op2Reg))
34705 Op2Reg = RegRewriteTable[Op2Reg].second;
34706
34707 MIB =
34708 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
34709 .addReg(Op1Reg)
34710 .addMBB(FalseMBB)
34711 .addReg(Op2Reg)
34712 .addMBB(TrueMBB);
34713
34714 // Add this PHI to the rewrite table.
34715 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
34716 }
34717
34718 return MIB;
34719}
34720
34721// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
34722 MachineBasicBlock *
34723 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
34724 MachineInstr &SecondCascadedCMOV,
34725 MachineBasicBlock *ThisMBB) const {
34726 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34727 const MIMetadata MIMD(FirstCMOV);
34728
34729 // We lower cascaded CMOVs such as
34730 //
34731 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
34732 //
34733 // to two successive branches.
34734 //
34735 // Without this, we would add a PHI between the two jumps, which ends up
34736 // creating a few copies all around. For instance, for
34737 //
34738 // (sitofp (zext (fcmp une)))
34739 //
34740 // we would generate:
34741 //
34742 // ucomiss %xmm1, %xmm0
34743 // movss <1.0f>, %xmm0
34744 // movaps %xmm0, %xmm1
34745 // jne .LBB5_2
34746 // xorps %xmm1, %xmm1
34747 // .LBB5_2:
34748 // jp .LBB5_4
34749 // movaps %xmm1, %xmm0
34750 // .LBB5_4:
34751 // retq
34752 //
34753 // because this custom-inserter would have generated:
34754 //
34755 // A
34756 // | \
34757 // | B
34758 // | /
34759 // C
34760 // | \
34761 // | D
34762 // | /
34763 // E
34764 //
34765 // A: X = ...; Y = ...
34766 // B: empty
34767 // C: Z = PHI [X, A], [Y, B]
34768 // D: empty
34769 // E: PHI [X, C], [Z, D]
34770 //
34771 // If we lower both CMOVs in a single step, we can instead generate:
34772 //
34773 // A
34774 // | \
34775 // | C
34776 // | /|
34777 // |/ |
34778 // | |
34779 // | D
34780 // | /
34781 // E
34782 //
34783 // A: X = ...; Y = ...
34784 // D: empty
34785 // E: PHI [X, A], [X, C], [Y, D]
34786 //
34787 // Which, in our sitofp/fcmp example, gives us something like:
34788 //
34789 // ucomiss %xmm1, %xmm0
34790 // movss <1.0f>, %xmm0
34791 // jne .LBB5_4
34792 // jp .LBB5_4
34793 // xorps %xmm0, %xmm0
34794 // .LBB5_4:
34795 // retq
34796 //
34797
34798 // We lower cascaded CMOV into two successive branches to the same block.
34799 // EFLAGS is used by both, so mark it as live in the second.
34800 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34801 MachineFunction *F = ThisMBB->getParent();
34802 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34803 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34804 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34805
34806 MachineFunction::iterator It = ++ThisMBB->getIterator();
34807 F->insert(It, FirstInsertedMBB);
34808 F->insert(It, SecondInsertedMBB);
34809 F->insert(It, SinkMBB);
34810
34811 // For a cascaded CMOV, we lower it to two successive branches to
34812 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
34813 // the FirstInsertedMBB.
34814 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
34815
34816 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34817 // live into the sink and copy blocks.
34818 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34819 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
34820 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
34821 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
34822 SinkMBB->addLiveIn(X86::EFLAGS);
34823 }
34824
34825 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34826 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
34827 std::next(MachineBasicBlock::iterator(FirstCMOV)),
34828 ThisMBB->end());
34829 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34830
34831 // Fallthrough block for ThisMBB.
34832 ThisMBB->addSuccessor(FirstInsertedMBB);
34833 // The true block target of the first branch is always SinkMBB.
34834 ThisMBB->addSuccessor(SinkMBB);
34835 // Fallthrough block for FirstInsertedMBB.
34836 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
34837 // The true block for the branch of FirstInsertedMBB.
34838 FirstInsertedMBB->addSuccessor(SinkMBB);
34839 // This is fallthrough.
34840 SecondInsertedMBB->addSuccessor(SinkMBB);
34841
34842 // Create the conditional branch instructions.
34843 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
34844 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
34845
34846 X86::CondCode SecondCC =
34847 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
34848 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
34849 .addMBB(SinkMBB)
34850 .addImm(SecondCC);
34851
34852 // SinkMBB:
34853 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
34854 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
34855 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
34856 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
34857 MachineInstrBuilder MIB =
34858 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
34859 .addReg(Op1Reg)
34860 .addMBB(SecondInsertedMBB)
34861 .addReg(Op2Reg)
34862 .addMBB(ThisMBB);
34863
34864 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
34865 // (the True operand of the SELECT_CC/CMOV nodes).
34866 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
34867
34868 // Now remove the CMOVs.
34869 FirstCMOV.eraseFromParent();
34870 SecondCascadedCMOV.eraseFromParent();
34871
34872 return SinkMBB;
34873}
34874
34875MachineBasicBlock *
34876X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
34877 MachineBasicBlock *ThisMBB) const {
34878 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34879 const MIMetadata MIMD(MI);
34880
34881 // To "insert" a SELECT_CC instruction, we actually have to insert the
34882 // diamond control-flow pattern. The incoming instruction knows the
34883 // destination vreg to set, the condition code register to branch on, the
34884 // true/false values to select between and a branch opcode to use.
34885
34886 // ThisMBB:
34887 // ...
34888 // TrueVal = ...
34889 // cmpTY ccX, r1, r2
34890 // bCC copy1MBB
34891 // fallthrough --> FalseMBB
34892
34893 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
34894 // as described above, by inserting a BB, and then making a PHI at the join
34895 // point to select the true and false operands of the CMOV in the PHI.
34896 //
34897 // The code also handles two different cases of multiple CMOV opcodes
34898 // in a row.
34899 //
34900 // Case 1:
34901 // In this case, there are multiple CMOVs in a row, all which are based on
34902 // the same condition setting (or the exact opposite condition setting).
34903 // In this case we can lower all the CMOVs using a single inserted BB, and
34904 // then make a number of PHIs at the join point to model the CMOVs. The only
34905 // trickiness here, is that in a case like:
34906 //
34907 // t2 = CMOV cond1 t1, f1
34908 // t3 = CMOV cond1 t2, f2
34909 //
34910 // when rewriting this into PHIs, we have to perform some renaming on the
34911 // temps since you cannot have a PHI operand refer to a PHI result earlier
34912 // in the same block. The "simple" but wrong lowering would be:
34913 //
34914 // t2 = PHI t1(BB1), f1(BB2)
34915 // t3 = PHI t2(BB1), f2(BB2)
34916 //
34917 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
34918 // renaming is to note that on the path through BB1, t2 is really just a
34919 // copy of t1, and do that renaming, properly generating:
34920 //
34921 // t2 = PHI t1(BB1), f1(BB2)
34922 // t3 = PHI t1(BB1), f2(BB2)
34923 //
34924 // Case 2:
34925 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
34926 // function - EmitLoweredCascadedSelect.
34927
34928 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
34929 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34930 MachineInstr *LastCMOV = &MI;
34931 MachineBasicBlock::iterator NextMIIt = next_nodbg(MachineBasicBlock::iterator(MI), ThisMBB->end());
34932
34933 // Check for case 1, where there are multiple CMOVs with the same condition
34934 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
34935 // number of jumps the most.
34936
34937 if (isCMOVPseudo(MI)) {
34938 // See if we have a string of CMOVS with the same condition. Skip over
34939 // intervening debug insts.
34940 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
34941 (NextMIIt->getOperand(3).getImm() == CC ||
34942 NextMIIt->getOperand(3).getImm() == OppCC)) {
34943 LastCMOV = &*NextMIIt;
34944 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
34945 }
34946 }
34947
34948 // Check for case 2, but only if we didn't already find case 1, as
34949 // indicated by LastCMOV still pointing at MI.
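  // The pair must share the same second (true) operand, and the second CMOV
  // must consume and kill the first CMOV's result, so nothing else reads the
  // intermediate value.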
34950 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
34951 NextMIIt->getOpcode() == MI.getOpcode() &&
34952 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
34953 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
34954 NextMIIt->getOperand(1).isKill()) {
34955 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
34956 }
34957
34958 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34959 MachineFunction *F = ThisMBB->getParent();
34960 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
34961 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34962
34963 MachineFunction::iterator It = ++ThisMBB->getIterator();
34964 F->insert(It, FalseMBB);
34965 F->insert(It, SinkMBB);
34966
34967 // Set the call frame size on entry to the new basic blocks.
34968 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
34969 FalseMBB->setCallFrameSize(CallFrameSize);
34970 SinkMBB->setCallFrameSize(CallFrameSize);
34971
34972 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34973 // live into the sink and copy blocks.
34974 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34975 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
34976 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
34977 FalseMBB->addLiveIn(X86::EFLAGS);
34978 SinkMBB->addLiveIn(X86::EFLAGS);
34979 }
34980
34981 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
34982 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
34983 MachineBasicBlock::iterator(LastCMOV));
34984 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
34985 if (MI.isDebugInstr())
34986 SinkMBB->push_back(MI.removeFromParent());
34987
34988 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34989 SinkMBB->splice(SinkMBB->end(), ThisMBB,
34990 std::next(MachineBasicBlock::iterator(LastCMOV)),
34991 ThisMBB->end());
34992 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34993
34994 // Fallthrough block for ThisMBB.
34995 ThisMBB->addSuccessor(FalseMBB);
34996 // The true block target of the first (or only) branch is always a SinkMBB.
34997 ThisMBB->addSuccessor(SinkMBB);
34998 // Fallthrough block for FalseMBB.
34999 FalseMBB->addSuccessor(SinkMBB);
35000
35001 // Create the conditional branch instruction.
35002 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
35003
35004 // SinkMBB:
35005 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
35006 // ...
35007 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
35008 MachineBasicBlock::iterator MIItEnd =
35009 std::next(MachineBasicBlock::iterator(LastCMOV));
35010 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
35011
35012 // Now remove the CMOV(s).
35013 ThisMBB->erase(MIItBegin, MIItEnd);
35014
35015 return SinkMBB;
35016}
35017
35018static unsigned getSUBriOpcode(bool IsLP64) {
35019 if (IsLP64)
35020 return X86::SUB64ri32;
35021 else
35022 return X86::SUB32ri;
35023}
35024
35025MachineBasicBlock *
35026X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
35027 MachineBasicBlock *MBB) const {
35028 MachineFunction *MF = MBB->getParent();
35029 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35030 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
35031 const MIMetadata MIMD(MI);
35032 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35033
35034 const unsigned ProbeSize = getStackProbeSize(*MF);
35035
35036 MachineRegisterInfo &MRI = MF->getRegInfo();
35037 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35038 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35039 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35040
35041 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35042 MF->insert(MBBIter, testMBB);
35043 MF->insert(MBBIter, blockMBB);
35044 MF->insert(MBBIter, tailMBB);
35045
35046 Register sizeVReg = MI.getOperand(1).getReg();
35047
35048 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
35049
35050 Register TmpStackPtr = MRI.createVirtualRegister(
35051 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
35052 Register FinalStackPtr = MRI.createVirtualRegister(
35053 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
35054
35055 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
35056 .addReg(physSPReg);
35057 {
35058 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
35059 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
35060 .addReg(TmpStackPtr)
35061 .addReg(sizeVReg);
35062 }
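  // FinalStackPtr = SP - AllocSize: the stack pointer value once the whole
  // dynamic allocation is in place. The loop below walks SP down one page at a
  // time, touching each page, until it reaches this value.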
35063
35064 // Loop test: stop probing once the current stack pointer has reached FinalStackPtr.
35065
35066 BuildMI(testMBB, MIMD,
35067 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
35068 .addReg(FinalStackPtr)
35069 .addReg(physSPReg);
35070
35071 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
35072 .addMBB(tailMBB)
35073 .addImm(X86::COND_GE);
35074 testMBB->addSuccessor(blockMBB);
35075 testMBB->addSuccessor(tailMBB);
35076
35077 // Touch the block then extend it. This is done on the opposite side of a
35078 // static probe, where we allocate then touch, to avoid having to probe the
35079 // tail of the static alloca. Possible scenarios are:
35080 //
35081 // + ---- <- ------------ <- ------------- <- ------------ +
35082 // | |
35083 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
35084 // | |
35085 // + <- ----------- <- ------------ <- ----------- <- ------------ +
35086 //
35087 // The property we want to enforce is to never have more than [page alloc] between two probes.
35088
35089 const unsigned XORMIOpc =
35090 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
35091 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
35092 .addImm(0);
35093
35094 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
35095 physSPReg)
35096 .addReg(physSPReg)
35097 .addImm(ProbeSize);
35098
35099 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
35100 blockMBB->addSuccessor(testMBB);
35101
35102 // Replace original instruction by the expected stack ptr
35103 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
35104 MI.getOperand(0).getReg())
35105 .addReg(FinalStackPtr);
35106
35107 tailMBB->splice(tailMBB->end(), MBB,
35108 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35109 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
35110 MBB->addSuccessor(testMBB);
35111
35112 // Delete the original pseudo instruction.
35113 MI.eraseFromParent();
35114
35115 // And we're done.
35116 return tailMBB;
35117}
35118
35119MachineBasicBlock *
35120X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
35121 MachineBasicBlock *BB) const {
35122 MachineFunction *MF = BB->getParent();
35123 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35124 const MIMetadata MIMD(MI);
35125 const BasicBlock *LLVM_BB = BB->getBasicBlock();
35126
35127 assert(MF->shouldSplitStack());
35128
35129 const bool Is64Bit = Subtarget.is64Bit();
35130 const bool IsLP64 = Subtarget.isTarget64BitLP64();
35131
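  // The TlsOffset displacements below (0x70 for LP64, 0x40 for x32, 0x30 for
  // 32-bit) address the per-thread stack-limit slot used by segmented stacks;
  // they are presumed to match the TCB layout the __morestack runtime expects.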
35132 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
35133 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
35134
35135 // BB:
35136 // ... [Till the alloca]
35137 // If stacklet is not large enough, jump to mallocMBB
35138 //
35139 // bumpMBB:
35140 // Allocate by subtracting from RSP
35141 // Jump to continueMBB
35142 //
35143 // mallocMBB:
35144 // Allocate by call to runtime
35145 //
35146 // continueMBB:
35147 // ...
35148 // [rest of original BB]
35149 //
35150
35151 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35152 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35153 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35154
35155 MachineRegisterInfo &MRI = MF->getRegInfo();
35156 const TargetRegisterClass *AddrRegClass =
35157 getRegClassFor(getPointerTy(MF->getDataLayout()));
35158
35159 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
35160 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
35161 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
35162 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
35163 sizeVReg = MI.getOperand(1).getReg(),
35164 physSPReg =
35165 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
35166
35167 MachineFunction::iterator MBBIter = ++BB->getIterator();
35168
35169 MF->insert(MBBIter, bumpMBB);
35170 MF->insert(MBBIter, mallocMBB);
35171 MF->insert(MBBIter, continueMBB);
35172
35173 continueMBB->splice(continueMBB->begin(), BB,
35174 std::next(MachineBasicBlock::iterator(MI)), BB->end());
35175 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
35176
35177 // Add code to the main basic block to check if the stack limit has been hit,
35178 // and if so, jump to mallocMBB otherwise to bumpMBB.
35179 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
35180 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
35181 .addReg(tmpSPVReg).addReg(sizeVReg);
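  // The CMP's memory operand is [base=0, scale=1, index=0, disp=TlsOffset,
  // segment=TlsReg], i.e. the stack-limit word at %fs/%gs:TlsOffset, compared
  // against the prospective new stack pointer.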
35182 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
35183 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
35184 .addReg(SPLimitVReg);
35185 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
35186
35187 // bumpMBB simply decreases the stack pointer, since we know the current
35188 // stacklet has enough space.
35189 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
35190 .addReg(SPLimitVReg);
35191 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
35192 .addReg(SPLimitVReg);
35193 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
35194
35195 // Calls into a routine in libgcc to allocate more space from the heap.
35196 const uint32_t *RegMask =
35197 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
35198 if (IsLP64) {
35199 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
35200 .addReg(sizeVReg);
35201 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
35202 .addExternalSymbol("__morestack_allocate_stack_space")
35203 .addRegMask(RegMask)
35204 .addReg(X86::RDI, RegState::Implicit)
35205 .addReg(X86::RAX, RegState::ImplicitDefine);
35206 } else if (Is64Bit) {
35207 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
35208 .addReg(sizeVReg);
35209 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
35210 .addExternalSymbol("__morestack_allocate_stack_space")
35211 .addRegMask(RegMask)
35212 .addReg(X86::EDI, RegState::Implicit)
35213 .addReg(X86::EAX, RegState::ImplicitDefine);
35214 } else {
35215 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
35216 .addImm(12);
35217 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
35218 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
35219 .addExternalSymbol("__morestack_allocate_stack_space")
35220 .addRegMask(RegMask)
35221 .addReg(X86::EAX, RegState::ImplicitDefine);
35222 }
35223
35224 if (!Is64Bit)
35225 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
35226 .addImm(16);
35227
35228 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
35229 .addReg(IsLP64 ? X86::RAX : X86::EAX);
35230 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
35231
35232 // Set up the CFG correctly.
35233 BB->addSuccessor(bumpMBB);
35234 BB->addSuccessor(mallocMBB);
35235 mallocMBB->addSuccessor(continueMBB);
35236 bumpMBB->addSuccessor(continueMBB);
35237
35238 // Take care of the PHI nodes.
35239 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
35240 MI.getOperand(0).getReg())
35241 .addReg(mallocPtrVReg)
35242 .addMBB(mallocMBB)
35243 .addReg(bumpSPPtrVReg)
35244 .addMBB(bumpMBB);
35245
35246 // Delete the original pseudo instruction.
35247 MI.eraseFromParent();
35248
35249 // And we're done.
35250 return continueMBB;
35251}
35252
35253MachineBasicBlock *
35254X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
35255 MachineBasicBlock *BB) const {
35256 MachineFunction *MF = BB->getParent();
35257 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
35258 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
35259 const MIMetadata MIMD(MI);
35260
35263 "SEH does not use catchret!");
35264
35265 // Only 32-bit EH needs to worry about manually restoring stack pointers.
35266 if (!Subtarget.is32Bit())
35267 return BB;
35268
35269 // C++ EH creates a new target block to hold the restore code, and wires up
35270 // the new block to the return destination with a normal JMP_4.
35271 MachineBasicBlock *RestoreMBB =
35272 MF->CreateMachineBasicBlock(BB->getBasicBlock());
35273 assert(BB->succ_size() == 1);
35274 MF->insert(std::next(BB->getIterator()), RestoreMBB);
35275 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
35276 BB->addSuccessor(RestoreMBB);
35277 MI.getOperand(0).setMBB(RestoreMBB);
35278
35279 // Marking this as an EH pad but not a funclet entry block causes PEI to
35280 // restore stack pointers in the block.
35281 RestoreMBB->setIsEHPad(true);
35282
35283 auto RestoreMBBI = RestoreMBB->begin();
35284 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
35285 return BB;
35286}
35287
35288MachineBasicBlock *
35289X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
35290 MachineBasicBlock *BB) const {
35291 // So, here we replace TLSADDR with the sequence:
35292 // adjust_stackdown -> TLSADDR -> adjust_stackup.
35293 // We need this because TLSADDR is lowered into calls
35294 // inside MC, therefore without the two markers shrink-wrapping
35295 // may push the prologue/epilogue past them.
35296 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
35297 const MIMetadata MIMD(MI);
35298 MachineFunction &MF = *BB->getParent();
35299
35300 // Emit CALLSEQ_START right before the instruction.
35301 BB->getParent()->getFrameInfo().setAdjustsStack(true);
35302 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
35303 MachineInstrBuilder CallseqStart =
35304 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
35305 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
35306
35307 // Emit CALLSEQ_END right after the instruction.
35308 // We don't call erase from parent because we want to keep the
35309 // original instruction around.
35310 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
35311 MachineInstrBuilder CallseqEnd =
35312 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
35313 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
35314
35315 return BB;
35316}
35317
35318MachineBasicBlock *
35319X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
35320 MachineBasicBlock *BB) const {
35321 // This is pretty easy. We're taking the value that we received from
35322 // our load from the relocation, sticking it in either RDI (x86-64)
35323 // or EAX and doing an indirect call. The return value will then
35324 // be in the normal return register.
35325 MachineFunction *F = BB->getParent();
35326 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35327 const MIMetadata MIMD(MI);
35328
35329 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
35330 assert(MI.getOperand(3).isGlobal() && "This should be a global");
35331
35332 // Get a register mask for the lowered call.
35333 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
35334 // proper register mask.
35335 const uint32_t *RegMask =
35336 Subtarget.is64Bit() ?
35337 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
35338 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
35339 if (Subtarget.is64Bit()) {
35340 MachineInstrBuilder MIB =
35341 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
35342 .addReg(X86::RIP)
35343 .addImm(0)
35344 .addReg(0)
35345 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35346 MI.getOperand(3).getTargetFlags())
35347 .addReg(0);
35348 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
35349 addDirectMem(MIB, X86::RDI);
35350 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
35351 } else if (!isPositionIndependent()) {
35352 MachineInstrBuilder MIB =
35353 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
35354 .addReg(0)
35355 .addImm(0)
35356 .addReg(0)
35357 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35358 MI.getOperand(3).getTargetFlags())
35359 .addReg(0);
35360 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
35361 addDirectMem(MIB, X86::EAX);
35362 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
35363 } else {
35364 MachineInstrBuilder MIB =
35365 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
35366 .addReg(TII->getGlobalBaseReg(F))
35367 .addImm(0)
35368 .addReg(0)
35369 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35370 MI.getOperand(3).getTargetFlags())
35371 .addReg(0);
35372 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
35373 addDirectMem(MIB, X86::EAX);
35374 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
35375 }
35376
35377 MI.eraseFromParent(); // The pseudo instruction is gone now.
35378 return BB;
35379}
35380
35381static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
35382 switch (RPOpc) {
35383 case X86::INDIRECT_THUNK_CALL32:
35384 return X86::CALLpcrel32;
35385 case X86::INDIRECT_THUNK_CALL64:
35386 return X86::CALL64pcrel32;
35387 case X86::INDIRECT_THUNK_TCRETURN32:
35388 return X86::TCRETURNdi;
35389 case X86::INDIRECT_THUNK_TCRETURN64:
35390 return X86::TCRETURNdi64;
35391 }
35392 llvm_unreachable("not indirect thunk opcode");
35393}
35394
35395static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
35396 unsigned Reg) {
35397 if (Subtarget.useRetpolineExternalThunk()) {
35398 // When using an external thunk for retpolines, we pick names that match the
35399 // names GCC happens to use as well. This helps simplify the implementation
35400 // of the thunks for kernels where they have no easy ability to create
35401 // aliases and are doing non-trivial configuration of the thunk's body. For
35402 // example, the Linux kernel will do boot-time hot patching of the thunk
35403 // bodies and cannot easily export aliases of these to loaded modules.
35404 //
35405 // Note that at any point in the future, we may need to change the semantics
35406 // of how we implement retpolines and at that time will likely change the
35407 // name of the called thunk. Essentially, there is no hard guarantee that
35408 // LLVM will generate calls to specific thunks, we merely make a best-effort
35409 // attempt to help out kernels and other systems where duplicating the
35410 // thunks is costly.
35411 switch (Reg) {
35412 case X86::EAX:
35413 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35414 return "__x86_indirect_thunk_eax";
35415 case X86::ECX:
35416 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35417 return "__x86_indirect_thunk_ecx";
35418 case X86::EDX:
35419 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35420 return "__x86_indirect_thunk_edx";
35421 case X86::EDI:
35422 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35423 return "__x86_indirect_thunk_edi";
35424 case X86::R11:
35425 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35426 return "__x86_indirect_thunk_r11";
35427 }
35428 llvm_unreachable("unexpected reg for external indirect thunk");
35429 }
35430
35431 if (Subtarget.useRetpolineIndirectCalls() ||
35432 Subtarget.useRetpolineIndirectBranches()) {
35433 // When targeting an internal COMDAT thunk use an LLVM-specific name.
35434 switch (Reg) {
35435 case X86::EAX:
35436 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35437 return "__llvm_retpoline_eax";
35438 case X86::ECX:
35439 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35440 return "__llvm_retpoline_ecx";
35441 case X86::EDX:
35442 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35443 return "__llvm_retpoline_edx";
35444 case X86::EDI:
35445 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35446 return "__llvm_retpoline_edi";
35447 case X86::R11:
35448 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35449 return "__llvm_retpoline_r11";
35450 }
35451 llvm_unreachable("unexpected reg for retpoline");
35452 }
35453
35454 if (Subtarget.useLVIControlFlowIntegrity()) {
35455 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35456 return "__llvm_lvi_thunk_r11";
35457 }
35458 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
35459}
35460
35461MachineBasicBlock *
35462X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
35463 MachineBasicBlock *BB) const {
35464 // Copy the virtual register into the R11 physical register and
35465 // call the retpoline thunk.
35466 const MIMetadata MIMD(MI);
35467 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35468 Register CalleeVReg = MI.getOperand(0).getReg();
35469 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
35470
35471 // Find an available scratch register to hold the callee. On 64-bit, we can
35472 // just use R11, but we scan for uses anyway to ensure we don't generate
35473 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
35474 // already a register use operand to the call to hold the callee. If none
35475 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
35476 // register and ESI is the base pointer to realigned stack frames with VLAs.
35477 SmallVector<unsigned, 3> AvailableRegs;
35478 if (Subtarget.is64Bit())
35479 AvailableRegs.push_back(X86::R11);
35480 else
35481 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
35482
35483 // Zero out any registers that are already used.
35484 for (const auto &MO : MI.operands()) {
35485 if (MO.isReg() && MO.isUse())
35486 for (unsigned &Reg : AvailableRegs)
35487 if (Reg == MO.getReg())
35488 Reg = 0;
35489 }
35490
35491 // Choose the first remaining non-zero available register.
35492 unsigned AvailableReg = 0;
35493 for (unsigned MaybeReg : AvailableRegs) {
35494 if (MaybeReg) {
35495 AvailableReg = MaybeReg;
35496 break;
35497 }
35498 }
35499 if (!AvailableReg)
35500 report_fatal_error("calling convention incompatible with retpoline, no "
35501 "available registers");
35502
35503 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
35504
35505 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
35506 .addReg(CalleeVReg);
35507 MI.getOperand(0).ChangeToES(Symbol);
35508 MI.setDesc(TII->get(Opc));
35509 MachineInstrBuilder(*BB->getParent(), &MI)
35510 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
35511 return BB;
35512}
35513
35514/// SetJmp implies future control flow change upon calling the corresponding
35515/// LongJmp.
35516/// Instead of using the 'return' instruction, the long jump fixes the stack and
35517/// performs an indirect branch. To do so it uses the registers that were stored
35518/// in the jump buffer (when calling SetJmp).
35519/// In case the shadow stack is enabled we need to fix it as well, because some
35520/// return addresses will be skipped.
35521/// The function will save the SSP for future fixing in the function
35522/// emitLongJmpShadowStackFix.
35523/// \sa emitLongJmpShadowStackFix
35524/// \param [in] MI The temporary Machine Instruction for the builtin.
35525/// \param [in] MBB The Machine Basic Block that will be modified.
35526void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
35527 MachineBasicBlock *MBB) const {
35528 const MIMetadata MIMD(MI);
35529 MachineFunction *MF = MBB->getParent();
35530 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35531 MachineRegisterInfo &MRI = MF->getRegInfo();
35532 MachineInstrBuilder MIB;
35533
35534 // Memory Reference.
35535 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35536 MI.memoperands_end());
35537
35538 // Initialize a register with zero.
35539 MVT PVT = getPointerTy(MF->getDataLayout());
35540 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35541 Register ZReg = MRI.createVirtualRegister(PtrRC);
35542 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
35543 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
35544 .addDef(ZReg)
35545 .addReg(ZReg, RegState::Undef)
35546 .addReg(ZReg, RegState::Undef);
35547
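  // If shadow stacks are not active, RDSSP leaves its destination untouched,
  // so the zero written above lets later code detect that case.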
35548 // Read the current SSP Register value to the zeroed register.
35549 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35550 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35551 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35552
35553 // Write the SSP register value to slot 3 (in pointer-sized units) of the input jump buffer.
35554 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35555 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
35556 const int64_t SSPOffset = 3 * PVT.getStoreSize();
35557 const unsigned MemOpndSlot = 1;
35558 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35559 if (i == X86::AddrDisp)
35560 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
35561 else
35562 MIB.add(MI.getOperand(MemOpndSlot + i));
35563 }
35564 MIB.addReg(SSPCopyReg);
35565 MIB.setMemRefs(MMOs);
35566}
35567
35569X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
35570 MachineBasicBlock *MBB) const {
35571 const MIMetadata MIMD(MI);
35572 MachineFunction *MF = MBB->getParent();
35573 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35574 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35575 MachineRegisterInfo &MRI = MF->getRegInfo();
35576
35577 const BasicBlock *BB = MBB->getBasicBlock();
35578 MachineFunction::iterator I = ++MBB->getIterator();
35579
35580 // Memory Reference
35581 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35582 MI.memoperands_end());
35583
35584 unsigned DstReg;
35585 unsigned MemOpndSlot = 0;
35586
35587 unsigned CurOp = 0;
35588
35589 DstReg = MI.getOperand(CurOp++).getReg();
35590 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35591 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
35592 (void)TRI;
35593 Register mainDstReg = MRI.createVirtualRegister(RC);
35594 Register restoreDstReg = MRI.createVirtualRegister(RC);
35595
35596 MemOpndSlot = CurOp;
35597
35598 MVT PVT = getPointerTy(MF->getDataLayout());
35599 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35600 "Invalid Pointer Size!");
35601
35602 // For v = setjmp(buf), we generate
35603 //
35604 // thisMBB:
35605 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
35606 // SjLjSetup restoreMBB
35607 //
35608 // mainMBB:
35609 // v_main = 0
35610 //
35611 // sinkMBB:
35612 // v = phi(main, restore)
35613 //
35614 // restoreMBB:
35615 // if base pointer being used, load it from frame
35616 // v_restore = 1
35617
35618 MachineBasicBlock *thisMBB = MBB;
35619 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35620 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35621 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
35622 MF->insert(I, mainMBB);
35623 MF->insert(I, sinkMBB);
35624 MF->push_back(restoreMBB);
35625 restoreMBB->setMachineBlockAddressTaken();
35626
35627 MachineInstrBuilder MIB;
35628
35629 // Transfer the remainder of BB and its successor edges to sinkMBB.
35630 sinkMBB->splice(sinkMBB->begin(), MBB,
35631 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35632 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35633
35634 // thisMBB:
35635 unsigned PtrStoreOpc = 0;
35636 unsigned LabelReg = 0;
35637 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35638 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35639 !isPositionIndependent();
35640
35641 // Prepare IP either in reg or imm.
35642 if (!UseImmLabel) {
35643 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35644 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35645 LabelReg = MRI.createVirtualRegister(PtrRC);
35646 if (Subtarget.is64Bit()) {
35647 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
35648 .addReg(X86::RIP)
35649 .addImm(0)
35650 .addReg(0)
35651 .addMBB(restoreMBB)
35652 .addReg(0);
35653 } else {
35654 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
35655 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
35656 .addReg(XII->getGlobalBaseReg(MF))
35657 .addImm(0)
35658 .addReg(0)
35659 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
35660 .addReg(0);
35661 }
35662 } else
35663 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35664 // Store IP
35665 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
35666 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35667 if (i == X86::AddrDisp)
35668 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
35669 else
35670 MIB.add(MI.getOperand(MemOpndSlot + i));
35671 }
35672 if (!UseImmLabel)
35673 MIB.addReg(LabelReg);
35674 else
35675 MIB.addMBB(restoreMBB);
35676 MIB.setMemRefs(MMOs);
35677
35678 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35679 emitSetJmpShadowStackFix(MI, thisMBB);
35680 }
35681
35682 // Setup
35683 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
35684 .addMBB(restoreMBB);
35685
35686 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35687 MIB.addRegMask(RegInfo->getNoPreservedMask());
35688 thisMBB->addSuccessor(mainMBB);
35689 thisMBB->addSuccessor(restoreMBB);
35690
35691 // mainMBB:
35692 // EAX = 0
35693 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
35694 mainMBB->addSuccessor(sinkMBB);
35695
35696 // sinkMBB:
35697 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35698 .addReg(mainDstReg)
35699 .addMBB(mainMBB)
35700 .addReg(restoreDstReg)
35701 .addMBB(restoreMBB);
35702
35703 // restoreMBB:
35704 if (RegInfo->hasBasePointer(*MF)) {
35705 const bool Uses64BitFramePtr =
35706 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35707 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
35708 X86FI->setRestoreBasePointer(MF);
35709 Register FramePtr = RegInfo->getFrameRegister(*MF);
35710 Register BasePtr = RegInfo->getBaseRegister();
35711 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
35712 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
35713 FramePtr, true, X86FI->getRestoreBasePointerOffset())
35714 .setMIFlag(MachineInstr::FrameSetup);
35715 }
35716 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
35717 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35718 restoreMBB->addSuccessor(sinkMBB);
35719
35720 MI.eraseFromParent();
35721 return sinkMBB;
35722}
35723
35724/// Fix the shadow stack using the previously saved SSP pointer.
35725/// \sa emitSetJmpShadowStackFix
35726/// \param [in] MI The temporary Machine Instruction for the builtin.
35727/// \param [in] MBB The Machine Basic Block that will be modified.
35728/// \return The sink MBB that will perform the future indirect branch.
35729MachineBasicBlock *
35730X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
35731 MachineBasicBlock *MBB) const {
35732 const MIMetadata MIMD(MI);
35733 MachineFunction *MF = MBB->getParent();
35734 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35735 MachineRegisterInfo &MRI = MF->getRegInfo();
35736
35737 // Memory Reference
35738 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35739 MI.memoperands_end());
35740
35741 MVT PVT = getPointerTy(MF->getDataLayout());
35742 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35743
35744 // checkSspMBB:
35745 // xor vreg1, vreg1
35746 // rdssp vreg1
35747 // test vreg1, vreg1
35748 // je sinkMBB # Jump if Shadow Stack is not supported
35749 // fallMBB:
35750 // mov buf+24/12(%rip), vreg2
35751 // sub vreg1, vreg2
35752 // jbe sinkMBB # No need to fix the Shadow Stack
35753 // fixShadowMBB:
35754 // shr 3/2, vreg2
35755 // incssp vreg2 # fix the SSP according to the lower 8 bits
35756 // shr 8, vreg2
35757 // je sinkMBB
35758 // fixShadowLoopPrepareMBB:
35759 // shl vreg2
35760 // mov 128, vreg3
35761 // fixShadowLoopMBB:
35762 // incssp vreg3
35763 // dec vreg2
35764 // jne fixShadowLoopMBB # Iterate until you finish fixing
35765 // # the Shadow Stack
35766 // sinkMBB:
35767
35768 MachineFunction::iterator I = ++MBB->getIterator();
35769 const BasicBlock *BB = MBB->getBasicBlock();
35770
35771 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
35772 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35773 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
35774 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
35775 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
35776 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35777 MF->insert(I, checkSspMBB);
35778 MF->insert(I, fallMBB);
35779 MF->insert(I, fixShadowMBB);
35780 MF->insert(I, fixShadowLoopPrepareMBB);
35781 MF->insert(I, fixShadowLoopMBB);
35782 MF->insert(I, sinkMBB);
35783
35784 // Transfer the remainder of BB and its successor edges to sinkMBB.
35785 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
35786 MBB->end());
35787 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35788
35789 MBB->addSuccessor(checkSspMBB);
35790
35791 // Initialize a register with zero.
35792 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
35793 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
35794
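  // On 64-bit targets, widen the zeroed GR32 to 64 bits. SUBREG_TO_REG encodes
  // that the MOV32r0 above already cleared the upper 32 bits.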
35795 if (PVT == MVT::i64) {
35796 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
35797 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
35798 .addImm(0)
35799 .addReg(ZReg)
35800 .addImm(X86::sub_32bit);
35801 ZReg = TmpZReg;
35802 }
35803
35804 // Read the current SSP Register value to the zeroed register.
35805 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35806 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35807 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35808
35809 // Check whether the result of the SSP register is zero and jump directly
35810 // to the sink.
35811 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
35812 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
35813 .addReg(SSPCopyReg)
35814 .addReg(SSPCopyReg);
35815 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
35816 .addMBB(sinkMBB)
35817 .addImm(X86::COND_E);
35818 checkSspMBB->addSuccessor(sinkMBB);
35819 checkSspMBB->addSuccessor(fallMBB);
35820
35821 // Reload the previously saved SSP register value.
35822 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
35823 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35824 const int64_t SPPOffset = 3 * PVT.getStoreSize();
35825 MachineInstrBuilder MIB =
35826 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
35827 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35828 const MachineOperand &MO = MI.getOperand(i);
35829 if (i == X86::AddrDisp)
35830 MIB.addDisp(MO, SPPOffset);
35831 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35832 // preserve kill flags.
35833 MIB.addReg(MO.getReg());
35834 else
35835 MIB.add(MO);
35836 }
35837 MIB.setMemRefs(MMOs);
35838
35839 // Subtract the current SSP from the previous SSP.
35840 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
35841 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
35842 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
35843 .addReg(PrevSSPReg)
35844 .addReg(SSPCopyReg);
35845
35846 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
35847 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
35848 .addMBB(sinkMBB)
35849 .addImm(X86::COND_BE);
35850 fallMBB->addSuccessor(sinkMBB);
35851 fallMBB->addSuccessor(fixShadowMBB);
35852
35853 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
35854 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
35855 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
35856 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
35857 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
35858 .addReg(SspSubReg)
35859 .addImm(Offset);
35860
35861 // Increase SSP when looking only on the lower 8 bits of the delta.
35862 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
35863 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
35864
35865 // Reset the lower 8 bits.
35866 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
35867 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
35868 .addReg(SspFirstShrReg)
35869 .addImm(8);
35870
35871 // Jump if the result of the shift is zero.
35872 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
35873 .addMBB(sinkMBB)
35874 .addImm(X86::COND_E);
35875 fixShadowMBB->addSuccessor(sinkMBB);
35876 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
35877
35878 // Do a single shift left.
35879 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
35880 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
35881 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
35882 .addReg(SspSecondShrReg)
35883 .addImm(1);
35884
35885 // Save the value 128 to a register (will be used next with incssp).
35886 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
35887 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
35888 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
35889 .addImm(128);
35890 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
35891
35892 // Since incssp only looks at the lower 8 bits, we might need to do several
35893 // iterations of incssp until we finish fixing the shadow stack.
35894 Register DecReg = MRI.createVirtualRegister(PtrRC);
35895 Register CounterReg = MRI.createVirtualRegister(PtrRC);
35896 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
35897 .addReg(SspAfterShlReg)
35898 .addMBB(fixShadowLoopPrepareMBB)
35899 .addReg(DecReg)
35900 .addMBB(fixShadowLoopMBB);
35901
35902 // Every iteration we increase the SSP by 128.
35903 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
35904
35905 // Every iteration we decrement the counter by 1.
35906 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
35907 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
35908
35909 // Jump if the counter is not zero yet.
35910 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
35911 .addMBB(fixShadowLoopMBB)
35912 .addImm(X86::COND_NE);
35913 fixShadowLoopMBB->addSuccessor(sinkMBB);
35914 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
35915
35916 return sinkMBB;
35917}
35918
35919MachineBasicBlock *
35920X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
35921 MachineBasicBlock *MBB) const {
35922 const MIMetadata MIMD(MI);
35923 MachineFunction *MF = MBB->getParent();
35924 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35925 MachineRegisterInfo &MRI = MF->getRegInfo();
35926
35927 // Memory Reference
35928 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35929 MI.memoperands_end());
35930
35931 MVT PVT = getPointerTy(MF->getDataLayout());
35932 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35933 "Invalid Pointer Size!");
35934
35935 const TargetRegisterClass *RC =
35936 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
35937 Register Tmp = MRI.createVirtualRegister(RC);
35938 // Since FP is only updated here but NOT referenced, it's treated as GPR.
35939 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35940 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
35941 Register SP = RegInfo->getStackRegister();
35942
35943 MachineInstrBuilder MIB;
35944
35945 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35946 const int64_t SPOffset = 2 * PVT.getStoreSize();
35947
35948 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35949 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
35950
35951 MachineBasicBlock *thisMBB = MBB;
35952
35953 // When CET and shadow stacks are enabled, we need to fix the Shadow Stack.
35954 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35955 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
35956 }
35957
35958 // Reload FP
35959 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
35960 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35961 const MachineOperand &MO = MI.getOperand(i);
35962 if (MO.isReg()) // Don't add the whole operand, we don't want to
35963 // preserve kill flags.
35964 MIB.addReg(MO.getReg());
35965 else
35966 MIB.add(MO);
35967 }
35968 MIB.setMemRefs(MMOs);
35969
35970 // Reload IP
35971 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
35972 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35973 const MachineOperand &MO = MI.getOperand(i);
35974 if (i == X86::AddrDisp)
35975 MIB.addDisp(MO, LabelOffset);
35976 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35977 // preserve kill flags.
35978 MIB.addReg(MO.getReg());
35979 else
35980 MIB.add(MO);
35981 }
35982 MIB.setMemRefs(MMOs);
35983
35984 // Reload SP
35985 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
35986 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35987 if (i == X86::AddrDisp)
35988 MIB.addDisp(MI.getOperand(i), SPOffset);
35989 else
35990 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
35991 // the last instruction of the expansion.
35992 }
35993 MIB.setMemRefs(MMOs);
35994
35995 // Jump
35996 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
35997
35998 MI.eraseFromParent();
35999 return thisMBB;
36000}
36001
36002void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
36003 MachineBasicBlock *MBB,
36004 MachineBasicBlock *DispatchBB,
36005 int FI) const {
36006 const MIMetadata MIMD(MI);
36007 MachineFunction *MF = MBB->getParent();
36008 MachineRegisterInfo *MRI = &MF->getRegInfo();
36009 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36010
36011 MVT PVT = getPointerTy(MF->getDataLayout());
36012 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
36013
36014 unsigned Op = 0;
36015 unsigned VR = 0;
36016
36017 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36018 !isPositionIndependent();
36019
36020 if (UseImmLabel) {
36021 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
36022 } else {
36023 const TargetRegisterClass *TRC =
36024 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
36025 VR = MRI->createVirtualRegister(TRC);
36026 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36027
36028 if (Subtarget.is64Bit())
36029 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
36030 .addReg(X86::RIP)
36031 .addImm(1)
36032 .addReg(0)
36033 .addMBB(DispatchBB)
36034 .addReg(0);
36035 else
36036 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
36037 .addReg(0) /* TII->getGlobalBaseReg(MF) */
36038 .addImm(1)
36039 .addReg(0)
36040 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
36041 .addReg(0);
36042 }
36043
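  // Store the dispatch block's address into the SjLj function context at
  // frame index FI; the 56/36 byte displacement selects that slot in the
  // 64-/32-bit context layout.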
36044 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
36045 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
36046 if (UseImmLabel)
36047 MIB.addMBB(DispatchBB);
36048 else
36049 MIB.addReg(VR);
36050}
36051
36052MachineBasicBlock *
36053X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
36054 MachineBasicBlock *BB) const {
36055 const MIMetadata MIMD(MI);
36056 MachineFunction *MF = BB->getParent();
36057 MachineRegisterInfo *MRI = &MF->getRegInfo();
36058 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36059 int FI = MF->getFrameInfo().getFunctionContextIndex();
36060
36061 // Get a mapping of the call site numbers to all of the landing pads they're
36062 // associated with.
36063 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
36064 unsigned MaxCSNum = 0;
36065 for (auto &MBB : *MF) {
36066 if (!MBB.isEHPad())
36067 continue;
36068
36069 MCSymbol *Sym = nullptr;
36070 for (const auto &MI : MBB) {
36071 if (MI.isDebugInstr())
36072 continue;
36073
36074 assert(MI.isEHLabel() && "expected EH_LABEL");
36075 Sym = MI.getOperand(0).getMCSymbol();
36076 break;
36077 }
36078
36079 if (!MF->hasCallSiteLandingPad(Sym))
36080 continue;
36081
36082 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
36083 CallSiteNumToLPad[CSI].push_back(&MBB);
36084 MaxCSNum = std::max(MaxCSNum, CSI);
36085 }
36086 }
36087
36088 // Get an ordered list of the machine basic blocks for the jump table.
36089 std::vector<MachineBasicBlock *> LPadList;
36090 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
36091 LPadList.reserve(CallSiteNumToLPad.size());
36092
36093 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
36094 for (auto &LP : CallSiteNumToLPad[CSI]) {
36095 LPadList.push_back(LP);
36096 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
36097 }
36098 }
36099
36100 assert(!LPadList.empty() &&
36101 "No landing pad destinations for the dispatch jump table!");
36102
36103 // Create the MBBs for the dispatch code.
36104
36105 // Shove the dispatch's address into the return slot in the function context.
36106 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
36107 DispatchBB->setIsEHPad(true);
36108
36109 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
36110 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
36111 DispatchBB->addSuccessor(TrapBB);
36112
36113 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
36114 DispatchBB->addSuccessor(DispContBB);
36115
36116 // Insert MBBs.
36117 MF->push_back(DispatchBB);
36118 MF->push_back(DispContBB);
36119 MF->push_back(TrapBB);
36120
36121 // Insert code into the entry block that creates and registers the function
36122 // context.
36123 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
36124
36125 // Create the jump table and associated information
36126 unsigned JTE = getJumpTableEncoding();
36127 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
36128 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
36129
36130 const X86RegisterInfo &RI = TII->getRegisterInfo();
36131 // Add a register mask with no preserved registers. This results in all
36132 // registers being marked as clobbered.
36133 if (RI.hasBasePointer(*MF)) {
36134 const bool FPIs64Bit =
36135 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
36136 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
36137 MFI->setRestoreBasePointer(MF);
36138
36139 Register FP = RI.getFrameRegister(*MF);
36140 Register BP = RI.getBaseRegister();
36141 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
36142 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
36143 MFI->getRestoreBasePointerOffset())
36144 .addRegMask(RI.getNoPreservedMask());
36145 } else {
36146 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
36147 .addRegMask(RI.getNoPreservedMask());
36148 }
36149
36150 // IReg is used as an index in a memory operand and therefore can't be SP
36151 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
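  // Load the call-site index that the EH runtime stored in the function
  // context; it selects which landing pad the dispatch jumps to.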
36152 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
36153 Subtarget.is64Bit() ? 8 : 4);
36154 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
36155 .addReg(IReg)
36156 .addImm(LPadList.size());
36157 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
36158 .addMBB(TrapBB)
36159 .addImm(X86::COND_AE);
36160
36161 if (Subtarget.is64Bit()) {
36162 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
36163 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
36164
36165 // leaq .LJTI0_0(%rip), BReg
36166 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
36167 .addReg(X86::RIP)
36168 .addImm(1)
36169 .addReg(0)
36170 .addJumpTableIndex(MJTI)
36171 .addReg(0);
36172 // movzx IReg64, IReg
36173 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
36174 .addImm(0)
36175 .addReg(IReg)
36176 .addImm(X86::sub_32bit);
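    // IReg was produced by a 32-bit load, which zero-extends in 64-bit mode,
    // so SUBREG_TO_REG yields the zero-extended 64-bit index.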
36177
36178 switch (JTE) {
36179 case MachineJumpTableInfo::EK_BlockAddress:
36180 // jmpq *(BReg,IReg64,8)
36181 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
36182 .addReg(BReg)
36183 .addImm(8)
36184 .addReg(IReg64)
36185 .addImm(0)
36186 .addReg(0);
36187 break;
36188 case MachineJumpTableInfo::EK_LabelDifference32: {
36189 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
36190 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
36191 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
36192
36193 // movl (BReg,IReg64,4), OReg
36194 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
36195 .addReg(BReg)
36196 .addImm(4)
36197 .addReg(IReg64)
36198 .addImm(0)
36199 .addReg(0);
36200 // movsx OReg64, OReg
36201 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
36202 .addReg(OReg);
36203 // addq BReg, OReg64, TReg
36204 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
36205 .addReg(OReg64)
36206 .addReg(BReg);
36207 // jmpq *TReg
36208 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
36209 break;
36210 }
36211 default:
36212 llvm_unreachable("Unexpected jump table encoding");
36213 }
36214 } else {
36215 // jmpl *.LJTI0_0(,IReg,4)
36216 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
36217 .addReg(0)
36218 .addImm(4)
36219 .addReg(IReg)
36220 .addJumpTableIndex(MJTI)
36221 .addReg(0);
36222 }
36223
36224 // Add the jump table entries as successors to the MBB.
36225 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
36226 for (auto &LP : LPadList)
36227 if (SeenMBBs.insert(LP).second)
36228 DispContBB->addSuccessor(LP);
36229
36230 // N.B. the order the invoke BBs are processed in doesn't matter here.
36231 SmallVector<MachineBasicBlock *, 64> MBBLPads;
36232 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
36233 for (MachineBasicBlock *MBB : InvokeBBs) {
36234 // Remove the landing pad successor from the invoke block and replace it
36235 // with the new dispatch block.
36236 // Keep a copy of Successors since it's modified inside the loop.
36237 SmallVector<MachineBasicBlock *, 4> Successors(MBB->succ_rbegin(),
36238 MBB->succ_rend());
36239 // FIXME: Avoid quadratic complexity.
36240 for (auto *MBBS : Successors) {
36241 if (MBBS->isEHPad()) {
36242 MBB->removeSuccessor(MBBS);
36243 MBBLPads.push_back(MBBS);
36244 }
36245 }
36246
36247 MBB->addSuccessor(DispatchBB);
36248
36249 // Find the invoke call and mark all of the callee-saved registers as
36250 // 'implicit defined' so that they're spilled. This prevents later passes
36251 // from moving instructions to before the EH block, where they would never
36252 // be executed.
36253 for (auto &II : reverse(*MBB)) {
36254 if (!II.isCall())
36255 continue;
36256
36257 DenseMap<unsigned, bool> DefRegs;
36258 for (auto &MOp : II.operands())
36259 if (MOp.isReg())
36260 DefRegs[MOp.getReg()] = true;
36261
36262 MachineInstrBuilder MIB(*MF, &II);
36263 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
36264 unsigned Reg = SavedRegs[RegIdx];
36265 if (!DefRegs[Reg])
36266 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
36267 }
36268
36269 break;
36270 }
36271 }
36272
36273 // Mark all former landing pads as non-landing pads. The dispatch is the only
36274 // landing pad now.
36275 for (auto &LP : MBBLPads)
36276 LP->setIsEHPad(false);
36277
36278 // The instruction is gone now.
36279 MI.eraseFromParent();
36280 return BB;
36281}
36282
36283MachineBasicBlock *
36284X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
36285 MachineBasicBlock *BB) const {
36286 MachineFunction *MF = BB->getParent();
36287 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36288 const MIMetadata MIMD(MI);
36289
36290 auto TMMImmToTMMReg = [](unsigned Imm) {
36291 assert (Imm < 8 && "Illegal tmm index");
36292 return X86::TMM0 + Imm;
36293 };
36294 switch (MI.getOpcode()) {
36295 default: llvm_unreachable("Unexpected instr type to insert");
36296 case X86::TLS_addr32:
36297 case X86::TLS_addr64:
36298 case X86::TLS_addrX32:
36299 case X86::TLS_base_addr32:
36300 case X86::TLS_base_addr64:
36301 case X86::TLS_base_addrX32:
36302 case X86::TLS_desc32:
36303 case X86::TLS_desc64:
36304 return EmitLoweredTLSAddr(MI, BB);
36305 case X86::INDIRECT_THUNK_CALL32:
36306 case X86::INDIRECT_THUNK_CALL64:
36307 case X86::INDIRECT_THUNK_TCRETURN32:
36308 case X86::INDIRECT_THUNK_TCRETURN64:
36309 return EmitLoweredIndirectThunk(MI, BB);
36310 case X86::CATCHRET:
36311 return EmitLoweredCatchRet(MI, BB);
36312 case X86::SEG_ALLOCA_32:
36313 case X86::SEG_ALLOCA_64:
36314 return EmitLoweredSegAlloca(MI, BB);
36315 case X86::PROBED_ALLOCA_32:
36316 case X86::PROBED_ALLOCA_64:
36317 return EmitLoweredProbedAlloca(MI, BB);
36318 case X86::TLSCall_32:
36319 case X86::TLSCall_64:
36320 return EmitLoweredTLSCall(MI, BB);
36321 case X86::CMOV_FR16:
36322 case X86::CMOV_FR16X:
36323 case X86::CMOV_FR32:
36324 case X86::CMOV_FR32X:
36325 case X86::CMOV_FR64:
36326 case X86::CMOV_FR64X:
36327 case X86::CMOV_GR8:
36328 case X86::CMOV_GR16:
36329 case X86::CMOV_GR32:
36330 case X86::CMOV_RFP32:
36331 case X86::CMOV_RFP64:
36332 case X86::CMOV_RFP80:
36333 case X86::CMOV_VR64:
36334 case X86::CMOV_VR128:
36335 case X86::CMOV_VR128X:
36336 case X86::CMOV_VR256:
36337 case X86::CMOV_VR256X:
36338 case X86::CMOV_VR512:
36339 case X86::CMOV_VK1:
36340 case X86::CMOV_VK2:
36341 case X86::CMOV_VK4:
36342 case X86::CMOV_VK8:
36343 case X86::CMOV_VK16:
36344 case X86::CMOV_VK32:
36345 case X86::CMOV_VK64:
36346 return EmitLoweredSelect(MI, BB);
36347
36348 case X86::FP80_ADDr:
36349 case X86::FP80_ADDm32: {
36350 // Change the floating point control register to use double extended
36351 // precision when performing the addition.
36352 int OrigCWFrameIdx =
36353 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36354 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
36355 OrigCWFrameIdx);
36356
36357 // Load the old value of the control word...
36358 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36359 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
36360 OrigCWFrameIdx);
36361
36362 // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
36363 // precision.
36364 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36365 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
36366 .addReg(OldCW, RegState::Kill)
36367 .addImm(0x300);
36368
36369 // Extract to 16 bits.
36370 Register NewCW16 =
36371 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
36372 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36373 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
36374
36375 // Prepare memory for FLDCW.
36376 int NewCWFrameIdx =
36377 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36378 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36379 NewCWFrameIdx)
36380 .addReg(NewCW16, RegState::Kill);
36381
36382 // Reload the modified control word now...
36383 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36384 NewCWFrameIdx);
36385
36386 // Do the addition.
36387 if (MI.getOpcode() == X86::FP80_ADDr) {
36388 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
36389 .add(MI.getOperand(0))
36390 .add(MI.getOperand(1))
36391 .add(MI.getOperand(2));
36392 } else {
36393 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
36394 .add(MI.getOperand(0))
36395 .add(MI.getOperand(1))
36396 .add(MI.getOperand(2))
36397 .add(MI.getOperand(3))
36398 .add(MI.getOperand(4))
36399 .add(MI.getOperand(5))
36400 .add(MI.getOperand(6));
36401 }
36402
36403 // Reload the original control word now.
36404 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36405 OrigCWFrameIdx);
36406
36407 MI.eraseFromParent(); // The pseudo instruction is gone now.
36408 return BB;
36409 }
36410
36411 case X86::FP32_TO_INT16_IN_MEM:
36412 case X86::FP32_TO_INT32_IN_MEM:
36413 case X86::FP32_TO_INT64_IN_MEM:
36414 case X86::FP64_TO_INT16_IN_MEM:
36415 case X86::FP64_TO_INT32_IN_MEM:
36416 case X86::FP64_TO_INT64_IN_MEM:
36417 case X86::FP80_TO_INT16_IN_MEM:
36418 case X86::FP80_TO_INT32_IN_MEM:
36419 case X86::FP80_TO_INT64_IN_MEM: {
36420 // Change the floating point control register to use "round towards zero"
36421 // mode when truncating to an integer value.
36422 int OrigCWFrameIdx =
36423 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36424 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
36425 OrigCWFrameIdx);
36426
36427 // Load the old value of the control word...
36428 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36429 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
36430 OrigCWFrameIdx);
36431
36432 // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
36433 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36434 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
36435 .addReg(OldCW, RegState::Kill).addImm(0xC00);
36436
36437 // Extract to 16 bits.
36438 Register NewCW16 =
36439 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
36440 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36441 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
36442
36443 // Prepare memory for FLDCW.
36444 int NewCWFrameIdx =
36445 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36446 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36447 NewCWFrameIdx)
36448 .addReg(NewCW16, RegState::Kill);
36449
36450 // Reload the modified control word now...
36451 addFrameReference(BuildMI(*BB, MI, MIMD,
36452 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
36453
36454 // Get the X86 opcode to use.
36455 unsigned Opc;
36456 switch (MI.getOpcode()) {
36457 // clang-format off
36458 default: llvm_unreachable("illegal opcode!");
36459 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
36460 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
36461 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
36462 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
36463 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
36464 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
36465 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
36466 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
36467 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
36468 // clang-format on
36469 }
36470
36471 X86AddressMode AM = getAddressFromInstr(&MI, 0);
36472 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
36473 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
36474
36475 // Reload the original control word now.
36476 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36477 OrigCWFrameIdx);
36478
36479 MI.eraseFromParent(); // The pseudo instruction is gone now.
36480 return BB;
36481 }
36482
36483 // xbegin
36484 case X86::XBEGIN:
36485 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
36486
36487 case X86::VAARG_64:
36488 case X86::VAARG_X32:
36489 return EmitVAARGWithCustomInserter(MI, BB);
36490
36491 case X86::EH_SjLj_SetJmp32:
36492 case X86::EH_SjLj_SetJmp64:
36493 return emitEHSjLjSetJmp(MI, BB);
36494
36495 case X86::EH_SjLj_LongJmp32:
36496 case X86::EH_SjLj_LongJmp64:
36497 return emitEHSjLjLongJmp(MI, BB);
36498
36499 case X86::Int_eh_sjlj_setup_dispatch:
36500 return EmitSjLjDispatchBlock(MI, BB);
36501
36502 case TargetOpcode::STATEPOINT:
36503 // As an implementation detail, STATEPOINT shares the STACKMAP format at
36504 // this point in the process. We diverge later.
36505 return emitPatchPoint(MI, BB);
36506
36507 case TargetOpcode::STACKMAP:
36508 case TargetOpcode::PATCHPOINT:
36509 return emitPatchPoint(MI, BB);
36510
36511 case TargetOpcode::PATCHABLE_EVENT_CALL:
36512 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
36513 return BB;
36514
36515 case X86::LCMPXCHG8B: {
36516 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36517 // In addition to the 4 E[ABCD] registers implied by the encoding, CMPXCHG8B
36518 // requires a memory operand. If the current architecture is i686 and the
36519 // current function needs a base pointer
36520 // - which is ESI for i686 - the register allocator would not be able to
36521 // allocate registers for an address of the form X(%reg, %reg, Y)
36522 // - there would never be enough unreserved registers during regalloc
36523 // (without the need for a base pointer the only option would be X(%edi, %esi, Y)).
36524 // We give the register allocator a hand by precomputing the address in
36525 // a new vreg using LEA.
36526
36527 // If it is not i686 or there is no base pointer - nothing to do here.
36528 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
36529 return BB;
36530
36531 // Even though this code does not necessarily need the base pointer to
36532 // be ESI, we check for that. The reason: if this assert fails, some
36533 // changes have happened in the compiler's base pointer handling, which most
36534 // probably have to be addressed here somehow.
36535 assert(TRI->getBaseRegister() == X86::ESI &&
36536 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
36537 "base pointer in mind");
36538
36539 MachineRegisterInfo &MRI = MF->getRegInfo();
36540 MVT SPTy = getPointerTy(MF->getDataLayout());
36541 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
36542 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
36543
36544 X86AddressMode AM = getAddressFromInstr(&MI, 0);
36545 // Regalloc does not need any help when the memory operand of CMPXCHG8B
36546 // does not use an index register.
36547 if (AM.IndexReg == X86::NoRegister)
36548 return BB;
36549
36550 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
36551 // four operand definitions that are E[ABCD] registers. We skip them and
36552 // then insert the LEA.
36553 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
36554 while (RMBBI != BB->rend() &&
36555 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
36556 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
36557 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
36558 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
36559 ++RMBBI;
36560 }
36561 MachineBasicBlock::iterator MBBI(RMBBI.getReverse());
36562 addFullAddress(
36563 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
36564
36565 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
36566
36567 return BB;
36568 }
36569 case X86::LCMPXCHG16B_NO_RBX: {
36570 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36571 Register BasePtr = TRI->getBaseRegister();
36572 if (TRI->hasBasePointer(*MF) &&
36573 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
36574 if (!BB->isLiveIn(BasePtr))
36575 BB->addLiveIn(BasePtr);
36576 // Save RBX into a virtual register.
36577 Register SaveRBX =
36578 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36579 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36580 .addReg(X86::RBX);
36581 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36582 MachineInstrBuilder MIB =
36583 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
36584 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36585 MIB.add(MI.getOperand(Idx));
36586 MIB.add(MI.getOperand(X86::AddrNumOperands));
36587 MIB.addReg(SaveRBX);
36588 } else {
36589 // Simple case, just copy the virtual register to RBX.
36590 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
36591 .add(MI.getOperand(X86::AddrNumOperands));
36592 MachineInstrBuilder MIB =
36593 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
36594 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36595 MIB.add(MI.getOperand(Idx));
36596 }
36597 MI.eraseFromParent();
36598 return BB;
36599 }
36600 case X86::MWAITX: {
36601 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36602 Register BasePtr = TRI->getBaseRegister();
36603 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
36604 // If there is no need to save the base pointer, we generate MWAITXrrr;
36605 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
36606 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
36607 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36608 .addReg(MI.getOperand(0).getReg());
36609 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36610 .addReg(MI.getOperand(1).getReg());
36611 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
36612 .addReg(MI.getOperand(2).getReg());
36613 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
36614 MI.eraseFromParent();
36615 } else {
36616 if (!BB->isLiveIn(BasePtr)) {
36617 BB->addLiveIn(BasePtr);
36618 }
36619 // Parameters can be copied into ECX and EAX but not EBX yet.
36620 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36621 .addReg(MI.getOperand(0).getReg());
36622 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36623 .addReg(MI.getOperand(1).getReg());
36624 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
36625 // Save RBX into a virtual register.
36626 Register SaveRBX =
36627 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36628 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36629 .addReg(X86::RBX);
36630 // Generate mwaitx pseudo.
36631 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36632 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
36633 .addDef(Dst) // Destination tied in with SaveRBX.
36634 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
36635 .addUse(SaveRBX); // Save of base pointer.
36636 MI.eraseFromParent();
36637 }
36638 return BB;
36639 }
36640 case TargetOpcode::PREALLOCATED_SETUP: {
36641 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
36642 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36643 MFI->setHasPreallocatedCall(true);
36644 int64_t PreallocatedId = MI.getOperand(0).getImm();
36645 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
36646 assert(StackAdjustment != 0 && "0 stack adjustment");
36647 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
36648 << StackAdjustment << "\n");
36649 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
36650 .addReg(X86::ESP)
36651 .addImm(StackAdjustment);
36652 MI.eraseFromParent();
36653 return BB;
36654 }
36655 case TargetOpcode::PREALLOCATED_ARG: {
36656 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
36657 int64_t PreallocatedId = MI.getOperand(1).getImm();
36658 int64_t ArgIdx = MI.getOperand(2).getImm();
36659 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36660 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
36661 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
36662 << ", arg offset " << ArgOffset << "\n");
36663 // stack pointer + offset
36664 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
36665 MI.getOperand(0).getReg()),
36666 X86::ESP, false, ArgOffset);
36667 MI.eraseFromParent();
36668 return BB;
36669 }
36670 case X86::PTDPBSSD:
36671 case X86::PTDPBSUD:
36672 case X86::PTDPBUSD:
36673 case X86::PTDPBUUD:
36674 case X86::PTDPBF16PS:
36675 case X86::PTDPFP16PS: {
36676 unsigned Opc;
36677 switch (MI.getOpcode()) {
36678 // clang-format off
36679 default: llvm_unreachable("illegal opcode!");
36680 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
36681 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
36682 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
36683 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
36684 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
36685 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
36686 // clang-format on
36687 }
36688
36689 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36690 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36691 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36692 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36693 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36694
36695 MI.eraseFromParent(); // The pseudo is gone now.
36696 return BB;
36697 }
36698 case X86::PTILEZERO: {
36699 unsigned Imm = MI.getOperand(0).getImm();
36700 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
36701 MI.eraseFromParent(); // The pseudo is gone now.
36702 return BB;
36703 }
36704 case X86::PTILELOADD:
36705 case X86::PTILELOADDT1:
36706 case X86::PTILESTORED: {
36707 unsigned Opc;
36708 switch (MI.getOpcode()) {
36709 default: llvm_unreachable("illegal opcode!");
36710#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
36711 case X86::PTILELOADD:
36712 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
36713 break;
36714 case X86::PTILELOADDT1:
36715 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
36716 break;
36717 case X86::PTILESTORED:
36718 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
36719 break;
36720#undef GET_EGPR_IF_ENABLED
36721 }
36722
36723 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36724 unsigned CurOp = 0;
36725 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
36726 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36727 RegState::Define);
36728
36729 MIB.add(MI.getOperand(CurOp++)); // base
36730 MIB.add(MI.getOperand(CurOp++)); // scale
36731 MIB.add(MI.getOperand(CurOp++)); // index -- stride
36732 MIB.add(MI.getOperand(CurOp++)); // displacement
36733 MIB.add(MI.getOperand(CurOp++)); // segment
36734
36735 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
36736 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36737 RegState::Undef);
36738
36739 MI.eraseFromParent(); // The pseudo is gone now.
36740 return BB;
36741 }
36742 case X86::PTCMMIMFP16PS:
36743 case X86::PTCMMRLFP16PS: {
36744 const MIMetadata MIMD(MI);
36745 unsigned Opc;
36746 switch (MI.getOpcode()) {
36747 // clang-format off
36748 default: llvm_unreachable("Unexpected instruction!");
36749 case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
36750 case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
36751 // clang-format on
36752 }
36753 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36754 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36755 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36756 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36757 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36758 MI.eraseFromParent(); // The pseudo is gone now.
36759 return BB;
36760 }
36761 }
36762}
36763
36764//===----------------------------------------------------------------------===//
36765// X86 Optimization Hooks
36766//===----------------------------------------------------------------------===//
36767
36768bool
36769X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
36770 const APInt &DemandedBits,
36771 const APInt &DemandedElts,
36772 TargetLoweringOpt &TLO) const {
36773 EVT VT = Op.getValueType();
36774 unsigned Opcode = Op.getOpcode();
36775 unsigned EltSize = VT.getScalarSizeInBits();
36776
36777 if (VT.isVector()) {
36778 // If the constant is only all sign bits in the active bits, then we should
36779 // extend it to the entire constant to allow it to act as a boolean constant
36780 // vector.
36781 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
36782 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
36783 return false;
36784 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
36785 if (!DemandedElts[i] || V.getOperand(i).isUndef())
36786 continue;
36787 const APInt &Val = V.getConstantOperandAPInt(i);
36788 if (Val.getBitWidth() > Val.getNumSignBits() &&
36789 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
36790 return true;
36791 }
36792 return false;
36793 };
36794 // For vectors - if we have a constant, then try to sign extend.
36795 // TODO: Handle AND cases.
36796 unsigned ActiveBits = DemandedBits.getActiveBits();
36797 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
36798 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
36799 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
36800 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
36801 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
36802 VT.getVectorNumElements());
36803 SDValue NewC =
36804 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
36805 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
36806 SDValue NewOp =
36807 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
36808 return TLO.CombineTo(Op, NewOp);
36809 }
36810 return false;
36811 }
36812
36813 // Only optimize Ands to prevent shrinking a constant that could be
36814 // matched by movzx.
36815 if (Opcode != ISD::AND)
36816 return false;
36817
36818 // Make sure the RHS really is a constant.
36819 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
36820 if (!C)
36821 return false;
36822
36823 const APInt &Mask = C->getAPIntValue();
36824
36825 // Clear all non-demanded bits initially.
36826 APInt ShrunkMask = Mask & DemandedBits;
36827
36828 // Find the width of the shrunk mask.
36829 unsigned Width = ShrunkMask.getActiveBits();
36830
36831 // If the mask is all 0s there's nothing to do here.
36832 if (Width == 0)
36833 return false;
36834
36835 // Find the next power of 2 width, rounding up to a byte.
36836 Width = llvm::bit_ceil(std::max(Width, 8U));
36837 // Truncate the width to size to handle illegal types.
36838 Width = std::min(Width, EltSize);
36839
36840 // Calculate a possible zero extend mask for this constant.
36841 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
36842
36843 // If we aren't changing the mask, just return true to keep it and prevent
36844 // the caller from optimizing.
36845 if (ZeroExtendMask == Mask)
36846 return true;
36847
36848 // Make sure the new mask can be represented by a combination of mask bits
36849 // and non-demanded bits.
36850 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
36851 return false;
36852
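 // Worked example (32-bit element): Mask = 0x1FF with DemandedBits = 0xFF
 // gives ShrunkMask = 0xFF, Width = 8 and ZeroExtendMask = 0xFF. 0xFF differs
 // from Mask and is a subset of Mask | ~DemandedBits, so the AND constant can
 // be replaced by 0xFF and later matched as a movzx.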
36853 // Replace the constant with the zero extend mask.
36854 SDLoc DL(Op);
36855 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
36856 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
36857 return TLO.CombineTo(Op, NewOp);
36858}
36859
36860static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
36861 KnownBits &Known,
36862 const APInt &DemandedElts,
36863 const SelectionDAG &DAG, unsigned Depth) {
36864 KnownBits Known2;
36865 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
36866 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
36867 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
36868 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
36869 Known = KnownBits::abdu(Known, Known2).zext(16);
36870 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
36871 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
36872 Known, Known);
36873 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
36874 Known, Known);
36875 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
36876 Known, Known);
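 // Each computeForAddSub call above models one level of that reduction tree,
 // so after three levels the known bits cover the sum of all eight absolute
 // differences feeding a single i64 result element.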
36877 Known = Known.zext(64);
36878}
36879
36880void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
36881 KnownBits &Known,
36882 const APInt &DemandedElts,
36883 const SelectionDAG &DAG,
36884 unsigned Depth) const {
36885 unsigned BitWidth = Known.getBitWidth();
36886 unsigned NumElts = DemandedElts.getBitWidth();
36887 unsigned Opc = Op.getOpcode();
36888 EVT VT = Op.getValueType();
36889 assert((Opc >= ISD::BUILTIN_OP_END ||
36890 Opc == ISD::INTRINSIC_WO_CHAIN ||
36891 Opc == ISD::INTRINSIC_W_CHAIN ||
36892 Opc == ISD::INTRINSIC_VOID) &&
36893 "Should use MaskedValueIsZero if you don't know whether Op"
36894 " is a target node!");
36895
36896 Known.resetAll();
36897 switch (Opc) {
36898 default: break;
36899 case X86ISD::MUL_IMM: {
36900 KnownBits Known2;
36901 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36902 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36903 Known = KnownBits::mul(Known, Known2);
36904 break;
36905 }
36906 case X86ISD::SETCC:
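 // X86ISD::SETCC produces a boolean 0/1 in an i8 register, so every bit above
 // bit 0 is known to be zero.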
36907 Known.Zero.setBitsFrom(1);
36908 break;
36909 case X86ISD::MOVMSK: {
36910 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
36911 Known.Zero.setBitsFrom(NumLoBits);
36912 break;
36913 }
36914 case X86ISD::PEXTRB:
36915 case X86ISD::PEXTRW: {
36916 SDValue Src = Op.getOperand(0);
36917 EVT SrcVT = Src.getValueType();
36918 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
36919 Op.getConstantOperandVal(1));
36920 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
36921 Known = Known.anyextOrTrunc(BitWidth);
36922 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
36923 break;
36924 }
36925 case X86ISD::VSRAI:
36926 case X86ISD::VSHLI:
36927 case X86ISD::VSRLI: {
36928 unsigned ShAmt = Op.getConstantOperandVal(1);
36929 if (ShAmt >= VT.getScalarSizeInBits()) {
36930 // Out of range logical bit shifts are guaranteed to be zero.
36931 // Out of range arithmetic bit shifts splat the sign bit.
36932 if (Opc != X86ISD::VSRAI) {
36933 Known.setAllZero();
36934 break;
36935 }
36936
36937 ShAmt = VT.getScalarSizeInBits() - 1;
36938 }
36939
36940 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36941 if (Opc == X86ISD::VSHLI) {
36942 Known.Zero <<= ShAmt;
36943 Known.One <<= ShAmt;
36944 // Low bits are known zero.
36945 Known.Zero.setLowBits(ShAmt);
36946 } else if (Opc == X86ISD::VSRLI) {
36947 Known.Zero.lshrInPlace(ShAmt);
36948 Known.One.lshrInPlace(ShAmt);
36949 // High bits are known zero.
36950 Known.Zero.setHighBits(ShAmt);
36951 } else {
36952 Known.Zero.ashrInPlace(ShAmt);
36953 Known.One.ashrInPlace(ShAmt);
36954 }
36955 break;
36956 }
36957 case X86ISD::PACKUS: {
36958 // PACKUS is just a truncation if the upper half is zero.
36959 APInt DemandedLHS, DemandedRHS;
36960 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
36961
36962 Known.One = APInt::getAllOnes(BitWidth * 2);
36963 Known.Zero = APInt::getAllOnes(BitWidth * 2);
36964
36965 KnownBits Known2;
36966 if (!!DemandedLHS) {
36967 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
36968 Known = Known.intersectWith(Known2);
36969 }
36970 if (!!DemandedRHS) {
36971 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
36972 Known = Known.intersectWith(Known2);
36973 }
36974
36975 if (Known.countMinLeadingZeros() < BitWidth)
36976 Known.resetAll();
36977 Known = Known.trunc(BitWidth);
36978 break;
36979 }
36980 case X86ISD::PSHUFB: {
36981 SDValue Src = Op.getOperand(0);
36982 SDValue Idx = Op.getOperand(1);
36983
36984 // If the index vector is never negative (MSB is zero), then all elements
36985 // come from the source vector. This is useful for cases where
36986 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
36987 // below will handle the more common constant shuffle mask case.
36988 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
36989 if (KnownIdx.isNonNegative())
36990 Known = DAG.computeKnownBits(Src, Depth + 1);
36991 break;
36992 }
36993 case X86ISD::VBROADCAST: {
36994 SDValue Src = Op.getOperand(0);
36995 if (!Src.getSimpleValueType().isVector()) {
36996 Known = DAG.computeKnownBits(Src, Depth + 1);
36997 return;
36998 }
36999 break;
37000 }
37001 case X86ISD::AND: {
37002 if (Op.getResNo() == 0) {
37003 KnownBits Known2;
37004 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37005 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37006 Known &= Known2;
37007 }
37008 break;
37009 }
37010 case X86ISD::ANDNP: {
37011 KnownBits Known2;
37012 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37013 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37014
37015 // ANDNP = (~X & Y);
37016 Known.One &= Known2.Zero;
37017 Known.Zero |= Known2.One;
37018 break;
37019 }
37020 case X86ISD::FOR: {
37021 KnownBits Known2;
37022 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37023 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37024
37025 Known |= Known2;
37026 break;
37027 }
37028 case X86ISD::PSADBW: {
37029 SDValue LHS = Op.getOperand(0);
37030 SDValue RHS = Op.getOperand(1);
37031 assert(VT.getScalarType() == MVT::i64 &&
37032 LHS.getValueType() == RHS.getValueType() &&
37033 LHS.getValueType().getScalarType() == MVT::i8 &&
37034 "Unexpected PSADBW types");
37035 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37036 break;
37037 }
37038 case X86ISD::PCMPGT:
37039 case X86ISD::PCMPEQ: {
37040 KnownBits KnownLhs =
37041 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37042 KnownBits KnownRhs =
37043 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37044 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
37045 ? KnownBits::eq(KnownLhs, KnownRhs)
37046 : KnownBits::sgt(KnownLhs, KnownRhs);
37047 if (Res) {
37048 if (*Res)
37049 Known.setAllOnes();
37050 else
37051 Known.setAllZero();
37052 }
37053 break;
37054 }
37055 case X86ISD::PMULUDQ: {
37056 KnownBits Known2;
37057 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37058 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37059
37060 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
37061 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
37062 Known = KnownBits::mul(Known, Known2);
37063 break;
37064 }
37065 case X86ISD::CMOV: {
37066 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
37067 // If we don't know any bits, early out.
37068 if (Known.isUnknown())
37069 break;
37070 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
37071
37072 // Only known if known in both the LHS and RHS.
37073 Known = Known.intersectWith(Known2);
37074 break;
37075 }
37076 case X86ISD::BEXTR:
37077 case X86ISD::BEXTRI: {
37078 SDValue Op0 = Op.getOperand(0);
37079 SDValue Op1 = Op.getOperand(1);
37080
37081 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
37082 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
37083 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
37084
37085 // If the length is 0, the result is 0.
37086 if (Length == 0) {
37087 Known.setAllZero();
37088 break;
37089 }
37090
37091 if ((Shift + Length) <= BitWidth) {
37092 Known = DAG.computeKnownBits(Op0, Depth + 1);
37093 Known = Known.extractBits(Length, Shift);
37094 Known = Known.zextOrTrunc(BitWidth);
37095 }
37096 }
37097 break;
37098 }
37099 case X86ISD::PDEP: {
37100 KnownBits Known2;
37101 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37102 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37103 // Zeros are retained from the mask operand, but not ones.
37104 Known.One.clearAllBits();
37105 // The result will have at least as many trailing zeros as the non-mask
37106 // operand since bits can only map to the same or higher bit position.
37107 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
37108 break;
37109 }
37110 case X86ISD::PEXT: {
37111 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37112 // The result has as many leading zeros as the number of zeroes in the mask.
37113 unsigned Count = Known.Zero.popcount();
37114 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
37115 Known.One.clearAllBits();
37116 break;
37117 }
37118 case X86ISD::VTRUNC:
37119 case X86ISD::VTRUNCS:
37120 case X86ISD::VTRUNCUS:
37121 case X86ISD::CVTSI2P:
37122 case X86ISD::CVTUI2P:
37123 case X86ISD::CVTP2SI:
37124 case X86ISD::CVTP2UI:
37125 case X86ISD::MCVTP2SI:
37126 case X86ISD::MCVTP2UI:
37127 case X86ISD::CVTTP2SI:
37128 case X86ISD::CVTTP2UI:
37129 case X86ISD::MCVTTP2SI:
37130 case X86ISD::MCVTTP2UI:
37131 case X86ISD::MCVTSI2P:
37132 case X86ISD::MCVTUI2P:
37133 case X86ISD::VFPROUND:
37134 case X86ISD::VMFPROUND:
37135 case X86ISD::CVTPS2PH:
37136 case X86ISD::MCVTPS2PH: {
37137 // Truncations/Conversions - upper elements are known zero.
37138 EVT SrcVT = Op.getOperand(0).getValueType();
37139 if (SrcVT.isVector()) {
37140 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37141 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
37142 Known.setAllZero();
37143 }
37144 break;
37145 }
37146 case X86ISD::STRICT_CVTTP2SI:
37147 case X86ISD::STRICT_CVTTP2UI:
37148 case X86ISD::STRICT_CVTSI2P:
37149 case X86ISD::STRICT_CVTUI2P:
37150 case X86ISD::STRICT_VFPROUND:
37151 case X86ISD::STRICT_CVTPS2PH: {
37152 // Strict Conversions - upper elements are known zero.
37153 EVT SrcVT = Op.getOperand(1).getValueType();
37154 if (SrcVT.isVector()) {
37155 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37156 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
37157 Known.setAllZero();
37158 }
37159 break;
37160 }
37161 case X86ISD::MOVQ2DQ: {
37162 // Move from MMX to XMM. Upper half of XMM should be 0.
37163 if (DemandedElts.countr_zero() >= (NumElts / 2))
37164 Known.setAllZero();
37165 break;
37166 }
37168 APInt UndefElts;
37169 SmallVector<APInt, 16> EltBits;
37170 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
37171 /*AllowWholeUndefs*/ false,
37172 /*AllowPartialUndefs*/ false)) {
37173 Known.Zero.setAllBits();
37174 Known.One.setAllBits();
37175 for (unsigned I = 0; I != NumElts; ++I) {
37176 if (!DemandedElts[I])
37177 continue;
37178 if (UndefElts[I]) {
37179 Known.resetAll();
37180 break;
37181 }
37182 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
37183 Known = Known.intersectWith(Known2);
37184 }
37185 return;
37186 }
37187 break;
37188 }
37189 case ISD::INTRINSIC_WO_CHAIN: {
37190 switch (Op->getConstantOperandVal(0)) {
37191 case Intrinsic::x86_sse2_psad_bw:
37192 case Intrinsic::x86_avx2_psad_bw:
37193 case Intrinsic::x86_avx512_psad_bw_512: {
37194 SDValue LHS = Op.getOperand(1);
37195 SDValue RHS = Op.getOperand(2);
37196 assert(VT.getScalarType() == MVT::i64 &&
37197 LHS.getValueType() == RHS.getValueType() &&
37198 LHS.getValueType().getScalarType() == MVT::i8 &&
37199 "Unexpected PSADBW types");
37200 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37201 break;
37202 }
37203 }
37204 break;
37205 }
37206 }
37207
37208 // Handle target shuffles.
37209 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
37210 if (isTargetShuffle(Opc)) {
37211 SmallVector<int, 64> Mask;
37212 SmallVector<SDValue, 2> Ops;
37213 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
37214 unsigned NumOps = Ops.size();
37215 unsigned NumElts = VT.getVectorNumElements();
37216 if (Mask.size() == NumElts) {
37217 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
37218 Known.Zero.setAllBits(); Known.One.setAllBits();
37219 for (unsigned i = 0; i != NumElts; ++i) {
37220 if (!DemandedElts[i])
37221 continue;
37222 int M = Mask[i];
37223 if (M == SM_SentinelUndef) {
37224 // For UNDEF elements, we don't know anything about the common state
37225 // of the shuffle result.
37226 Known.resetAll();
37227 break;
37228 }
37229 if (M == SM_SentinelZero) {
37230 Known.One.clearAllBits();
37231 continue;
37232 }
37233 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
37234 "Shuffle index out of range");
37235
37236 unsigned OpIdx = (unsigned)M / NumElts;
37237 unsigned EltIdx = (unsigned)M % NumElts;
37238 if (Ops[OpIdx].getValueType() != VT) {
37239 // TODO - handle target shuffle ops with different value types.
37240 Known.resetAll();
37241 break;
37242 }
37243 DemandedOps[OpIdx].setBit(EltIdx);
37244 }
37245 // Known bits are the values that are shared by every demanded element.
37246 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
37247 if (!DemandedOps[i])
37248 continue;
37249 KnownBits Known2 =
37250 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
37251 Known = Known.intersectWith(Known2);
37252 }
37253 }
37254 }
37255 }
37256}
37257
37258unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
37259 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
37260 unsigned Depth) const {
37261 EVT VT = Op.getValueType();
37262 unsigned VTBits = VT.getScalarSizeInBits();
37263 unsigned Opcode = Op.getOpcode();
37264 switch (Opcode) {
37265 case X86ISD::SETCC_CARRY:
37266 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
37267 return VTBits;
37268
37269 case X86ISD::VTRUNC: {
37270 SDValue Src = Op.getOperand(0);
37271 MVT SrcVT = Src.getSimpleValueType();
37272 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
37273 assert(VTBits < NumSrcBits && "Illegal truncation input type");
37274 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
37275 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
37276 if (Tmp > (NumSrcBits - VTBits))
37277 return Tmp - (NumSrcBits - VTBits);
37278 return 1;
37279 }
37280
37281 case X86ISD::PACKSS: {
37282 // PACKSS is just a truncation if the sign bits extend to the packed size.
37283 APInt DemandedLHS, DemandedRHS;
37284 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
37285 DemandedRHS);
37286
37287 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
37288 // patterns often used to compact vXi64 allsignbit patterns.
37289 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
37290 SDValue BC = peekThroughBitcasts(V);
37291 if (BC.getOpcode() == X86ISD::PACKSS &&
37292 BC.getScalarValueSizeInBits() == 16 &&
37293 V.getScalarValueSizeInBits() == 32) {
37294 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
37295 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
37296 if (BC0.getScalarValueSizeInBits() == 64 &&
37297 BC1.getScalarValueSizeInBits() == 64 &&
37298 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
37299 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
37300 return 32;
37301 }
37302 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
37303 };
37304
37305 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
37306 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
37307 if (!!DemandedLHS)
37308 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
37309 if (!!DemandedRHS)
37310 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
37311 unsigned Tmp = std::min(Tmp0, Tmp1);
37312 if (Tmp > (SrcBits - VTBits))
37313 return Tmp - (SrcBits - VTBits);
37314 return 1;
37315 }
37316
37317 case X86ISD::VBROADCAST: {
37318 SDValue Src = Op.getOperand(0);
37319 if (!Src.getSimpleValueType().isVector())
37320 return DAG.ComputeNumSignBits(Src, Depth + 1);
37321 break;
37322 }
37323
37324 case X86ISD::VSHLI: {
37325 SDValue Src = Op.getOperand(0);
37326 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
37327 if (ShiftVal.uge(VTBits))
37328 return VTBits; // Shifted all bits out --> zero.
37329 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
37330 if (ShiftVal.uge(Tmp))
37331 return 1; // Shifted all sign bits out --> unknown.
37332 return Tmp - ShiftVal.getZExtValue();
37333 }
37334
37335 case X86ISD::VSRAI: {
37336 SDValue Src = Op.getOperand(0);
37337 APInt ShiftVal = Op.getConstantOperandAPInt(1);
37338 if (ShiftVal.uge(VTBits - 1))
37339 return VTBits; // Sign splat.
37340 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
37341 ShiftVal += Tmp;
37342 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
37343 }
37344
37345 case X86ISD::FSETCC:
37346 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
37347 if (VT == MVT::f32 || VT == MVT::f64 ||
37348 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
37349 return VTBits;
37350 break;
37351
37352 case X86ISD::PCMPGT:
37353 case X86ISD::PCMPEQ:
37354 case X86ISD::CMPP:
37355 case X86ISD::VPCOM:
37356 case X86ISD::VPCOMU:
37357 // Vector compares return zero/all-bits result values.
37358 return VTBits;
37359
37360 case X86ISD::ANDNP: {
37361 unsigned Tmp0 =
37362 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
37363 if (Tmp0 == 1) return 1; // Early out.
37364 unsigned Tmp1 =
37365 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
37366 return std::min(Tmp0, Tmp1);
37367 }
37368
37369 case X86ISD::CMOV: {
37370 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
37371 if (Tmp0 == 1) return 1; // Early out.
37372 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
37373 return std::min(Tmp0, Tmp1);
37374 }
37375 }
37376
37377 // Handle target shuffles.
37378 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
37379 if (isTargetShuffle(Opcode)) {
37380 SmallVector<int, 64> Mask;
37381 SmallVector<SDValue, 2> Ops;
37382 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
37383 unsigned NumOps = Ops.size();
37384 unsigned NumElts = VT.getVectorNumElements();
37385 if (Mask.size() == NumElts) {
37386 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
37387 for (unsigned i = 0; i != NumElts; ++i) {
37388 if (!DemandedElts[i])
37389 continue;
37390 int M = Mask[i];
37391 if (M == SM_SentinelUndef) {
37392 // For UNDEF elements, we don't know anything about the common state
37393 // of the shuffle result.
37394 return 1;
37395 } else if (M == SM_SentinelZero) {
37396 // Zero = all sign bits.
37397 continue;
37398 }
37399 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
37400 "Shuffle index out of range");
37401
37402 unsigned OpIdx = (unsigned)M / NumElts;
37403 unsigned EltIdx = (unsigned)M % NumElts;
37404 if (Ops[OpIdx].getValueType() != VT) {
37405 // TODO - handle target shuffle ops with different value types.
37406 return 1;
37407 }
37408 DemandedOps[OpIdx].setBit(EltIdx);
37409 }
37410 unsigned Tmp0 = VTBits;
37411 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
37412 if (!DemandedOps[i])
37413 continue;
37414 unsigned Tmp1 =
37415 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
37416 Tmp0 = std::min(Tmp0, Tmp1);
37417 }
37418 return Tmp0;
37419 }
37420 }
37421 }
37422
37423 // Fallback case.
37424 return 1;
37425}
37426
37427SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
37428 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
37429 return N->getOperand(0);
37430 return N;
37431}
37432
37433// Helper to look for a normal load that can be narrowed into a vzload with the
37434// specified VT and memory VT. Returns SDValue() on failure.
37435static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
37436 SelectionDAG &DAG) {
37437 // Can't if the load is volatile or atomic.
37438 if (!LN->isSimple())
37439 return SDValue();
37440
37441 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37442 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37443 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
37444 LN->getPointerInfo(), LN->getOriginalAlign(),
37445 LN->getMemOperand()->getFlags());
37446}
37447
37448// Attempt to match a combined shuffle mask against supported unary shuffle
37449// instructions.
37450// TODO: Investigate sharing more of this with shuffle lowering.
37451static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
37452 bool AllowFloatDomain, bool AllowIntDomain,
37453 SDValue V1, const SelectionDAG &DAG,
37454 const X86Subtarget &Subtarget, unsigned &Shuffle,
37455 MVT &SrcVT, MVT &DstVT) {
37456 unsigned NumMaskElts = Mask.size();
37457 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
37458
37459 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
37460 if (Mask[0] == 0 &&
37461 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
37462 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
37463 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37464 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
37465 Shuffle = X86ISD::VZEXT_MOVL;
37466 if (MaskEltSize == 16)
37467 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
37468 else
37469 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
37470 return true;
37471 }
37472 }
37473
37474 // Match against an ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
37475 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
37476 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
37477 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
37478 unsigned MaxScale = 64 / MaskEltSize;
37479 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
37480 DAG.ComputeNumSignBits(V1) == MaskEltSize;
37481 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
37482 bool MatchAny = true;
37483 bool MatchZero = true;
37484 bool MatchSign = UseSign;
37485 unsigned NumDstElts = NumMaskElts / Scale;
37486 for (unsigned i = 0;
37487 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
37488 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
37489 MatchAny = MatchSign = MatchZero = false;
37490 break;
37491 }
37492 unsigned Pos = (i * Scale) + 1;
37493 unsigned Len = Scale - 1;
37494 MatchAny &= isUndefInRange(Mask, Pos, Len);
37495 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
37496 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
37497 }
37498 if (MatchAny || MatchSign || MatchZero) {
37499 assert((MatchSign || MatchZero) &&
37500 "Failed to match sext/zext but matched aext?");
37501 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
37502 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
37503 : MVT::getIntegerVT(MaskEltSize);
37504 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
37505
37506 Shuffle = unsigned(
37507 MatchAny ? ISD::ANY_EXTEND
37508 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
37509 if (SrcVT.getVectorNumElements() != NumDstElts)
37510 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
37511
37512 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
37513 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
37514 return true;
37515 }
37516 }
37517 }
37518
37519 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
37520 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
37521 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
37522 isUndefOrEqual(Mask[0], 0) &&
37523 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
37524 Shuffle = X86ISD::VZEXT_MOVL;
37525 if (MaskEltSize == 16)
37526 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
37527 else
37528 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
37529 return true;
37530 }
37531
37532 // Check if we have SSE3 which will let us use MOVDDUP etc. The
37533 // instructions are no slower than UNPCKLPD but have the option to
37534 // fold the input operand into even an unaligned memory load.
37535 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
37536 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
37537 Shuffle = X86ISD::MOVDDUP;
37538 SrcVT = DstVT = MVT::v2f64;
37539 return true;
37540 }
37541 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37542 Shuffle = X86ISD::MOVSLDUP;
37543 SrcVT = DstVT = MVT::v4f32;
37544 return true;
37545 }
37546 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
37547 Shuffle = X86ISD::MOVSHDUP;
37548 SrcVT = DstVT = MVT::v4f32;
37549 return true;
37550 }
37551 }
37552
37553 if (MaskVT.is256BitVector() && AllowFloatDomain) {
37554 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
37555 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37556 Shuffle = X86ISD::MOVDDUP;
37557 SrcVT = DstVT = MVT::v4f64;
37558 return true;
37559 }
37560 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37561 V1)) {
37562 Shuffle = X86ISD::MOVSLDUP;
37563 SrcVT = DstVT = MVT::v8f32;
37564 return true;
37565 }
37566 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
37567 V1)) {
37568 Shuffle = X86ISD::MOVSHDUP;
37569 SrcVT = DstVT = MVT::v8f32;
37570 return true;
37571 }
37572 }
37573
37574 if (MaskVT.is512BitVector() && AllowFloatDomain) {
37575 assert(Subtarget.hasAVX512() &&
37576 "AVX512 required for 512-bit vector shuffles");
37577 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37578 V1)) {
37579 Shuffle = X86ISD::MOVDDUP;
37580 SrcVT = DstVT = MVT::v8f64;
37581 return true;
37582 }
37583 if (isTargetShuffleEquivalent(
37584 MaskVT, Mask,
37585 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
37586 Shuffle = X86ISD::MOVSLDUP;
37587 SrcVT = DstVT = MVT::v16f32;
37588 return true;
37589 }
37590 if (isTargetShuffleEquivalent(
37591 MaskVT, Mask,
37592 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
37593 Shuffle = X86ISD::MOVSHDUP;
37594 SrcVT = DstVT = MVT::v16f32;
37595 return true;
37596 }
37597 }
37598
37599 return false;
37600}
37601
37602// Attempt to match a combined shuffle mask against supported unary immediate
37603// permute instructions.
37604// TODO: Investigate sharing more of this with shuffle lowering.
37605static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
37606 const APInt &Zeroable,
37607 bool AllowFloatDomain, bool AllowIntDomain,
37608 const SelectionDAG &DAG,
37609 const X86Subtarget &Subtarget,
37610 unsigned &Shuffle, MVT &ShuffleVT,
37611 unsigned &PermuteImm) {
37612 unsigned NumMaskElts = Mask.size();
37613 unsigned InputSizeInBits = MaskVT.getSizeInBits();
37614 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
37615 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
37616 bool ContainsZeros = isAnyZero(Mask);
37617
37618 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
37619 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
37620 // Check for lane crossing permutes.
37621 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
37622 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
37623 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
37624 Shuffle = X86ISD::VPERMI;
37625 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
37626 PermuteImm = getV4X86ShuffleImm(Mask);
37627 return true;
37628 }
37629 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
37630 SmallVector<int, 4> RepeatedMask;
37631 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
37632 Shuffle = X86ISD::VPERMI;
37633 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
37634 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
37635 return true;
37636 }
37637 }
37638 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
37639 // VPERMILPD can permute with a non-repeating shuffle.
37640 Shuffle = X86ISD::VPERMILPI;
37641 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
37642 PermuteImm = 0;
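 // VPERMILPD consumes one immediate bit per element: bit i selects the low
 // (0) or high (1) f64 within the 128-bit lane that element i belongs to.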
37643 for (int i = 0, e = Mask.size(); i != e; ++i) {
37644 int M = Mask[i];
37645 if (M == SM_SentinelUndef)
37646 continue;
37647 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
37648 PermuteImm |= (M & 1) << i;
37649 }
37650 return true;
37651 }
37652 }
37653
37654 // We are checking for shuffle match or shift match. Loop twice so we can
37655 // order which we try and match first depending on target preference.
37656 for (unsigned Order = 0; Order < 2; ++Order) {
37657 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
37658 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
37659 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
37660 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
37661 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
37662 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
37663 SmallVector<int, 4> RepeatedMask;
37664 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
37665 // Narrow the repeated mask to create 32-bit element permutes.
37666 SmallVector<int, 4> WordMask = RepeatedMask;
37667 if (MaskScalarSizeInBits == 64)
37668 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
37669
37670 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
37671 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
37672 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
37673 PermuteImm = getV4X86ShuffleImm(WordMask);
37674 return true;
37675 }
37676 }
37677
37678 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
37679 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
37680 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37681 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37682 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
37683 SmallVector<int, 4> RepeatedMask;
37684 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
37685 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
37686 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
37687
37688 // PSHUFLW: permute lower 4 elements only.
37689 if (isUndefOrInRange(LoMask, 0, 4) &&
37690 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
37691 Shuffle = X86ISD::PSHUFLW;
37692 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
37693 PermuteImm = getV4X86ShuffleImm(LoMask);
37694 return true;
37695 }
37696
37697 // PSHUFHW: permute upper 4 elements only.
37698 if (isUndefOrInRange(HiMask, 4, 8) &&
37699 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
37700 // Offset the HiMask so that we can create the shuffle immediate.
37701 int OffsetHiMask[4];
37702 for (int i = 0; i != 4; ++i)
37703 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
37704
37705 Shuffle = X86ISD::PSHUFHW;
37706 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
37707 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
37708 return true;
37709 }
37710 }
37711 }
37712 } else {
37713 // Attempt to match against bit rotates.
37714 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
37715 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
37716 Subtarget.hasAVX512())) {
37717 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
37718 Subtarget, Mask);
37719 if (0 < RotateAmt) {
37720 Shuffle = X86ISD::VROTLI;
37721 PermuteImm = (unsigned)RotateAmt;
37722 return true;
37723 }
37724 }
37725 }
37726 // Attempt to match against byte/bit shifts.
37727 if (AllowIntDomain &&
37728 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37729 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37730 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37731 int ShiftAmt =
37732 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
37733 Zeroable, Subtarget);
37734 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
37735 32 <= ShuffleVT.getScalarSizeInBits())) {
37736 // Byte shifts can be slower so only match them on second attempt.
37737 if (Order == 0 &&
37738 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
37739 continue;
37740
37741 PermuteImm = (unsigned)ShiftAmt;
37742 return true;
37743 }
37744
37745 }
37746 }
37747
37748 return false;
37749}
37750
37751// Attempt to match a combined unary shuffle mask against supported binary
37752// shuffle instructions.
37753// TODO: Investigate sharing more of this with shuffle lowering.
37754static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
37755 bool AllowFloatDomain, bool AllowIntDomain,
37756 SDValue &V1, SDValue &V2, const SDLoc &DL,
37757 SelectionDAG &DAG, const X86Subtarget &Subtarget,
37758 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
37759 bool IsUnary) {
37760 unsigned NumMaskElts = Mask.size();
37761 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37762 unsigned SizeInBits = MaskVT.getSizeInBits();
37763
37764 if (MaskVT.is128BitVector()) {
37765 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
37766 AllowFloatDomain) {
37767 V2 = V1;
37768 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
37769 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
37770 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37771 return true;
37772 }
37773 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
37774 AllowFloatDomain) {
37775 V2 = V1;
37776 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
37777 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37778 return true;
37779 }
37780 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
37781 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
37782 std::swap(V1, V2);
37783 Shuffle = X86ISD::MOVSD;
37784 SrcVT = DstVT = MVT::v2f64;
37785 return true;
37786 }
37787 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
37788 (AllowFloatDomain || !Subtarget.hasSSE41())) {
37789 Shuffle = X86ISD::MOVSS;
37790 SrcVT = DstVT = MVT::v4f32;
37791 return true;
37792 }
37793 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
37794 DAG) &&
37795 Subtarget.hasFP16()) {
37796 Shuffle = X86ISD::MOVSH;
37797 SrcVT = DstVT = MVT::v8f16;
37798 return true;
37799 }
37800 }
37801
37802 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
37803 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
37804 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
37805 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
37806 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
37807 Subtarget)) {
37808 DstVT = MaskVT;
37809 return true;
37810 }
37811 }
37812 // TODO: Can we handle this inside matchShuffleWithPACK?
37813 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
37814 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
37815 V1.getScalarValueSizeInBits() == 64 &&
37816 V2.getScalarValueSizeInBits() == 64) {
37817 // Use (SSE41) PACKUSDW if the leading zero bits extend down to the lowest 16 bits.
37818 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
37819 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
37820 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
37821 SrcVT = MVT::v4i32;
37822 DstVT = MVT::v8i16;
37823 Shuffle = X86ISD::PACKUS;
37824 return true;
37825 }
37826 // Use PACKUSWB if the leading zero bits extend down to the lowest 8 bits.
37827 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
37828 SrcVT = MVT::v8i16;
37829 DstVT = MVT::v16i8;
37830 Shuffle = X86ISD::PACKUS;
37831 return true;
37832 }
37833 // Use PACKSSDW if the sign bits extend down to the lowest 16 bits.
37834 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
37835 SrcVT = MVT::v4i32;
37836 DstVT = MVT::v8i16;
37837 Shuffle = X86ISD::PACKSS;
37838 return true;
37839 }
37840 }
37841
37842 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
37843 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
37844 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37845 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
37846 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37847 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
37848 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
37849 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
37850 Subtarget)) {
37851 SrcVT = DstVT = MaskVT;
37852 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
37853 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
37854 return true;
37855 }
37856 }
37857
37858 // Attempt to match against an OR if we're performing a blend shuffle and the
37859 // non-blended source element is zero in each case.
37860 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
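// Illustrative example: for a v4i32 mask {0, 5, 2, 7}, elements 0 and 2 come
// from V1 and elements 1 and 3 from V2. If V2 is known zero in elements 0/2
// and V1 is known zero in elements 1/3, the blend is simply (V1 | V2).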
37861 if (SizeInBits == V1.getValueSizeInBits() &&
37862 SizeInBits == V2.getValueSizeInBits() &&
37863 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
37864 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
37865 bool IsBlend = true;
37866 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
37867 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
37868 unsigned Scale1 = NumV1Elts / NumMaskElts;
37869 unsigned Scale2 = NumV2Elts / NumMaskElts;
37870 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
37871 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
37872 for (unsigned i = 0; i != NumMaskElts; ++i) {
37873 int M = Mask[i];
37874 if (M == SM_SentinelUndef)
37875 continue;
37876 if (M == SM_SentinelZero) {
37877 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37878 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37879 continue;
37880 }
37881 if (M == (int)i) {
37882 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37883 continue;
37884 }
37885 if (M == (int)(i + NumMaskElts)) {
37886 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37887 continue;
37888 }
37889 IsBlend = false;
37890 break;
37891 }
37892 if (IsBlend) {
37893 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
37894 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
37895 Shuffle = ISD::OR;
37896 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37897 return true;
37898 }
37899 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
37900 // FIXME: handle mismatched sizes?
37901 // TODO: investigate if `ISD::OR` handling in
37902 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
37903 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
37904 unsigned NumElts = V.getValueType().getVectorNumElements();
37905 KnownBits Known(NumElts);
37906 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
37907 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
37908 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
37909 if (PeepholeKnown.isZero())
37910 Known.Zero.setBit(EltIdx);
37911 if (PeepholeKnown.isAllOnes())
37912 Known.One.setBit(EltIdx);
37913 }
37914 return Known;
37915 };
37916
37917 KnownBits V1Known = computeKnownBitsElementWise(V1);
37918 KnownBits V2Known = computeKnownBitsElementWise(V2);
37919
37920 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
37921 int M = Mask[i];
37922 if (M == SM_SentinelUndef)
37923 continue;
37924 if (M == SM_SentinelZero) {
37925 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
37926 continue;
37927 }
37928 if (M == (int)i) {
37929 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
37930 continue;
37931 }
37932 if (M == (int)(i + NumMaskElts)) {
37933 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
37934 continue;
37935 }
37936 llvm_unreachable("will not get here.");
37937 }
37938 if (IsBlend) {
37939 Shuffle = ISD::OR;
37940 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37941 return true;
37942 }
37943 }
37944 }
37945 }
37946
37947 return false;
37948}
37949
37950static bool matchBinaryPermuteShuffle(
37951 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
37952 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
37953 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
37954 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
37955 unsigned NumMaskElts = Mask.size();
37956 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37957
37958 // Attempt to match against VALIGND/VALIGNQ rotate.
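// Illustrative example: a v4i32 mask {1, 2, 3, 4} selects four consecutive
// elements from the concatenation of the two sources, so it can be encoded
// as VALIGND with an element rotation of 1.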
37959 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
37960 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
37961 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
37962 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37963 if (!isAnyZero(Mask)) {
37964 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
37965 if (0 < Rotation) {
37966 Shuffle = X86ISD::VALIGN;
37967 if (EltSizeInBits == 64)
37968 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
37969 else
37970 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
37971 PermuteImm = Rotation;
37972 return true;
37973 }
37974 }
37975 }
37976
37977 // Attempt to match against PALIGNR byte rotate.
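// Illustrative example: a v16i8 mask {1, 2, ..., 15, 16} is a contiguous
// 16-byte window into the 32-byte concatenation of the sources, matching
// PALIGNR with a byte rotation of 1.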
37978 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
37979 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37980 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
37981 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
37982 if (0 < ByteRotation) {
37983 Shuffle = X86ISD::PALIGNR;
37984 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
37985 PermuteImm = ByteRotation;
37986 return true;
37987 }
37988 }
37989
37990 // Attempt to combine to X86ISD::BLENDI.
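// Illustrative example: each immediate bit picks the corresponding element
// from the second source. In the repeated v16i16 path below, an 8-element
// repeated mask of {0, 9, 2, 11, 4, 13, 6, 15} sets bits 1, 3, 5 and 7, so
// PermuteImm becomes 0xAA.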
37991 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
37992 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
37993 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
37994 uint64_t BlendMask = 0;
37995 bool ForceV1Zero = false, ForceV2Zero = false;
37996 SmallVector<int, 8> TargetMask(Mask);
37997 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
37998 ForceV2Zero, BlendMask)) {
37999 if (MaskVT == MVT::v16i16) {
38000 // We can only use v16i16 PBLENDW if the lanes are repeated.
38001 SmallVector<int, 8> RepeatedMask;
38002 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
38003 RepeatedMask)) {
38004 assert(RepeatedMask.size() == 8 &&
38005 "Repeated mask size doesn't match!");
38006 PermuteImm = 0;
38007 for (int i = 0; i < 8; ++i)
38008 if (RepeatedMask[i] >= 8)
38009 PermuteImm |= 1 << i;
38010 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38011 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38012 Shuffle = X86ISD::BLENDI;
38013 ShuffleVT = MaskVT;
38014 return true;
38015 }
38016 } else {
38017 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38018 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38019 PermuteImm = (unsigned)BlendMask;
38020 Shuffle = X86ISD::BLENDI;
38021 ShuffleVT = MaskVT;
38022 return true;
38023 }
38024 }
38025 }
38026
38027 // Attempt to combine to INSERTPS, but only if it has elements that need to
38028 // be set to zero.
38029 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38030 MaskVT.is128BitVector() && isAnyZero(Mask) &&
38031 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38032 Shuffle = X86ISD::INSERTPS;
38033 ShuffleVT = MVT::v4f32;
38034 return true;
38035 }
38036
38037 // Attempt to combine to SHUFPD.
38038 if (AllowFloatDomain && EltSizeInBits == 64 &&
38039 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38040 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38041 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38042 bool ForceV1Zero = false, ForceV2Zero = false;
38043 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
38044 PermuteImm, Mask, Zeroable)) {
38045 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38046 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38047 Shuffle = X86ISD::SHUFP;
38048 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
38049 return true;
38050 }
38051 }
38052
38053 // Attempt to combine to SHUFPS.
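// Illustrative example: SHUFPS builds its low two result elements from the
// first operand and its high two from the second, each chosen by a 2-bit
// immediate field. A repeated mask of {0, 1, 4, 5} matches Lo = V1 and
// Hi = V2 with ShufMask {0, 1, 0, 1}, i.e. PermuteImm 0x44.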
38054 if (AllowFloatDomain && EltSizeInBits == 32 &&
38055 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
38056 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38057 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38058 SmallVector<int, 4> RepeatedMask;
38059 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
38060 // Match each half of the repeated mask to determine whether it just
38061 // references one of the vectors, is zeroable, or is entirely undef.
38062 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
38063 int M0 = RepeatedMask[Offset];
38064 int M1 = RepeatedMask[Offset + 1];
38065
38066 if (isUndefInRange(RepeatedMask, Offset, 2)) {
38067 return DAG.getUNDEF(MaskVT);
38068 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
38069 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
38070 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
38071 return getZeroVector(MaskVT, Subtarget, DAG, DL);
38072 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
38073 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38074 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38075 return V1;
38076 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
38077 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38078 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38079 return V2;
38080 }
38081
38082 return SDValue();
38083 };
38084
38085 int ShufMask[4] = {-1, -1, -1, -1};
38086 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
38087 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
38088
38089 if (Lo && Hi) {
38090 V1 = Lo;
38091 V2 = Hi;
38092 Shuffle = X86ISD::SHUFP;
38093 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
38094 PermuteImm = getV4X86ShuffleImm(ShufMask);
38095 return true;
38096 }
38097 }
38098 }
38099
38100 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
38101 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38102 MaskVT.is128BitVector() &&
38103 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38104 Shuffle = X86ISD::INSERTPS;
38105 ShuffleVT = MVT::v4f32;
38106 return true;
38107 }
38108
38109 return false;
38110}
38111
38112static SDValue combineX86ShuffleChainWithExtract(
38113 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38114 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38115 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38116 const X86Subtarget &Subtarget);
38117
38118/// Combine an arbitrary chain of shuffles into a single instruction if
38119/// possible.
38120///
38121/// This is the leaf of the recursive combine below. When we have found some
38122/// chain of single-use x86 shuffle instructions and accumulated the combined
38123/// shuffle mask represented by them, this will try to pattern match that mask
38124/// into either a single instruction if there is a special purpose instruction
38125/// for this operation, or into a PSHUFB instruction which is a fully general
38126/// instruction but should only be used to replace chains over a certain depth.
38127static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
38128 ArrayRef<int> BaseMask, int Depth,
38129 bool HasVariableMask,
38130 bool AllowVariableCrossLaneMask,
38131 bool AllowVariablePerLaneMask,
38132 SelectionDAG &DAG,
38133 const X86Subtarget &Subtarget) {
38134 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
38135 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
38136 "Unexpected number of shuffle inputs!");
38137
38138 SDLoc DL(Root);
38139 MVT RootVT = Root.getSimpleValueType();
38140 unsigned RootSizeInBits = RootVT.getSizeInBits();
38141 unsigned NumRootElts = RootVT.getVectorNumElements();
38142
38143 // Canonicalize shuffle input op to the requested type.
38144 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
38145 if (VT.getSizeInBits() > Op.getValueSizeInBits())
38146 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
38147 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
38148 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
38149 return DAG.getBitcast(VT, Op);
38150 };
38151
38152 // Find the inputs that enter the chain. Note that multiple uses are OK
38153 // here, we're not going to remove the operands we find.
38154 bool UnaryShuffle = (Inputs.size() == 1);
38155 SDValue V1 = peekThroughBitcasts(Inputs[0]);
38156 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
38157 : peekThroughBitcasts(Inputs[1]));
38158
38159 MVT VT1 = V1.getSimpleValueType();
38160 MVT VT2 = V2.getSimpleValueType();
38161 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
38162 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
38163
38164 SDValue Res;
38165
38166 unsigned NumBaseMaskElts = BaseMask.size();
38167 if (NumBaseMaskElts == 1) {
38168 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
38169 return CanonicalizeShuffleInput(RootVT, V1);
38170 }
38171
38172 bool OptForSize = DAG.shouldOptForSize();
38173 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
38174 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
38175 (RootVT.isFloatingPoint() && Depth >= 1) ||
38176 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
38177
38178 // Don't combine if we are an AVX512/EVEX target and the mask element size
38179 // is different from the root element size - this would prevent writemasks
38180 // from being reused.
38181 bool IsMaskedShuffle = false;
38182 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
38183 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
38184 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
38185 IsMaskedShuffle = true;
38186 }
38187 }
38188
38189 // If we are shuffling a splat (and not introducing zeros) then we can just
38190 // use it directly. This also works for smaller elements, as they already
38191 // repeat across each mask element.
38192 if (UnaryShuffle && !isAnyZero(BaseMask) &&
38193 V1.getValueSizeInBits() >= RootSizeInBits &&
38194 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
38195 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
38196 return CanonicalizeShuffleInput(RootVT, V1);
38197 }
38198
38199 SmallVector<int, 64> Mask(BaseMask);
38200
38201 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
38202 // etc. can be simplified.
38203 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
38204 SmallVector<int> ScaledMask, IdentityMask;
38205 unsigned NumElts = VT1.getVectorNumElements();
38206 if (Mask.size() <= NumElts &&
38207 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
38208 for (unsigned i = 0; i != NumElts; ++i)
38209 IdentityMask.push_back(i);
38210 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
38211 V2))
38212 return CanonicalizeShuffleInput(RootVT, V1);
38213 }
38214 }
38215
38216 // Handle 128/256-bit lane shuffles of 512-bit vectors.
38217 if (RootVT.is512BitVector() &&
38218 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
38219 // If the upper subvectors are zeroable, then an extract+insert is cheaper
38220 // than using X86ISD::SHUF128. The insertion is free, even if it has
38221 // to zero the upper subvectors.
38222 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
38223 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38224 return SDValue(); // Nothing to do!
38225 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
38226 "Unexpected lane shuffle");
38227 Res = CanonicalizeShuffleInput(RootVT, V1);
38228 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
38229 bool UseZero = isAnyZero(Mask);
38230 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
38231 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
38232 }
38233
38234 // Narrow shuffle mask to v4x128.
38235 SmallVector<int, 4> ScaledMask;
38236 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
38237 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
38238
38239 // Try to lower to vshuf64x2/vshuf32x4.
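// Illustrative example: a v4x128 mask {0, 1, 6, 7} takes lanes 0-1 from the
// first source and lanes 2-3 of the second, giving PermMask {0, 1, 2, 3}
// and an immediate of 0xE4.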
38240 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
38241 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
38242 SelectionDAG &DAG) {
38243 int PermMask[4] = {-1, -1, -1, -1};
38244 // Ensure elements came from the same Op.
38245 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
38246 for (int i = 0; i < 4; ++i) {
38247 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
38248 if (ScaledMask[i] < 0)
38249 continue;
38250
38251 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
38252 unsigned OpIndex = i / 2;
38253 if (Ops[OpIndex].isUndef())
38254 Ops[OpIndex] = Op;
38255 else if (Ops[OpIndex] != Op)
38256 return SDValue();
38257
38258 PermMask[i] = ScaledMask[i] % 4;
38259 }
38260
38261 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
38262 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
38263 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
38264 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
38265 };
38266
38267 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
38268 // doesn't work because our mask is for 128 bits and we don't have an MVT
38269 // to match that.
38270 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
38271 isUndefOrInRange(ScaledMask[1], 0, 2) &&
38272 isUndefOrInRange(ScaledMask[2], 2, 4) &&
38273 isUndefOrInRange(ScaledMask[3], 2, 4) &&
38274 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
38275 ScaledMask[0] == (ScaledMask[2] % 2)) &&
38276 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
38277 ScaledMask[1] == (ScaledMask[3] % 2));
38278
38279 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
38280 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
38281 return SDValue(); // Nothing to do!
38282 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
38283 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
38284 return DAG.getBitcast(RootVT, V);
38285 }
38286 }
38287
38288 // Handle 128-bit lane shuffles of 256-bit vectors.
38289 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
38290 // If the upper half is zeroable, then an extract+insert is cheaper than
38291 // using X86ISD::VPERM2X128. The insertion is free, even if it has to
38292 // zero the upper half.
38293 if (isUndefOrZero(Mask[1])) {
38294 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38295 return SDValue(); // Nothing to do!
38296 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
38297 Res = CanonicalizeShuffleInput(RootVT, V1);
38298 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
38299 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
38300 256);
38301 }
38302
38303 // If we're inserting the low subvector, an insert-subvector 'concat'
38304 // pattern is quicker than VPERM2X128.
38305 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
38306 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
38307 !Subtarget.hasAVX2()) {
38308 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38309 return SDValue(); // Nothing to do!
38310 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
38311 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
38312 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
38313 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
38314 }
38315
38316 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
38317 return SDValue(); // Nothing to do!
38318
38319 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
38320 // we need to use the zeroing feature.
38321 // Prefer blends for sequential shuffles unless we are optimizing for size.
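// Illustrative example of the immediate: bits [1:0] select the 128-bit half
// placed in the low result lane, bits [5:4] the half placed in the high
// lane, and 0x8 in either field zeroes that lane. A mask of
// {1, SM_SentinelZero} therefore encodes as 0x81: V1's upper half in the low
// lane and a zeroed upper lane.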
38322 if (UnaryShuffle &&
38323 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
38324 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
38325 unsigned PermMask = 0;
38326 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
38327 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
38328 return DAG.getNode(
38329 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
38330 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
38331 }
38332
38333 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
38334 return SDValue(); // Nothing to do!
38335
38336 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
38337 if (!UnaryShuffle && !IsMaskedShuffle) {
38338 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
38339 "Unexpected shuffle sentinel value");
38340 // Prefer blends to X86ISD::VPERM2X128.
38341 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
38342 unsigned PermMask = 0;
38343 PermMask |= ((Mask[0] & 3) << 0);
38344 PermMask |= ((Mask[1] & 3) << 4);
38345 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
38346 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
38347 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
38348 CanonicalizeShuffleInput(RootVT, LHS),
38349 CanonicalizeShuffleInput(RootVT, RHS),
38350 DAG.getTargetConstant(PermMask, DL, MVT::i8));
38351 }
38352 }
38353 }
38354
38355 // For masks that have been widened to 128-bit elements or more,
38356 // narrow back down to 64-bit elements.
38357 if (BaseMaskEltSizeInBits > 64) {
38358 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
38359 int MaskScale = BaseMaskEltSizeInBits / 64;
38360 SmallVector<int, 64> ScaledMask;
38361 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
38362 Mask = std::move(ScaledMask);
38363 }
38364
38365 // For masked shuffles, we're trying to match the root width for better
38366 // writemask folding, attempt to scale the mask.
38367 // TODO - variable shuffles might need this to be widened again.
38368 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
38369 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
38370 int MaskScale = NumRootElts / Mask.size();
38371 SmallVector<int, 64> ScaledMask;
38372 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
38373 Mask = std::move(ScaledMask);
38374 }
38375
38376 unsigned NumMaskElts = Mask.size();
38377 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
38378 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38379
38380 // Determine the effective mask value type.
38381 FloatDomain &= (32 <= MaskEltSizeInBits);
38382 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
38383 : MVT::getIntegerVT(MaskEltSizeInBits);
38384 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
38385
38386 // Only allow legal mask types.
38387 if (!TLI.isTypeLegal(MaskVT))
38388 return SDValue();
38389
38390 // Attempt to match the mask against known shuffle patterns.
38391 MVT ShuffleSrcVT, ShuffleVT;
38392 unsigned Shuffle, PermuteImm;
38393
38394 // Which shuffle domains are permitted?
38395 // Permit domain crossing at higher combine depths.
38396 // TODO: Should we indicate which domain is preferred if both are allowed?
38397 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
38398 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
38399 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
38400
38401 // Determine zeroable mask elements.
38402 APInt KnownUndef, KnownZero;
38403 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
38404 APInt Zeroable = KnownUndef | KnownZero;
38405
38406 if (UnaryShuffle) {
38407 // Attempt to match against broadcast-from-vector.
38408 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
38409 if ((Subtarget.hasAVX2() ||
38410 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
38411 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
38412 if (isUndefOrEqual(Mask, 0)) {
38413 if (V1.getValueType() == MaskVT &&
38414 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38415 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
38416 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
38417 return SDValue(); // Nothing to do!
38418 Res = V1.getOperand(0);
38419 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
38420 return DAG.getBitcast(RootVT, Res);
38421 }
38422 if (Subtarget.hasAVX2()) {
38423 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
38424 return SDValue(); // Nothing to do!
38425 Res = CanonicalizeShuffleInput(MaskVT, V1);
38426 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
38427 return DAG.getBitcast(RootVT, Res);
38428 }
38429 }
38430 }
38431
38432 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
38433 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
38434 (!IsMaskedShuffle ||
38435 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38436 if (Depth == 0 && Root.getOpcode() == Shuffle)
38437 return SDValue(); // Nothing to do!
38438 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38439 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
38440 return DAG.getBitcast(RootVT, Res);
38441 }
38442
38443 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
38444 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
38445 PermuteImm) &&
38446 (!IsMaskedShuffle ||
38447 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38448 if (Depth == 0 && Root.getOpcode() == Shuffle)
38449 return SDValue(); // Nothing to do!
38450 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
38451 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
38452 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38453 return DAG.getBitcast(RootVT, Res);
38454 }
38455 }
38456
38457 // Attempt to combine to INSERTPS, but only if the inserted element has come
38458 // from a scalar.
38459 // TODO: Handle other insertions here as well?
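// Illustrative note: in the INSERTPS immediate, bits [7:6] select the source
// element, bits [5:4] the destination slot, and bits [3:0] zero-mask result
// elements. The hard-coded (/*DstIdx*/ 2 << 4) below inserts source element 0
// into destination element 2 without zeroing.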
38460 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
38461 Subtarget.hasSSE41() &&
38462 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
38463 if (MaskEltSizeInBits == 32) {
38464 SDValue SrcV1 = V1, SrcV2 = V2;
38465 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
38466 DAG) &&
38467 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
38468 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
38469 return SDValue(); // Nothing to do!
38470 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
38471 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
38472 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
38473 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38474 return DAG.getBitcast(RootVT, Res);
38475 }
38476 }
38477 if (MaskEltSizeInBits == 64 &&
38478 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
38479 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38480 V2.getScalarValueSizeInBits() <= 32) {
38481 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
38482 return SDValue(); // Nothing to do!
38483 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
38484 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
38485 CanonicalizeShuffleInput(MVT::v4f32, V1),
38486 CanonicalizeShuffleInput(MVT::v4f32, V2),
38487 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38488 return DAG.getBitcast(RootVT, Res);
38489 }
38490 }
38491
38492 SDValue NewV1 = V1; // Save operands in case early exit happens.
38493 SDValue NewV2 = V2;
38494 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
38495 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
38496 ShuffleVT, UnaryShuffle) &&
38497 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38498 if (Depth == 0 && Root.getOpcode() == Shuffle)
38499 return SDValue(); // Nothing to do!
38500 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
38501 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
38502 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
38503 return DAG.getBitcast(RootVT, Res);
38504 }
38505
38506 NewV1 = V1; // Save operands in case early exit happens.
38507 NewV2 = V2;
38508 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
38509 AllowIntDomain, NewV1, NewV2, DL, DAG,
38510 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
38511 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38512 if (Depth == 0 && Root.getOpcode() == Shuffle)
38513 return SDValue(); // Nothing to do!
38514 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
38515 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
38516 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
38517 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38518 return DAG.getBitcast(RootVT, Res);
38519 }
38520
38521 // Typically from here on, we need an integer version of MaskVT.
38522 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
38523 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
38524
38525 // Annoyingly, SSE4A instructions don't map into the above match helpers.
38526 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
38527 uint64_t BitLen, BitIdx;
38528 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
38529 Zeroable)) {
38530 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
38531 return SDValue(); // Nothing to do!
38532 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38533 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
38534 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38535 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38536 return DAG.getBitcast(RootVT, Res);
38537 }
38538
38539 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
38540 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
38541 return SDValue(); // Nothing to do!
38542 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38543 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
38544 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
38545 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38546 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38547 return DAG.getBitcast(RootVT, Res);
38548 }
38549 }
38550
38551 // Match shuffle against TRUNCATE patterns.
38552 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
38553 // Match against a VTRUNC instruction, accounting for src/dst sizes.
38554 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
38555 Subtarget)) {
38556 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
38557 ShuffleSrcVT.getVectorNumElements();
38558 unsigned Opc =
38559 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
38560 if (Depth == 0 && Root.getOpcode() == Opc)
38561 return SDValue(); // Nothing to do!
38562 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38563 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
38564 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
38565 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
38566 return DAG.getBitcast(RootVT, Res);
38567 }
38568
38569 // Do we need a more general binary truncation pattern?
38570 if (RootSizeInBits < 512 &&
38571 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
38572 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
38573 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
38574 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
38575 // Bail if this was already a truncation or PACK node.
38576 // We sometimes fail to match PACK if we demand known undef elements.
38577 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
38578 Root.getOpcode() == X86ISD::PACKSS ||
38579 Root.getOpcode() == X86ISD::PACKUS))
38580 return SDValue(); // Nothing to do!
38581 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38582 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
38583 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38584 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
38585 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38586 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
38587 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
38588 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
38589 return DAG.getBitcast(RootVT, Res);
38590 }
38591 }
38592
38593 // Don't try to re-form single instruction chains under any circumstances now
38594 // that we've done encoding canonicalization for them.
38595 if (Depth < 1)
38596 return SDValue();
38597
38598 // Depth threshold above which we can efficiently use variable mask shuffles.
38599 int VariableCrossLaneShuffleDepth =
38600 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
38601 int VariablePerLaneShuffleDepth =
38602 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
38603 AllowVariableCrossLaneMask &=
38604 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
38605 AllowVariablePerLaneMask &=
38606 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
38607 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
38608 // higher depth before combining them.
38609 bool AllowBWIVPERMV3 =
38610 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
38611
38612 bool MaskContainsZeros = isAnyZero(Mask);
38613
38614 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
38615 // If we have a single input lane-crossing shuffle then lower to VPERMV.
38616 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
38617 if (Subtarget.hasAVX2() &&
38618 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
38619 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
38620 Res = CanonicalizeShuffleInput(MaskVT, V1);
38621 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
38622 return DAG.getBitcast(RootVT, Res);
38623 }
38624 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
38625 if ((Subtarget.hasAVX512() &&
38626 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38627 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
38628 (Subtarget.hasBWI() &&
38629 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38630 (Subtarget.hasVBMI() &&
38631 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
38632 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38633 V2 = DAG.getUNDEF(MaskVT);
38634 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38635 return DAG.getBitcast(RootVT, Res);
38636 }
38637 }
38638
38639 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
38640 // vector as the second source (non-VLX will pad to 512-bit shuffles).
38641 if (UnaryShuffle && AllowVariableCrossLaneMask &&
38642 ((Subtarget.hasAVX512() &&
38643 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38644 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
38645 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
38646 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
38647 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38648 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38649 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38650 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
38651 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
38652 for (unsigned i = 0; i != NumMaskElts; ++i)
38653 if (Mask[i] == SM_SentinelZero)
38654 Mask[i] = NumMaskElts + i;
38655 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38656 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
38657 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38658 return DAG.getBitcast(RootVT, Res);
38659 }
38660
38661 // If that failed and either input is extracted then try to combine as a
38662 // shuffle with the larger type.
38663 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
38664 Inputs, Root, BaseMask, Depth, HasVariableMask,
38665 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
38666 Subtarget))
38667 return WideShuffle;
38668
38669 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
38670 // (non-VLX will pad to 512-bit shuffles).
38671 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
38672 ((Subtarget.hasAVX512() &&
38673 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38674 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
38675 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
38676 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
38677 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38678 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38679 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38680 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
38681 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38682 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38683 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38684 return DAG.getBitcast(RootVT, Res);
38685 }
38686 return SDValue();
38687 }
38688
38689 // See if we can combine a single input shuffle with zeros to a bit-mask,
38690 // which is much simpler than any shuffle.
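// Illustrative example: a v4i32 mask {0, SM_SentinelZero, 2, SM_SentinelUndef}
// becomes an AND with the constant vector {-1, 0, -1, undef}.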
38691 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
38692 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
38693 TLI.isTypeLegal(MaskVT)) {
38694 APInt Zero = APInt::getZero(MaskEltSizeInBits);
38695 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
38696 APInt UndefElts(NumMaskElts, 0);
38697 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
38698 for (unsigned i = 0; i != NumMaskElts; ++i) {
38699 int M = Mask[i];
38700 if (M == SM_SentinelUndef) {
38701 UndefElts.setBit(i);
38702 continue;
38703 }
38704 if (M == SM_SentinelZero)
38705 continue;
38706 EltBits[i] = AllOnes;
38707 }
38708 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
38709 Res = CanonicalizeShuffleInput(MaskVT, V1);
38710 unsigned AndOpcode =
38711 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
38712 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
38713 return DAG.getBitcast(RootVT, Res);
38714 }
38715
38716 // If we have a single input shuffle with different shuffle patterns in the
38717 // 128-bit lanes, use the variable-mask form of VPERMILPS (VPERMILPV).
38718 // TODO: Combine other mask types at higher depths.
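// Illustrative example: for v8f32, each index selects an element within its
// own 128-bit lane, so a shuffle mask of {3,2,1,0,7,6,5,4} becomes the index
// vector {3,2,1,0,3,2,1,0}.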
38719 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
38720 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
38721 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
38722 SmallVector<SDValue, 16> VPermIdx;
38723 for (int M : Mask) {
38724 SDValue Idx =
38725 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
38726 VPermIdx.push_back(Idx);
38727 }
38728 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
38729 Res = CanonicalizeShuffleInput(MaskVT, V1);
38730 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
38731 return DAG.getBitcast(RootVT, Res);
38732 }
38733
38734 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
38735 // to VPERMIL2PD/VPERMIL2PS.
38736 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
38737 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
38738 MaskVT == MVT::v8f32)) {
38739 // VPERMIL2 Operation.
38740 // Bits[3] - Match Bit.
38741 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
38742 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
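// Illustrative example: for v4f32, mask element 5 (element 1 of the second
// source) becomes selector index 5, while the selector value 8 (match bit
// set) together with the M2Z immediate of 2 forces that result element to
// zero.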
38743 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
38744 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
38745 SmallVector<int, 8> VPerm2Idx;
38746 unsigned M2ZImm = 0;
38747 for (int M : Mask) {
38748 if (M == SM_SentinelUndef) {
38749 VPerm2Idx.push_back(-1);
38750 continue;
38751 }
38752 if (M == SM_SentinelZero) {
38753 M2ZImm = 2;
38754 VPerm2Idx.push_back(8);
38755 continue;
38756 }
38757 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
38758 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
38759 VPerm2Idx.push_back(Index);
38760 }
38761 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38762 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38763 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
38764 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
38765 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
38766 return DAG.getBitcast(RootVT, Res);
38767 }
38768
38769 // If we have 3 or more shuffle instructions or a chain involving a variable
38770 // mask, we can replace them with a single PSHUFB instruction profitably.
38771 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
38772 // instructions, but in practice PSHUFB tends to be *very* fast so we're
38773 // more aggressive.
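// Illustrative example: within each 16-byte lane the low bits of a control
// byte select the source byte and 0x80 zeroes the result byte, so a dword
// mask {3,2,1,0} expands (Ratio = 4) to the control bytes
// {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}.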
38774 if (UnaryShuffle && AllowVariablePerLaneMask &&
38775 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
38776 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
38777 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
38778 SmallVector<SDValue, 16> PSHUFBMask;
38779 int NumBytes = RootVT.getSizeInBits() / 8;
38780 int Ratio = NumBytes / NumMaskElts;
38781 for (int i = 0; i < NumBytes; ++i) {
38782 int M = Mask[i / Ratio];
38783 if (M == SM_SentinelUndef) {
38784 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
38785 continue;
38786 }
38787 if (M == SM_SentinelZero) {
38788 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38789 continue;
38790 }
38791 M = Ratio * M + i % Ratio;
38792 assert((M / 16) == (i / 16) && "Lane crossing detected");
38793 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38794 }
38795 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
38796 Res = CanonicalizeShuffleInput(ByteVT, V1);
38797 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
38798 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
38799 return DAG.getBitcast(RootVT, Res);
38800 }
38801
38802 // With XOP, if we have a 128-bit binary input shuffle we can always combine
38803 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
38804 // slower than PSHUFB on targets that support both.
38805 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
38806 Subtarget.hasXOP()) {
38807 // VPPERM Mask Operation
38808 // Bits[4:0] - Byte Index (0 - 31)
38809 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
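// Illustrative example: selector bytes 0-15 copy bytes from the first
// source, 16-31 from the second, and 0x80 (op field 4) yields zero; this is
// what the SM_SentinelZero handling below emits.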
38810 SmallVector<SDValue, 16> VPPERMMask;
38811 int NumBytes = 16;
38812 int Ratio = NumBytes / NumMaskElts;
38813 for (int i = 0; i < NumBytes; ++i) {
38814 int M = Mask[i / Ratio];
38815 if (M == SM_SentinelUndef) {
38816 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
38817 continue;
38818 }
38819 if (M == SM_SentinelZero) {
38820 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38821 continue;
38822 }
38823 M = Ratio * M + i % Ratio;
38824 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38825 }
38826 MVT ByteVT = MVT::v16i8;
38827 V1 = CanonicalizeShuffleInput(ByteVT, V1);
38828 V2 = CanonicalizeShuffleInput(ByteVT, V2);
38829 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
38830 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
38831 return DAG.getBitcast(RootVT, Res);
38832 }
38833
38834 // If that failed and either input is extracted then try to combine as a
38835 // shuffle with the larger type.
38836 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
38837 Inputs, Root, BaseMask, Depth, HasVariableMask,
38838 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
38839 return WideShuffle;
38840
38841 // If we have a dual input shuffle then lower to VPERMV3,
38842 // (non-VLX will pad to 512-bit shuffles)
38843 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
38844 ((Subtarget.hasAVX512() &&
38845 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
38846 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
38847 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
38848 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
38849 MaskVT == MVT::v16i32)) ||
38850 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38851 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
38852 MaskVT == MVT::v32i16)) ||
38853 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38854 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
38855 MaskVT == MVT::v64i8)))) {
38856 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38857 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38858 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38859 return DAG.getBitcast(RootVT, Res);
38860 }
38861
38862 // Failed to find any combines.
38863 return SDValue();
38864}
38865
38866// Combine an arbitrary chain of shuffles + extract_subvectors into a single
38867// instruction if possible.
38868//
38869// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
38870// type size to attempt to combine:
38871// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
38872// -->
38873// extract_subvector(shuffle(x,y,m2),0)
38874static SDValue combineX86ShuffleChainWithExtract(
38875 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38876 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38877 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38878 const X86Subtarget &Subtarget) {
38879 unsigned NumMaskElts = BaseMask.size();
38880 unsigned NumInputs = Inputs.size();
38881 if (NumInputs == 0)
38882 return SDValue();
38883
38884 EVT RootVT = Root.getValueType();
38885 unsigned RootSizeInBits = RootVT.getSizeInBits();
38886 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
38887 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
38888
38889 // Peek through extract_subvector to find widest legal vector.
38890 // TODO: Handle ISD::TRUNCATE
38891 unsigned WideSizeInBits = RootSizeInBits;
38892 for (unsigned I = 0; I != NumInputs; ++I) {
38893 SDValue Input = peekThroughBitcasts(Inputs[I]);
38894 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
38895 Input = peekThroughBitcasts(Input.getOperand(0));
38896 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
38897 WideSizeInBits < Input.getValueSizeInBits())
38898 WideSizeInBits = Input.getValueSizeInBits();
38899 }
38900
38901 // Bail if we fail to find a source larger than the existing root.
38902 unsigned Scale = WideSizeInBits / RootSizeInBits;
38903 if (WideSizeInBits <= RootSizeInBits ||
38904 (WideSizeInBits % RootSizeInBits) != 0)
38905 return SDValue();
38906
38907 // Create new mask for larger type.
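// Illustrative example: with NumMaskElts = 4 and Scale = 2, mask element 5
// (element 1 of the second input) is remapped to 1 + 1 * 2 * 4 = 9, and
// (Scale - 1) * NumMaskElts = 4 undef elements are then appended.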
38908 SmallVector<int, 64> WideMask(BaseMask);
38909 for (int &M : WideMask) {
38910 if (M < 0)
38911 continue;
38912 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
38913 }
38914 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
38915
38916 // Attempt to peek through inputs and adjust mask when we extract from an
38917 // upper subvector.
38918 int AdjustedMasks = 0;
38919 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
38920 for (unsigned I = 0; I != NumInputs; ++I) {
38921 SDValue &Input = WideInputs[I];
38922 Input = peekThroughBitcasts(Input);
38923 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
38924 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
38925 uint64_t Idx = Input.getConstantOperandVal(1);
38926 if (Idx != 0) {
38927 ++AdjustedMasks;
38928 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
38929 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
38930
38931 int lo = I * WideMask.size();
38932 int hi = (I + 1) * WideMask.size();
38933 for (int &M : WideMask)
38934 if (lo <= M && M < hi)
38935 M += Idx;
38936 }
38937 Input = peekThroughBitcasts(Input.getOperand(0));
38938 }
38939 }
38940
38941 // Remove unused/repeated shuffle source ops.
38942 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
38943 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
38944
38945 // Bail if we're always extracting from the lowest subvectors (in which case
38946 // combineX86ShuffleChain should match this for the current width), or if the
38947 // shuffle still references too many inputs.
38948 if (AdjustedMasks == 0 || WideInputs.size() > 2)
38949 return SDValue();
38950
38951 // Minor canonicalization of the accumulated shuffle mask to make it easier
38952 // to match below. All this does is detect masks with sequential pairs of
38953 // elements, and shrink them to the half-width mask. It does this in a loop
38954 // so it will reduce the size of the mask to the minimal width mask which
38955 // performs an equivalent shuffle.
38956 while (WideMask.size() > 1) {
38957 SmallVector<int, 64> WidenedMask;
38958 if (!canWidenShuffleElements(WideMask, WidenedMask))
38959 break;
38960 WideMask = std::move(WidenedMask);
38961 }
38962
38963 // Canonicalization of binary shuffle masks to improve pattern matching by
38964 // commuting the inputs.
38965 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
38966 ShuffleVectorSDNode::commuteMask(WideMask);
38967 std::swap(WideInputs[0], WideInputs[1]);
38968 }
38969
38970 // Increase depth for every upper subvector we've peeked through.
38971 Depth += AdjustedMasks;
38972
38973 // Attempt to combine wider chain.
38974 // TODO: Can we use a better Root?
38975 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
38976 WideInputs.back().getValueSizeInBits()
38977 ? WideInputs.front()
38978 : WideInputs.back();
38979 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
38980 "WideRootSize mismatch");
38981
38982 if (SDValue WideShuffle =
38983 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
38984 HasVariableMask, AllowVariableCrossLaneMask,
38985 AllowVariablePerLaneMask, DAG, Subtarget)) {
38986 WideShuffle =
38987 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
38988 return DAG.getBitcast(RootVT, WideShuffle);
38989 }
38990
38991 return SDValue();
38992}
38993
38994// Canonicalize the combined shuffle mask chain with horizontal ops.
38995// NOTE: This may update the Ops and Mask.
38996static SDValue canonicalizeShuffleMaskWithHorizOp(
38997 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
38998 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
38999 const X86Subtarget &Subtarget) {
39000 if (Mask.empty() || Ops.empty())
39001 return SDValue();
39002
39003 SmallVector<SDValue> BC;
39004 for (SDValue Op : Ops)
39005 BC.push_back(peekThroughBitcasts(Op));
39006
39007 // All ops must be the same horizop + type.
39008 SDValue BC0 = BC[0];
39009 EVT VT0 = BC0.getValueType();
39010 unsigned Opcode0 = BC0.getOpcode();
39011 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
39012 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
39013 }))
39014 return SDValue();
39015
39016 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
39017 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
39018 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
39019 if (!isHoriz && !isPack)
39020 return SDValue();
39021
39022 // Do all ops have a single use?
39023 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
39024 return Op.hasOneUse() &&
39025 peekThroughBitcasts(Op).hasOneUse();
39026 });
39027
39028 int NumElts = VT0.getVectorNumElements();
39029 int NumLanes = VT0.getSizeInBits() / 128;
39030 int NumEltsPerLane = NumElts / NumLanes;
39031 int NumHalfEltsPerLane = NumEltsPerLane / 2;
39032 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
39033 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
39034
39035 if (NumEltsPerLane >= 4 &&
39036 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
39037 SmallVector<int> LaneMask, ScaledMask;
39038 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
39039 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
39040 // See if we can remove the shuffle by resorting the HOP chain so that
39041 // the HOP args are pre-shuffled.
39042 // TODO: Generalize to any sized/depth chain.
39043 // TODO: Add support for PACKSS/PACKUS.
39044 if (isHoriz) {
39045 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
39046 auto GetHOpSrc = [&](int M) {
39047 if (M == SM_SentinelUndef)
39048 return DAG.getUNDEF(VT0);
39049 if (M == SM_SentinelZero)
39050 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
39051 SDValue Src0 = BC[M / 4];
39052 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
39053 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
39054 return Src1.getOperand(M % 2);
39055 return SDValue();
39056 };
39057 SDValue M0 = GetHOpSrc(ScaledMask[0]);
39058 SDValue M1 = GetHOpSrc(ScaledMask[1]);
39059 SDValue M2 = GetHOpSrc(ScaledMask[2]);
39060 SDValue M3 = GetHOpSrc(ScaledMask[3]);
39061 if (M0 && M1 && M2 && M3) {
39062 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
39063 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
39064 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39065 }
39066 }
39067 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
39068 if (Ops.size() >= 2) {
39069 SDValue LHS, RHS;
39070 auto GetHOpSrc = [&](int M, int &OutM) {
39071 // TODO: Support SM_SentinelZero
39072 if (M < 0)
39073 return M == SM_SentinelUndef;
39074 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
39075 if (!LHS || LHS == Src) {
39076 LHS = Src;
39077 OutM = (M % 2);
39078 return true;
39079 }
39080 if (!RHS || RHS == Src) {
39081 RHS = Src;
39082 OutM = (M % 2) + 2;
39083 return true;
39084 }
39085 return false;
39086 };
39087 int PostMask[4] = {-1, -1, -1, -1};
39088 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
39089 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
39090 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
39091 GetHOpSrc(ScaledMask[3], PostMask[3])) {
39092 LHS = DAG.getBitcast(SrcVT, LHS);
39093 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
39094 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39095 // Use SHUFPS for the permute so this will work on SSE2 targets;
39096 // shuffle combining and domain handling will simplify this later on.
39097 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
39098 Res = DAG.getBitcast(ShuffleVT, Res);
39099 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
39100 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
39101 }
39102 }
39103 }
39104 }
39105
39106 if (2 < Ops.size())
39107 return SDValue();
39108
39109 SDValue BC1 = BC[BC.size() - 1];
39110 if (Mask.size() == VT0.getVectorNumElements()) {
39111 // Canonicalize binary shuffles of horizontal ops that use the
39112 // same sources to an unary shuffle.
39113 // TODO: Try to perform this fold even if the shuffle remains.
39114 if (Ops.size() == 2) {
39115 auto ContainsOps = [](SDValue HOp, SDValue Op) {
39116 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
39117 };
39118 // Commute if all BC0's ops are contained in BC1.
39119 if (ContainsOps(BC1, BC0.getOperand(0)) &&
39120 ContainsOps(BC1, BC0.getOperand(1))) {
39121 ShuffleVectorSDNode::commuteMask(Mask);
39122 std::swap(Ops[0], Ops[1]);
39123 std::swap(BC0, BC1);
39124 }
39125
39126 // If BC1 can be represented by BC0, then convert to unary shuffle.
39127 if (ContainsOps(BC0, BC1.getOperand(0)) &&
39128 ContainsOps(BC0, BC1.getOperand(1))) {
39129 for (int &M : Mask) {
39130 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
39131 continue;
39132 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
39133 M -= NumElts + (SubLane * NumHalfEltsPerLane);
39134 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
39135 M += NumHalfEltsPerLane;
39136 }
39137 }
39138 }
39139
39140 // Canonicalize unary horizontal ops to only refer to lower halves.
39141 for (int i = 0; i != NumElts; ++i) {
39142 int &M = Mask[i];
39143 if (isUndefOrZero(M))
39144 continue;
39145 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
39146 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39147 M -= NumHalfEltsPerLane;
39148 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
39149 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39150 M -= NumHalfEltsPerLane;
39151 }
39152 }
39153
39154 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
39155 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
39156 // represents the LHS/RHS inputs for the lower/upper halves.
39157 SmallVector<int, 16> TargetMask128, WideMask128;
39158 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
39159 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
39160 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
39161 bool SingleOp = (Ops.size() == 1);
39162 if (isPack || OneUseOps ||
39163 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
39164 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
39165 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
39166 Lo = Lo.getOperand(WideMask128[0] & 1);
39167 Hi = Hi.getOperand(WideMask128[1] & 1);
39168 if (SingleOp) {
39169 SDValue Undef = DAG.getUNDEF(SrcVT);
39170 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
39171 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
39172 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
39173 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
39174 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
39175 }
39176 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
39177 }
39178 }
39179
39180 // If we are post-shuffling a 256-bit hop and not requiring the upper
39181 // elements, then try to narrow to a 128-bit hop directly.
39182 SmallVector<int, 16> WideMask64;
39183 if (Ops.size() == 1 && NumLanes == 2 &&
39184 scaleShuffleElements(Mask, 4, WideMask64) &&
39185 isUndefInRange(WideMask64, 2, 2)) {
39186 int M0 = WideMask64[0];
39187 int M1 = WideMask64[1];
39188 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
39189 MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
39190 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
39191 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
39192 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
39193 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
39194 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
39195 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
39196 }
39197 }
39198
39199 return SDValue();
39200}
39201
39202// Attempt to constant fold all of the constant source ops.
39203// Returns true if the entire shuffle is folded to a constant.
39204// TODO: Extend this to merge multiple constant Ops and update the mask.
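// For example (illustrative): shuffling the constant vector <1,2,3,4> with
// mask <3, SM_SentinelZero, 0, SM_SentinelUndef> folds directly to the
// constant <4, 0, 1, undef>, so no shuffle instruction is emitted at all.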
39205 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
39206 ArrayRef<int> Mask, SDValue Root,
39207 bool HasVariableMask,
39208 SelectionDAG &DAG,
39209 const X86Subtarget &Subtarget) {
39210 MVT VT = Root.getSimpleValueType();
39211
39212 unsigned SizeInBits = VT.getSizeInBits();
39213 unsigned NumMaskElts = Mask.size();
39214 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
39215 unsigned NumOps = Ops.size();
39216
39217 // Extract constant bits from each source op.
39218 SmallVector<APInt, 16> UndefEltsOps(NumOps);
39219 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
39220 for (unsigned I = 0; I != NumOps; ++I)
39221 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
39222 RawBitsOps[I],
39223 /*AllowWholeUndefs*/ true,
39224 /*AllowPartialUndefs*/ true))
39225 return SDValue();
39226
39227 // If we're optimizing for size, only fold if at least one of the constants is
39228 // only used once or the combined shuffle has included a variable mask
39229 // shuffle; this is to avoid constant pool bloat.
39230 bool IsOptimizingSize = DAG.shouldOptForSize();
39231 if (IsOptimizingSize && !HasVariableMask &&
39232 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
39233 return SDValue();
39234
39235 // Shuffle the constant bits according to the mask.
39236 SDLoc DL(Root);
39237 APInt UndefElts(NumMaskElts, 0);
39238 APInt ZeroElts(NumMaskElts, 0);
39239 APInt ConstantElts(NumMaskElts, 0);
39240 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
39241 APInt::getZero(MaskSizeInBits));
39242 for (unsigned i = 0; i != NumMaskElts; ++i) {
39243 int M = Mask[i];
39244 if (M == SM_SentinelUndef) {
39245 UndefElts.setBit(i);
39246 continue;
39247 } else if (M == SM_SentinelZero) {
39248 ZeroElts.setBit(i);
39249 continue;
39250 }
39251 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
39252
39253 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
39254 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
39255
39256 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
39257 if (SrcUndefElts[SrcMaskIdx]) {
39258 UndefElts.setBit(i);
39259 continue;
39260 }
39261
39262 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
39263 APInt &Bits = SrcEltBits[SrcMaskIdx];
39264 if (!Bits) {
39265 ZeroElts.setBit(i);
39266 continue;
39267 }
39268
39269 ConstantElts.setBit(i);
39270 ConstantBitData[i] = Bits;
39271 }
39272 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
39273
39274 // Attempt to create a zero vector.
39275 if ((UndefElts | ZeroElts).isAllOnes())
39276 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
39277
39278 // Create the constant data.
39279 MVT MaskSVT;
39280 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
39281 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
39282 else
39283 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
39284
39285 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
39286 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
39287 return SDValue();
39288
39289 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
39290 return DAG.getBitcast(VT, CstOp);
39291}
39292
39293namespace llvm {
39294 namespace X86 {
39295 enum {
39296 MaxShuffleCombineDepth = 8
39297 };
39298 } // namespace X86
39299} // namespace llvm
39300
39301/// Fully generic combining of x86 shuffle instructions.
39302///
39303/// This should be the last combine run over the x86 shuffle instructions. Once
39304/// they have been fully optimized, this will recursively consider all chains
39305/// of single-use shuffle instructions, build a generic model of the cumulative
39306/// shuffle operation, and check for simpler instructions which implement this
39307/// operation. We use this primarily for two purposes:
39308///
39309/// 1) Collapse generic shuffles to specialized single instructions when
39310/// equivalent. In most cases, this is just an encoding size win, but
39311/// sometimes we will collapse multiple generic shuffles into a single
39312/// special-purpose shuffle.
39313/// 2) Look for sequences of shuffle instructions with 3 or more total
39314/// instructions, and replace them with the slightly more expensive SSSE3
39315/// PSHUFB instruction if available. We do this as the last combining step
39316/// to ensure we avoid using PSHUFB if we can implement the shuffle with
39317/// a suitable short sequence of other instructions. The PSHUFB will either
39318/// use a register or have to read from memory and so is slightly (but only
39319/// slightly) more expensive than the other shuffle instructions.
39320///
39321/// Because this is inherently a quadratic operation (for each shuffle in
39322/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
39323/// This should never be an issue in practice as the shuffle lowering doesn't
39324/// produce sequences of more than 8 instructions.
39325///
39326/// FIXME: We will currently miss some cases where the redundant shuffling
39327/// would simplify under the threshold for PSHUFB formation because of
39328/// combine-ordering. To fix this, we should do the redundant instruction
39329/// combining in this recursive walk.
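/// As a small illustrative example of the mask merging below: a root PSHUFD
/// with mask <2,3,0,1> whose input is another PSHUFD with mask <1,0,3,2>
/// composes to the single mask <3,2,1,0> (element i of the result is
/// OpMask[RootMask[i]]), which can then be matched as one shuffle.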
39330 static SDValue combineX86ShufflesRecursively(
39331 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
39332 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
39333 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
39334 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39335 const X86Subtarget &Subtarget) {
39336 assert(!RootMask.empty() &&
39337 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
39338 "Illegal shuffle root mask");
39339 MVT RootVT = Root.getSimpleValueType();
39340 assert(RootVT.isVector() && "Shuffles operate on vector types!");
39341 unsigned RootSizeInBits = RootVT.getSizeInBits();
39342
39343 // Bound the depth of our recursive combine because this is ultimately
39344 // quadratic in nature.
39345 if (Depth >= MaxDepth)
39346 return SDValue();
39347
39348 // Directly rip through bitcasts to find the underlying operand.
39349 SDValue Op = SrcOps[SrcOpIndex];
39350 Op = peekThroughBitcasts(Op);
39351
39352 EVT VT = Op.getValueType();
39353 if (!VT.isVector() || !VT.isSimple())
39354 return SDValue(); // Bail if we hit a non-simple non-vector.
39355
39356 // FIXME: Just bail on f16 for now.
39357 if (VT.getVectorElementType() == MVT::f16)
39358 return SDValue();
39359
39360 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
39361 "Can only combine shuffles upto size of the root op.");
39362
39363 // Create a demanded elts mask from the referenced elements of Op.
39364 APInt OpDemandedElts = APInt::getZero(RootMask.size());
39365 for (int M : RootMask) {
39366 int BaseIdx = RootMask.size() * SrcOpIndex;
39367 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
39368 OpDemandedElts.setBit(M - BaseIdx);
39369 }
39370 if (RootSizeInBits != VT.getSizeInBits()) {
39371 // Op is smaller than Root - extract the demanded elts for the subvector.
39372 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
39373 unsigned NumOpMaskElts = RootMask.size() / Scale;
39374 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
39375 assert(OpDemandedElts
39376 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
39377 .isZero() &&
39378 "Out of range elements referenced in root mask");
39379 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
39380 }
39381 OpDemandedElts =
39382 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
39383
39384 // Extract target shuffle mask and resolve sentinels and inputs.
39385 SmallVector<int, 64> OpMask;
39386 SmallVector<SDValue, 2> OpInputs;
39387 APInt OpUndef, OpZero;
39388 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
39389 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
39390 OpZero, DAG, Depth, false)) {
39391 // Shuffle inputs must not be larger than the shuffle result.
39392 // TODO: Relax this for single input faux shuffles (e.g. trunc).
39393 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
39394 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
39395 }))
39396 return SDValue();
39397 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39398 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
39399 !isNullConstant(Op.getOperand(1))) {
39400 SDValue SrcVec = Op.getOperand(0);
39401 int ExtractIdx = Op.getConstantOperandVal(1);
39402 unsigned NumElts = VT.getVectorNumElements();
39403 OpInputs.assign({SrcVec});
39404 OpMask.assign(NumElts, SM_SentinelUndef);
39405 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
39406 OpZero = OpUndef = APInt::getZero(NumElts);
39407 } else {
39408 return SDValue();
39409 }
39410
39411 // If the shuffle result was smaller than the root, we need to adjust the
39412 // mask indices and pad the mask with undefs.
39413 if (RootSizeInBits > VT.getSizeInBits()) {
39414 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
39415 unsigned OpMaskSize = OpMask.size();
39416 if (OpInputs.size() > 1) {
39417 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
39418 for (int &M : OpMask) {
39419 if (M < 0)
39420 continue;
39421 int EltIdx = M % OpMaskSize;
39422 int OpIdx = M / OpMaskSize;
39423 M = (PaddedMaskSize * OpIdx) + EltIdx;
39424 }
39425 }
39426 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
39427 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
39428 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
39429 }
39430
39431 SmallVector<int, 64> Mask;
39432 SmallVector<SDValue, 16> Ops;
39433
39434 // We don't need to merge masks if the root is empty.
39435 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
39436 if (EmptyRoot) {
39437 // Only resolve zeros if it will remove an input, otherwise we might end
39438 // up in an infinite loop.
39439 bool ResolveKnownZeros = true;
39440 if (!OpZero.isZero()) {
39441 APInt UsedInputs = APInt::getZero(OpInputs.size());
39442 for (int i = 0, e = OpMask.size(); i != e; ++i) {
39443 int M = OpMask[i];
39444 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
39445 continue;
39446 UsedInputs.setBit(M / OpMask.size());
39447 if (UsedInputs.isAllOnes()) {
39448 ResolveKnownZeros = false;
39449 break;
39450 }
39451 }
39452 }
39453 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
39454 ResolveKnownZeros);
39455
39456 Mask = OpMask;
39457 Ops.append(OpInputs.begin(), OpInputs.end());
39458 } else {
39459 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
39460
39461 // Add the inputs to the Ops list, avoiding duplicates.
39462 Ops.append(SrcOps.begin(), SrcOps.end());
39463
39464 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
39465 // Attempt to find an existing match.
39466 SDValue InputBC = peekThroughBitcasts(Input);
39467 for (int i = 0, e = Ops.size(); i < e; ++i)
39468 if (InputBC == peekThroughBitcasts(Ops[i]))
39469 return i;
39470 // Match failed - should we replace an existing Op?
39471 if (InsertionPoint >= 0) {
39472 Ops[InsertionPoint] = Input;
39473 return InsertionPoint;
39474 }
39475 // Add to the end of the Ops list.
39476 Ops.push_back(Input);
39477 return Ops.size() - 1;
39478 };
39479
39480 SmallVector<int, 2> OpInputIdx;
39481 for (SDValue OpInput : OpInputs)
39482 OpInputIdx.push_back(
39483 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
39484
39485 assert(((RootMask.size() > OpMask.size() &&
39486 RootMask.size() % OpMask.size() == 0) ||
39487 (OpMask.size() > RootMask.size() &&
39488 OpMask.size() % RootMask.size() == 0) ||
39489 OpMask.size() == RootMask.size()) &&
39490 "The smaller number of elements must divide the larger.");
39491
39492 // This function can be performance-critical, so we rely on the power-of-2
39493 // knowledge that we have about the mask sizes to replace div/rem ops with
39494 // bit-masks and shifts.
39495 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
39496 "Non-power-of-2 shuffle mask sizes");
39497 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
39498 "Non-power-of-2 shuffle mask sizes");
39499 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
39500 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
39501
39502 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
39503 unsigned RootRatio =
39504 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
39505 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
39506 assert((RootRatio == 1 || OpRatio == 1) &&
39507 "Must not have a ratio for both incoming and op masks!");
39508
39509 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
39510 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
39511 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
39512 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
39513 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
39514
39515 Mask.resize(MaskWidth, SM_SentinelUndef);
39516
39517 // Merge this shuffle operation's mask into our accumulated mask. Note that
39518 // this shuffle's mask will be the first applied to the input, followed by
39519 // the root mask to get us all the way to the root value arrangement. The
39520 // reason for this order is that we are recursing up the operation chain.
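    // For example (illustrative): with a 4-element root mask over an
    // 8-element op mask (RootRatio == 2, OpRatio == 1), a root element
    // RootMask[1] == 1 scales to op indices 2 and 3, so merged elements 2-3
    // become OpMask[2] and OpMask[3] before the final remap onto the merged
    // Ops list.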
39521 for (unsigned i = 0; i < MaskWidth; ++i) {
39522 unsigned RootIdx = i >> RootRatioLog2;
39523 if (RootMask[RootIdx] < 0) {
39524 // This is a zero or undef lane, we're done.
39525 Mask[i] = RootMask[RootIdx];
39526 continue;
39527 }
39528
39529 unsigned RootMaskedIdx =
39530 RootRatio == 1
39531 ? RootMask[RootIdx]
39532 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
39533
39534 // Just insert the scaled root mask value if it references an input other
39535 // than the SrcOp we're currently inserting.
39536 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
39537 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
39538 Mask[i] = RootMaskedIdx;
39539 continue;
39540 }
39541
39542 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
39543 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
39544 if (OpMask[OpIdx] < 0) {
39545 // The incoming lanes are zero or undef, it doesn't matter which ones we
39546 // are using.
39547 Mask[i] = OpMask[OpIdx];
39548 continue;
39549 }
39550
39551 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
39552 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
39553 : (OpMask[OpIdx] << OpRatioLog2) +
39554 (RootMaskedIdx & (OpRatio - 1));
39555
39556 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
39557 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
39558 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
39559 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
39560
39561 Mask[i] = OpMaskedIdx;
39562 }
39563 }
39564
39565 // Peek through vector widenings and set out of bounds mask indices to undef.
39566 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
39567 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
39568 SDValue &Op = Ops[I];
39569 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
39570 isNullConstant(Op.getOperand(2))) {
39571 Op = Op.getOperand(1);
39572 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
39573 int Lo = I * Mask.size();
39574 int Hi = (I + 1) * Mask.size();
39575 int NewHi = Lo + (Mask.size() / Scale);
39576 for (int &M : Mask) {
39577 if (Lo <= M && NewHi <= M && M < Hi)
39578 M = SM_SentinelUndef;
39579 }
39580 }
39581 }
39582
39583 // Peek through any free extract_subvector nodes back to root size.
39584 for (SDValue &Op : Ops)
39585 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39586 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
39587 isNullConstant(Op.getOperand(1)))
39588 Op = Op.getOperand(0);
39589
39590 // Remove unused/repeated shuffle source ops.
39591 resolveTargetShuffleInputsAndMask(Ops, Mask);
39592
39593 // Handle the all undef/zero/ones cases early.
39594 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
39595 return DAG.getUNDEF(RootVT);
39596 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
39597 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
39598 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
39599 !llvm::is_contained(Mask, SM_SentinelZero))
39600 return getOnesVector(RootVT, DAG, SDLoc(Root));
39601
39602 assert(!Ops.empty() && "Shuffle with no inputs detected");
39603 HasVariableMask |= IsOpVariableMask;
39604
39605 // Update the list of shuffle nodes that have been combined so far.
39606 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
39607 SrcNodes.end());
39608 CombinedNodes.push_back(Op.getNode());
39609
39610 // See if we can recurse into each shuffle source op (if it's a target
39611 // shuffle). The source op should generally only be combined if it either has
39612 // a single use (i.e. current Op) or all its users have already been combined;
39613 // if not, we can still combine but should prevent generation of variable
39614 // shuffles to avoid constant pool bloat.
39615 // Don't recurse if we already have more source ops than we can combine in
39616 // the remaining recursion depth.
39617 if (Ops.size() < (MaxDepth - Depth)) {
39618 for (int i = 0, e = Ops.size(); i < e; ++i) {
39619 // For empty roots, we need to resolve zeroable elements before combining
39620 // them with other shuffles.
39621 SmallVector<int, 64> ResolvedMask = Mask;
39622 if (EmptyRoot)
39623 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
39624 bool AllowCrossLaneVar = false;
39625 bool AllowPerLaneVar = false;
39626 if (Ops[i].getNode()->hasOneUse() ||
39627 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
39628 AllowCrossLaneVar = AllowVariableCrossLaneMask;
39629 AllowPerLaneVar = AllowVariablePerLaneMask;
39630 }
39631 if (SDValue Res = combineX86ShufflesRecursively(
39632 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
39633 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
39634 Subtarget))
39635 return Res;
39636 }
39637 }
39638
39639 // Attempt to constant fold all of the constant source ops.
39640 if (SDValue Cst = combineX86ShufflesConstants(
39641 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
39642 return Cst;
39643
39644 // If constant fold failed and we only have constants - then we have
39645 // multiple uses by a single non-variable shuffle - just bail.
39646 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
39647 APInt UndefElts;
39648 SmallVector<APInt> RawBits;
39649 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
39650 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
39651 RawBits,
39652 /*AllowWholeUndefs*/ true,
39653 /*AllowPartialUndefs*/ true);
39654 })) {
39655 return SDValue();
39656 }
39657
39658 // Canonicalize the combined shuffle mask chain with horizontal ops.
39659 // NOTE: This will update the Ops and Mask.
39660 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
39661 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
39662 return DAG.getBitcast(RootVT, HOp);
39663
39664 // Try to refine our inputs given our knowledge of target shuffle mask.
39665 for (auto I : enumerate(Ops)) {
39666 int OpIdx = I.index();
39667 SDValue &Op = I.value();
39668
39669 // What range of shuffle mask element values results in picking from Op?
39670 int Lo = OpIdx * Mask.size();
39671 int Hi = Lo + Mask.size();
39672
39673 // Which elements of Op do we demand, given the mask's granularity?
39674 APInt OpDemandedElts(Mask.size(), 0);
39675 for (int MaskElt : Mask) {
39676 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
39677 int OpEltIdx = MaskElt - Lo;
39678 OpDemandedElts.setBit(OpEltIdx);
39679 }
39680 }
39681
39682 // Is the shuffle result smaller than the root?
39683 if (Op.getValueSizeInBits() < RootSizeInBits) {
39684 // We padded the mask with undefs. But we now need to undo that.
39685 unsigned NumExpectedVectorElts = Mask.size();
39686 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
39687 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
39688 assert(!OpDemandedElts.extractBits(
39689 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
39690 "Demanding the virtual undef widening padding?");
39691 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
39692 }
39693
39694 // The Op itself may be of different VT, so we need to scale the mask.
39695 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
39696 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
39697
39698 // Can this operand be simplified any further, given its demanded elements?
39699 if (SDValue NewOp =
39700 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
39701 Op, OpScaledDemandedElts, DAG))
39702 Op = NewOp;
39703 }
39704 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
39705
39706 // Widen any subvector shuffle inputs we've collected.
39707 // TODO: Remove this to avoid generating temporary nodes, we should only
39708 // widen once combineX86ShuffleChain has found a match.
39709 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
39710 return Op.getValueSizeInBits() < RootSizeInBits;
39711 })) {
39712 for (SDValue &Op : Ops)
39713 if (Op.getValueSizeInBits() < RootSizeInBits)
39714 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
39715 RootSizeInBits);
39716 // Reresolve - we might have repeated subvector sources.
39717 resolveTargetShuffleInputsAndMask(Ops, Mask);
39718 }
39719
39720 // We can only combine unary and binary shuffle mask cases.
39721 if (Ops.size() <= 2) {
39722 // Minor canonicalization of the accumulated shuffle mask to make it easier
39723 // to match below. All this does is detect masks with sequential pairs of
39724 // elements, and shrink them to the half-width mask. It does this in a loop
39725 // so it will reduce the size of the mask to the minimal width mask which
39726 // performs an equivalent shuffle.
39727 while (Mask.size() > 1) {
39728 SmallVector<int, 64> WidenedMask;
39729 if (!canWidenShuffleElements(Mask, WidenedMask))
39730 break;
39731 Mask = std::move(WidenedMask);
39732 }
39733
39734 // Canonicalization of binary shuffle masks to improve pattern matching by
39735 // commuting the inputs.
39736 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
39737 ShuffleVectorSDNode::commuteMask(Mask);
39738 std::swap(Ops[0], Ops[1]);
39739 }
39740
39741 // Try to combine into a single shuffle instruction.
39742 if (SDValue Shuffle = combineX86ShuffleChain(
39743 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
39744 AllowVariablePerLaneMask, DAG, Subtarget))
39745 return Shuffle;
39746
39747 // If all the operands come from the same larger vector, fallthrough and try
39748 // to use combineX86ShuffleChainWithExtract.
39749 SDValue LHS = peekThroughBitcasts(Ops.front());
39750 SDValue RHS = peekThroughBitcasts(Ops.back());
39751 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
39752 (RootSizeInBits / Mask.size()) != 64 ||
39753 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
39754 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
39755 LHS.getOperand(0) != RHS.getOperand(0))
39756 return SDValue();
39757 }
39758
39759 // If that failed and any input is extracted then try to combine as a
39760 // shuffle with the larger type.
39761 return combineX86ShuffleChainWithExtract(
39762 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
39763 AllowVariablePerLaneMask, DAG, Subtarget);
39764}
39765
39766/// Helper entry wrapper to combineX86ShufflesRecursively.
39767 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
39768 const X86Subtarget &Subtarget) {
39769 return combineX86ShufflesRecursively(
39770 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
39771 /*HasVarMask*/ false,
39772 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
39773 Subtarget);
39774}
39775
39776/// Get the PSHUF-style mask from PSHUF node.
39777///
39778 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
39779/// PSHUF-style masks that can be reused with such instructions.
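/// For example (illustrative): a v8i16 PSHUFHW with immediate 0x1B has the
/// full mask <0,1,2,3,7,6,5,4>; this helper returns the normalized v4 mask
/// <3,2,1,0> for the shuffled (high) half.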
39780 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
39781 MVT VT = N.getSimpleValueType();
39782 SmallVector<int, 4> Mask;
39783 SmallVector<SDValue, 2> Ops;
39784 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
39785 (void)HaveMask;
39786 assert(HaveMask);
39787
39788 // If we have more than 128-bits, only the low 128-bits of shuffle mask
39789 // matter. Check that the upper masks are repeats and remove them.
39790 if (VT.getSizeInBits() > 128) {
39791 int LaneElts = 128 / VT.getScalarSizeInBits();
39792#ifndef NDEBUG
39793 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
39794 for (int j = 0; j < LaneElts; ++j)
39795 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
39796 "Mask doesn't repeat in high 128-bit lanes!");
39797#endif
39798 Mask.resize(LaneElts);
39799 }
39800
39801 switch (N.getOpcode()) {
39802 case X86ISD::PSHUFD:
39803 return Mask;
39804 case X86ISD::PSHUFLW:
39805 Mask.resize(4);
39806 return Mask;
39807 case X86ISD::PSHUFHW:
39808 Mask.erase(Mask.begin(), Mask.begin() + 4);
39809 for (int &M : Mask)
39810 M -= 4;
39811 return Mask;
39812 default:
39813 llvm_unreachable("No valid shuffle instruction found!");
39814 }
39815}
39816
39817/// Search for a combinable shuffle across a chain ending in pshufd.
39818///
39819/// We walk up the chain and look for a combinable shuffle, skipping over
39820/// shuffles that we could hoist this shuffle's transformation past without
39821/// altering anything.
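/// For example (illustrative): a PSHUFD with mask <1,0,3,2> of a single-use
/// PSHUFD with mask <2,3,0,1> is rebuilt as one PSHUFD with the composed mask
/// <3,2,1,0>, i.e. each Mask[i] becomes VMask[Mask[i]].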
39822 static SDValue
39823 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
39824 const SDLoc &DL,
39825 SelectionDAG &DAG) {
39826 assert(N.getOpcode() == X86ISD::PSHUFD &&
39827 "Called with something other than an x86 128-bit half shuffle!");
39828
39829 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
39830 // of the shuffles in the chain so that we can form a fresh chain to replace
39831 // this one.
39832 SmallVector<SDValue, 8> Chain;
39833 SDValue V = N.getOperand(0);
39834 for (; V.hasOneUse(); V = V.getOperand(0)) {
39835 switch (V.getOpcode()) {
39836 default:
39837 return SDValue(); // Nothing combined!
39838
39839 case ISD::BITCAST:
39840 // Skip bitcasts as we always know the type for the target specific
39841 // instructions.
39842 continue;
39843
39844 case X86ISD::PSHUFD:
39845 // Found another dword shuffle.
39846 break;
39847
39848 case X86ISD::PSHUFLW:
39849 // Check that the low words (being shuffled) are the identity in the
39850 // dword shuffle, and the high words are self-contained.
39851 if (Mask[0] != 0 || Mask[1] != 1 ||
39852 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
39853 return SDValue();
39854
39855 Chain.push_back(V);
39856 continue;
39857
39858 case X86ISD::PSHUFHW:
39859 // Check that the high words (being shuffled) are the identity in the
39860 // dword shuffle, and the low words are self-contained.
39861 if (Mask[2] != 2 || Mask[3] != 3 ||
39862 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
39863 return SDValue();
39864
39865 Chain.push_back(V);
39866 continue;
39867
39868 case X86ISD::UNPCKL:
39869 case X86ISD::UNPCKH:
39870 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
39871 // shuffle into a preceding word shuffle.
39872 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
39873 V.getSimpleValueType().getVectorElementType() != MVT::i16)
39874 return SDValue();
39875
39876 // Search for a half-shuffle which we can combine with.
39877 unsigned CombineOp =
39878 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
39879 if (V.getOperand(0) != V.getOperand(1) ||
39880 !V->isOnlyUserOf(V.getOperand(0).getNode()))
39881 return SDValue();
39882 Chain.push_back(V);
39883 V = V.getOperand(0);
39884 do {
39885 switch (V.getOpcode()) {
39886 default:
39887 return SDValue(); // Nothing to combine.
39888
39889 case X86ISD::PSHUFLW:
39890 case X86ISD::PSHUFHW:
39891 if (V.getOpcode() == CombineOp)
39892 break;
39893
39894 Chain.push_back(V);
39895
39896 [[fallthrough]];
39897 case ISD::BITCAST:
39898 V = V.getOperand(0);
39899 continue;
39900 }
39901 break;
39902 } while (V.hasOneUse());
39903 break;
39904 }
39905 // Break out of the loop if we break out of the switch.
39906 break;
39907 }
39908
39909 if (!V.hasOneUse())
39910 // We fell out of the loop without finding a viable combining instruction.
39911 return SDValue();
39912
39913 // Merge this node's mask and our incoming mask.
39914 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
39915 for (int &M : Mask)
39916 M = VMask[M];
39917 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
39918 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
39919
39920 // Rebuild the chain around this new shuffle.
39921 while (!Chain.empty()) {
39922 SDValue W = Chain.pop_back_val();
39923
39924 if (V.getValueType() != W.getOperand(0).getValueType())
39925 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
39926
39927 switch (W.getOpcode()) {
39928 default:
39929 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
39930
39931 case X86ISD::UNPCKL:
39932 case X86ISD::UNPCKH:
39933 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
39934 break;
39935
39936 case X86ISD::PSHUFD:
39937 case X86ISD::PSHUFLW:
39938 case X86ISD::PSHUFHW:
39939 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
39940 break;
39941 }
39942 }
39943 if (V.getValueType() != N.getValueType())
39944 V = DAG.getBitcast(N.getValueType(), V);
39945
39946 // Return the new chain to replace N.
39947 return V;
39948}
39949
39950// Attempt to commute shufps LHS loads:
39951// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
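// For example (illustrative): SHUFPS(load, x, 0xB1) becomes
// SHUFPS(x, load, 0x1B) - the nibbles of the immediate swap because the two
// 64-bit halves of the result swap, and the outer VPERMILPI immediate is
// XORed with 0xAA so each 2-bit index selects from the opposite half.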
39952 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
39953 SelectionDAG &DAG) {
39954 // TODO: Add vXf64 support.
39955 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
39956 return SDValue();
39957
39958 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
39959 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
39960 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
39961 return SDValue();
39962 SDValue N0 = V.getOperand(0);
39963 SDValue N1 = V.getOperand(1);
39964 unsigned Imm = V.getConstantOperandVal(2);
39965 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
39966 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
39967 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
39968 return SDValue();
39969 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
39970 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
39971 DAG.getTargetConstant(Imm, DL, MVT::i8));
39972 };
39973
39974 switch (N.getOpcode()) {
39975 case X86ISD::VPERMILPI:
39976 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
39977 unsigned Imm = N.getConstantOperandVal(1);
39978 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
39979 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
39980 }
39981 break;
39982 case X86ISD::SHUFP: {
39983 SDValue N0 = N.getOperand(0);
39984 SDValue N1 = N.getOperand(1);
39985 unsigned Imm = N.getConstantOperandVal(2);
39986 if (N0 == N1) {
39987 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
39988 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
39989 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
39990 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
39991 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
39992 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
39993 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
39994 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
39995 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
39996 }
39997 break;
39998 }
39999 }
40000
40001 return SDValue();
40002}
40003
40004// TODO - move this to TLI like isBinOp?
40005static bool isUnaryOp(unsigned Opcode) {
40006 switch (Opcode) {
40007 case ISD::CTLZ:
40008 case ISD::CTTZ:
40009 case ISD::CTPOP:
40010 return true;
40011 }
40012 return false;
40013}
40014
40015// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
40016// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
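// For example (illustrative): PSHUFD(AND(X, SplatC)) becomes
// AND(PSHUFD(X), PSHUFD(SplatC)); the shuffle of the splat constant folds
// away for free, and PSHUFD(X) may now combine with whatever produced X.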
40017 static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
40018 const SDLoc &DL) {
40019 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40020 EVT ShuffleVT = N.getValueType();
40021 unsigned Opc = N.getOpcode();
40022
40023 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true,
40024 bool FoldLoad = false) {
40025 // AllZeros/AllOnes constants are freely shuffled and will peek through
40026 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
40027 // merge with target shuffles if it has one use so shuffle combining is
40028 // likely to kick in. Shuffles of splats are expected to be removed.
40029 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
40030 ISD::isBuildVectorAllZeros(Op.getNode()) ||
40031 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
40032 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
40033 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
40034 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
40035 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
40036 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
40037 (FoldLoad && isShuffleFoldableLoad(Op)) ||
40038 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
40039 };
40040 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
40041 // Ensure we only shuffle whole vector src elements, unless it's a logical
40042 // binop, where we can more aggressively move shuffles from dst to src.
40043 return isLogicOp(BinOp) ||
40044 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
40045 };
40046
40047 switch (Opc) {
40048 // Unary and Unary+Permute Shuffles.
40049 case X86ISD::PSHUFB: {
40050 // Don't merge PSHUFB if it contains zero'd elements.
40051 SmallVector<int> Mask;
40052 SmallVector<SDValue> Ops;
40053 if (!getTargetShuffleMask(N, false, Ops, Mask))
40054 break;
40055 [[fallthrough]];
40056 }
40057 case X86ISD::VBROADCAST:
40058 case X86ISD::MOVDDUP:
40059 case X86ISD::PSHUFD:
40060 case X86ISD::PSHUFHW:
40061 case X86ISD::PSHUFLW:
40062 case X86ISD::VPERMI:
40063 case X86ISD::VPERMILPI: {
40064 if (N.getOperand(0).getValueType() == ShuffleVT &&
40065 N->isOnlyUserOf(N.getOperand(0).getNode())) {
40066 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40067 unsigned SrcOpcode = N0.getOpcode();
40068 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
40069 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40070 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40071 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::VPERMI,
40072 Opc != X86ISD::PSHUFB) ||
40073 IsMergeableWithShuffle(Op01, Opc != X86ISD::VPERMI,
40074 Opc != X86ISD::PSHUFB)) {
40075 SDValue LHS, RHS;
40076 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40077 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40078 if (N.getNumOperands() == 2) {
40079 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
40080 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
40081 } else {
40082 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
40083 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
40084 }
40085 EVT OpVT = N0.getValueType();
40086 return DAG.getBitcast(ShuffleVT,
40087 DAG.getNode(SrcOpcode, DL, OpVT,
40088 DAG.getBitcast(OpVT, LHS),
40089 DAG.getBitcast(OpVT, RHS)));
40090 }
40091 }
40092 }
40093 break;
40094 }
40095 // Binary and Binary+Permute Shuffles.
40096 case X86ISD::INSERTPS: {
40097 // Don't merge INSERTPS if it contains zero'd elements.
40098 unsigned InsertPSMask = N.getConstantOperandVal(2);
40099 unsigned ZeroMask = InsertPSMask & 0xF;
40100 if (ZeroMask != 0)
40101 break;
40102 [[fallthrough]];
40103 }
40104 case X86ISD::MOVSD:
40105 case X86ISD::MOVSS:
40106 case X86ISD::BLENDI:
40107 case X86ISD::SHUFP:
40108 case X86ISD::UNPCKH:
40109 case X86ISD::UNPCKL: {
40110 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
40111 N->isOnlyUserOf(N.getOperand(1).getNode())) {
40112 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40113 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
40114 unsigned SrcOpcode = N0.getOpcode();
40115 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
40116 N0.getValueType() == N1.getValueType() &&
40117 IsSafeToMoveShuffle(N0, SrcOpcode) &&
40118 IsSafeToMoveShuffle(N1, SrcOpcode)) {
40119 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40120 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
40121 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40122 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
40123 // Ensure the total number of shuffles doesn't increase by folding this
40124 // shuffle through to the source ops.
40125 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
40126 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
40127 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
40128 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
40129 SDValue LHS, RHS;
40130 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40131 Op10 = DAG.getBitcast(ShuffleVT, Op10);
40132 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40133 Op11 = DAG.getBitcast(ShuffleVT, Op11);
40134 if (N.getNumOperands() == 3) {
40135 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
40136 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
40137 } else {
40138 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
40139 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
40140 }
40141 EVT OpVT = N0.getValueType();
40142 return DAG.getBitcast(ShuffleVT,
40143 DAG.getNode(SrcOpcode, DL, OpVT,
40144 DAG.getBitcast(OpVT, LHS),
40145 DAG.getBitcast(OpVT, RHS)));
40146 }
40147 }
40148 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
40149 N0.getValueType() == N1.getValueType() &&
40150 IsSafeToMoveShuffle(N0, SrcOpcode) &&
40151 IsSafeToMoveShuffle(N1, SrcOpcode)) {
40152 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40153 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
40154 SDValue Res;
40155 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40156 Op10 = DAG.getBitcast(ShuffleVT, Op10);
40157 if (N.getNumOperands() == 3) {
40158 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
40159 } else {
40160 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
40161 }
40162 EVT OpVT = N0.getValueType();
40163 return DAG.getBitcast(
40164 ShuffleVT,
40165 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
40166 }
40167 }
40168 break;
40169 }
40170 }
40171 return SDValue();
40172}
40173
40174/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
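/// For example (illustrative): VPERM2X128(VSRLI(X,4), VSRLI(Y,4), imm) with
/// matching shift amounts becomes VSRLI(VPERM2X128(X, Y, imm), 4), exposing
/// the lane shuffle to further combines with X and Y.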
40175 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
40176 SelectionDAG &DAG,
40177 const SDLoc &DL) {
40178 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
40179
40180 MVT VT = V.getSimpleValueType();
40181 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
40182 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
40183 unsigned SrcOpc0 = Src0.getOpcode();
40184 unsigned SrcOpc1 = Src1.getOpcode();
40185 EVT SrcVT0 = Src0.getValueType();
40186 EVT SrcVT1 = Src1.getValueType();
40187
40188 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
40189 return SDValue();
40190
40191 switch (SrcOpc0) {
40192 case X86ISD::MOVDDUP: {
40193 SDValue LHS = Src0.getOperand(0);
40194 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40195 SDValue Res =
40196 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
40197 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
40198 return DAG.getBitcast(VT, Res);
40199 }
40200 case X86ISD::VPERMILPI:
40201 // TODO: Handle v4f64 permutes with different low/high lane masks.
40202 if (SrcVT0 == MVT::v4f64) {
40203 uint64_t Mask = Src0.getConstantOperandVal(1);
40204 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
40205 break;
40206 }
40207 [[fallthrough]];
40208 case X86ISD::VSHLI:
40209 case X86ISD::VSRLI:
40210 case X86ISD::VSRAI:
40211 case X86ISD::PSHUFD:
40212 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
40213 SDValue LHS = Src0.getOperand(0);
40214 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40215 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
40216 V.getOperand(2));
40217 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
40218 return DAG.getBitcast(VT, Res);
40219 }
40220 break;
40221 }
40222
40223 return SDValue();
40224}
40225
40226/// Try to combine x86 target specific shuffles.
40227 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
40228 SelectionDAG &DAG,
40229 TargetLowering::DAGCombinerInfo &DCI,
40230 const X86Subtarget &Subtarget) {
40231 MVT VT = N.getSimpleValueType();
40232 SmallVector<int, 4> Mask;
40233 unsigned Opcode = N.getOpcode();
40234 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40235
40236 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
40237 return R;
40238
40239 // Handle specific target shuffles.
40240 switch (Opcode) {
40241 case X86ISD::MOVDDUP: {
40242 SDValue Src = N.getOperand(0);
40243 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
40244 if (VT == MVT::v2f64 && Src.hasOneUse() &&
40245 ISD::isNormalLoad(Src.getNode())) {
40246 LoadSDNode *LN = cast<LoadSDNode>(Src);
40247 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
40248 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
40249 DCI.CombineTo(N.getNode(), Movddup);
40250 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40251 DCI.recursivelyDeleteUnusedNodes(LN);
40252 return N; // Return N so it doesn't get rechecked!
40253 }
40254 }
40255
40256 return SDValue();
40257 }
40258 case X86ISD::VBROADCAST: {
40259 SDValue Src = N.getOperand(0);
40260 SDValue BC = peekThroughBitcasts(Src);
40261 EVT SrcVT = Src.getValueType();
40262 EVT BCVT = BC.getValueType();
40263
40264 // If broadcasting from another shuffle, attempt to simplify it.
40265 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
40266 if (isTargetShuffle(BC.getOpcode()) &&
40267 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
40268 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
40269 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
40270 SM_SentinelUndef);
40271 for (unsigned i = 0; i != Scale; ++i)
40272 DemandedMask[i] = i;
40273 if (SDValue Res = combineX86ShufflesRecursively(
40274 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
40275 X86::MaxShuffleCombineDepth,
40276 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
40277 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
40278 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40279 DAG.getBitcast(SrcVT, Res));
40280 }
40281
40282 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
40283 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
40284 if (Src.getOpcode() == ISD::BITCAST &&
40285 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
40286 TLI.isTypeLegal(BCVT) &&
40287 FixedVectorType::isValidElementType(
40288 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
40289 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
40290 VT.getVectorNumElements());
40291 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40292 }
40293
40294 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
40295 // If we're re-broadcasting a smaller type then broadcast with that type and
40296 // bitcast.
40297 // TODO: Do this for any splat?
40298 if (Src.getOpcode() == ISD::BITCAST &&
40299 (BC.getOpcode() == X86ISD::VBROADCAST ||
40300 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
40301 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
40302 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
40303 MVT NewVT =
40304 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
40305 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
40306 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40307 }
40308
40309 // Reduce broadcast source vector to lowest 128-bits.
40310 if (SrcVT.getSizeInBits() > 128)
40311 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40312 extract128BitVector(Src, 0, DAG, DL));
40313
40314 // broadcast(scalar_to_vector(x)) -> broadcast(x).
40315 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40316 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
40317 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40318
40319 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
40320 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
40321 isNullConstant(Src.getOperand(1)) &&
40322 Src.getValueType() ==
40323 Src.getOperand(0).getValueType().getScalarType() &&
40324 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
40325 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40326
40327 // Share broadcast with the longest vector and extract low subvector (free).
40328 // Ensure the same SDValue from the SDNode use is being used.
40329 for (SDNode *User : Src->uses())
40330 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
40331 Src == User->getOperand(0) &&
40332 User->getValueSizeInBits(0).getFixedValue() >
40333 VT.getFixedSizeInBits()) {
40334 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
40335 VT.getSizeInBits());
40336 }
40337
40338 // vbroadcast(scalarload X) -> vbroadcast_load X
40339 // For float loads, extract other uses of the scalar from the broadcast.
40340 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
40341 ISD::isNormalLoad(Src.getNode())) {
40342 LoadSDNode *LN = cast<LoadSDNode>(Src);
40343 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40344 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40345 SDValue BcastLd =
40346 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40347 LN->getMemoryVT(), LN->getMemOperand());
40348 // If the load value is used only by N, replace it via CombineTo N.
40349 bool NoReplaceExtract = Src.hasOneUse();
40350 DCI.CombineTo(N.getNode(), BcastLd);
40351 if (NoReplaceExtract) {
40352 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40353 DCI.recursivelyDeleteUnusedNodes(LN);
40354 } else {
40355 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
40356 DAG.getIntPtrConstant(0, DL));
40357 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
40358 }
40359 return N; // Return N so it doesn't get rechecked!
40360 }
40361
40362 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
40363 // i16. So shrink it ourselves if we can make a broadcast_load.
40364 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
40365 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
40366 assert(Subtarget.hasAVX2() && "Expected AVX2");
40367 SDValue TruncIn = Src.getOperand(0);
40368
40369 // If this is a truncate of a non extending load we can just narrow it to
40370 // use a broadcast_load.
40371 if (ISD::isNormalLoad(TruncIn.getNode())) {
40372 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
40373 // Unless it's volatile or atomic.
40374 if (LN->isSimple()) {
40375 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40376 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40377 SDValue BcastLd = DAG.getMemIntrinsicNode(
40378 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
40379 LN->getPointerInfo(), LN->getOriginalAlign(),
40380 LN->getMemOperand()->getFlags());
40381 DCI.CombineTo(N.getNode(), BcastLd);
40382 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40383 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40384 return N; // Return N so it doesn't get rechecked!
40385 }
40386 }
40387
40388 // If this is a truncate of an i16 extload, we can directly replace it.
40389 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
40390 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
40391 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
40392 if (LN->getMemoryVT().getSizeInBits() == 16) {
40393 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40394 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40395 SDValue BcastLd =
40396 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40397 LN->getMemoryVT(), LN->getMemOperand());
40398 DCI.CombineTo(N.getNode(), BcastLd);
40399 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40400 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40401 return N; // Return N so it doesn't get rechecked!
40402 }
40403 }
40404
40405 // If this is a truncate of load that has been shifted right, we can
40406 // offset the pointer and use a narrower load.
40407 if (TruncIn.getOpcode() == ISD::SRL &&
40408 TruncIn.getOperand(0).hasOneUse() &&
40409 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
40410 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
40411 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
40412 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
40413 // Make sure the shift amount and the load size are divisible by 16.
40414 // Don't do this if the load is volatile or atomic.
40415 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
40416 LN->isSimple()) {
40417 unsigned Offset = ShiftAmt / 8;
40418 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40419 SDValue Ptr = DAG.getMemBasePlusOffset(
40420 LN->getBasePtr(), TypeSize::getFixed(Offset), DL);
40421 SDValue Ops[] = { LN->getChain(), Ptr };
40422 SDValue BcastLd = DAG.getMemIntrinsicNode(
40423 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
40424 LN->getPointerInfo().getWithOffset(Offset),
40425 LN->getOriginalAlign(),
40426 LN->getMemOperand()->getFlags());
40427 DCI.CombineTo(N.getNode(), BcastLd);
40428 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40429 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40430 return N; // Return N so it doesn't get rechecked!
40431 }
40432 }
40433 }
40434
40435 // vbroadcast(vzload X) -> vbroadcast_load X
40436 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
40437 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
40438 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
40439 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40440 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40441 SDValue BcastLd =
40442 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40443 LN->getMemoryVT(), LN->getMemOperand());
40444 DCI.CombineTo(N.getNode(), BcastLd);
40445 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40446 DCI.recursivelyDeleteUnusedNodes(LN);
40447 return N; // Return N so it doesn't get rechecked!
40448 }
40449 }
40450
40451 // vbroadcast(vector load X) -> vbroadcast_load
40452 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
40453 SrcVT == MVT::v4i32) &&
40454 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
40455 LoadSDNode *LN = cast<LoadSDNode>(Src);
40456 // Unless the load is volatile or atomic.
40457 if (LN->isSimple()) {
40458 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40459 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
40460 SDValue BcastLd = DAG.getMemIntrinsicNode(
40461 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
40462 LN->getPointerInfo(), LN->getOriginalAlign(),
40463 LN->getMemOperand()->getFlags());
40464 DCI.CombineTo(N.getNode(), BcastLd);
40465 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40466 DCI.recursivelyDeleteUnusedNodes(LN);
40467 return N; // Return N so it doesn't get rechecked!
40468 }
40469 }
40470
40471 return SDValue();
40472 }
40473 case X86ISD::VZEXT_MOVL: {
40474 SDValue N0 = N.getOperand(0);
40475
40476 // If this a vzmovl of a full vector load, replace it with a vzload, unless
40477 // the load is volatile.
40478 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
40479 auto *LN = cast<LoadSDNode>(N0);
40480 if (SDValue VZLoad =
40481 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
40482 DCI.CombineTo(N.getNode(), VZLoad);
40483 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40484 DCI.recursivelyDeleteUnusedNodes(LN);
40485 return N;
40486 }
40487 }
40488
40489 // If this a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
40490 // and can just use a VZEXT_LOAD.
40491 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
40492 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
40493 auto *LN = cast<MemSDNode>(N0);
40494 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
40495 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40496 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
40497 SDValue VZLoad =
40498 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
40499 LN->getMemoryVT(), LN->getMemOperand());
40500 DCI.CombineTo(N.getNode(), VZLoad);
40501 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40502 DCI.recursivelyDeleteUnusedNodes(LN);
40503 return N;
40504 }
40505 }
40506
40507 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
40508 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
40509 // if the upper bits of the i64 are zero.
40510 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40511 N0.getOperand(0).hasOneUse() &&
40512 N0.getOperand(0).getValueType() == MVT::i64) {
40513 SDValue In = N0.getOperand(0);
40514 APInt Mask = APInt::getHighBitsSet(64, 32);
40515 if (DAG.MaskedValueIsZero(In, Mask)) {
40516 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
40517 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
40518 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
40519 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
40520 return DAG.getBitcast(VT, Movl);
40521 }
40522 }
40523
40524 // Load a scalar integer constant directly to XMM instead of transferring an
40525 // immediate value from GPR.
40526 // vzext_movl (scalar_to_vector C) --> load [C,0...]
40527 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40528 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
40529 // Create a vector constant - scalar constant followed by zeros.
40530 EVT ScalarVT = N0.getOperand(0).getValueType();
40531 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
40532 unsigned NumElts = VT.getVectorNumElements();
40533 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
40534 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
40535 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
40536
40537 // Load the vector constant from constant pool.
40538 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
40539 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
40540 MachinePointerInfo MPI =
40541 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
40542 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
40543 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
40544 MachineMemOperand::MOLoad);
40545 }
40546 }
40547
40548 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
40549 // insert into a zero vector. This helps get VZEXT_MOVL closer to
40550 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
40551 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
40552 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
40553 SDValue V = peekThroughOneUseBitcasts(N0);
40554
40555 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
40556 isNullConstant(V.getOperand(2))) {
40557 SDValue In = V.getOperand(1);
40558 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
40559 In.getValueSizeInBits() /
40560 VT.getScalarSizeInBits());
40561 In = DAG.getBitcast(SubVT, In);
40562 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
40563 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
40564 getZeroVector(VT, Subtarget, DAG, DL), Movl,
40565 V.getOperand(2));
40566 }
40567 }
40568
40569 return SDValue();
40570 }
40571 case X86ISD::BLENDI: {
40572 SDValue N0 = N.getOperand(0);
40573 SDValue N1 = N.getOperand(1);
40574
40575 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
40576 // TODO: Handle MVT::v16i16 repeated blend mask.
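    // For example (illustrative): a v4i64 BLENDI with mask 0b0011 whose
    // operands are bitcasts from v8i32 becomes a v8i32 BLENDI with mask
    // 0b00001111, bitcast back to v4i64.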
40577 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
40578 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
40579 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
40580 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
40581 SrcVT.getScalarSizeInBits() >= 32) {
40582 unsigned Size = VT.getVectorNumElements();
40583 unsigned NewSize = SrcVT.getVectorNumElements();
40584 APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size);
40585 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
40586 return DAG.getBitcast(
40587 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
40588 N1.getOperand(0),
40589 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
40590 DL, MVT::i8)));
40591 }
40592 }
40593 return SDValue();
40594 }
40595 case X86ISD::SHUFP: {
40596 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
40597 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
40598 // TODO: Support types other than v4f32.
40599 if (VT == MVT::v4f32) {
40600 bool Updated = false;
40601 SmallVector<int> Mask;
40602 SmallVector<SDValue> Ops;
40603 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
40604 for (int i = 0; i != 2; ++i) {
40605 SmallVector<SDValue> SubOps;
40606 SmallVector<int> SubMask, SubScaledMask;
40607 SDValue Sub = peekThroughBitcasts(Ops[i]);
40608 // TODO: Scaling might be easier if we specify the demanded elts.
40609 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
40610 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
40611 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
40612 int Ofs = i * 2;
40613 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
40614 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
40615 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
40616 Updated = true;
40617 }
40618 }
40619 }
40620 if (Updated) {
40621 for (int &M : Mask)
40622 M %= 4;
40623 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
40624 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
40625 }
40626 }
40627 return SDValue();
40628 }
40629 case X86ISD::VPERMI: {
40630 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
40631 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
40632 SDValue N0 = N.getOperand(0);
40633 SDValue N1 = N.getOperand(1);
40634 unsigned EltSizeInBits = VT.getScalarSizeInBits();
40635 if (N0.getOpcode() == ISD::BITCAST &&
40636 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
40637 SDValue Src = N0.getOperand(0);
40638 EVT SrcVT = Src.getValueType();
40639 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
40640 return DAG.getBitcast(VT, Res);
40641 }
40642 return SDValue();
40643 }
40644 case X86ISD::SHUF128: {
40645 // If we're permuting the upper 256-bit subvectors of a concatenation, then
40646 // see if we can peek through and access the subvector directly.
40647 if (VT.is512BitVector()) {
40648 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the
40649 // upper subvector is used.
40650 SDValue LHS = N->getOperand(0);
40651 SDValue RHS = N->getOperand(1);
40652 uint64_t Mask = N->getConstantOperandVal(2);
40653 SmallVector<SDValue> LHSOps, RHSOps;
40654 SDValue NewLHS, NewRHS;
40655 if ((Mask & 0x0A) == 0x0A &&
40656 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
40657 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
40658 Mask &= ~0x0A;
40659 }
40660 if ((Mask & 0xA0) == 0xA0 &&
40661 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
40662 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
40663 Mask &= ~0xA0;
40664 }
40665 if (NewLHS || NewRHS)
40666 return DAG.getNode(X86ISD::SHUF128, DL, VT, NewLHS ? NewLHS : LHS,
40667 NewRHS ? NewRHS : RHS,
40668 DAG.getTargetConstant(Mask, DL, MVT::i8));
40669 }
40670 return SDValue();
40671 }
40672 case X86ISD::VPERM2X128: {
40673 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
40674 SDValue LHS = N->getOperand(0);
40675 SDValue RHS = N->getOperand(1);
40676 if (LHS.getOpcode() == ISD::BITCAST &&
40677 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
40678 EVT SrcVT = LHS.getOperand(0).getValueType();
40679 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
40680 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
40681 DAG.getBitcast(SrcVT, LHS),
40682 DAG.getBitcast(SrcVT, RHS),
40683 N->getOperand(2)));
40684 }
40685 }
40686
40687 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
40688 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
40689 return Res;
40690
40691 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
40692 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
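// The vperm2x128 immediate selects one 128-bit half per result half: indices
// 0/1 pick the low/high half of the first source and 2/3 of the second source
// (values above 3, including the zeroing bit, are rejected by the helper
// below). e.g. imm 0x31 on the two concats above yields concat(Y, W).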
40693 auto FindSubVector128 = [&](unsigned Idx) {
40694 if (Idx > 3)
40695 return SDValue();
40696 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
40697 SmallVector<SDValue> SubOps;
40698 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
40699 return SubOps[Idx & 1];
40700 unsigned NumElts = Src.getValueType().getVectorNumElements();
40701 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
40702 Src.getOperand(1).getValueSizeInBits() == 128 &&
40703 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
40704 return Src.getOperand(1);
40705 }
40706 return SDValue();
40707 };
40708 unsigned Imm = N.getConstantOperandVal(2);
40709 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
40710 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
40711 MVT SubVT = VT.getHalfNumVectorElementsVT();
40712 SubLo = DAG.getBitcast(SubVT, SubLo);
40713 SubHi = DAG.getBitcast(SubVT, SubHi);
40714 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
40715 }
40716 }
40717 return SDValue();
40718 }
40719 case X86ISD::PSHUFD:
40720 case X86ISD::PSHUFLW:
40721 case X86ISD::PSHUFHW: {
40722 SDValue N0 = N.getOperand(0);
40723 SDValue N1 = N.getOperand(1);
40724 if (N0->hasOneUse()) {
40725 SDValue V = peekThroughOneUseBitcasts(N0);
40726 switch (V.getOpcode()) {
40727 case X86ISD::VSHL:
40728 case X86ISD::VSRL:
40729 case X86ISD::VSRA:
40730 case X86ISD::VSHLI:
40731 case X86ISD::VSRLI:
40732 case X86ISD::VSRAI:
40733 case X86ISD::VROTLI:
40734 case X86ISD::VROTRI: {
40735 MVT InnerVT = V.getSimpleValueType();
40736 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
40737 SDValue Res = DAG.getNode(Opcode, DL, VT,
40738 DAG.getBitcast(VT, V.getOperand(0)), N1);
40739 Res = DAG.getBitcast(InnerVT, Res);
40740 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
40741 return DAG.getBitcast(VT, Res);
40742 }
40743 break;
40744 }
40745 }
40746 }
40747
40748 Mask = getPSHUFShuffleMask(N);
40749 assert(Mask.size() == 4);
40750 break;
40751 }
40752 case X86ISD::MOVSD:
40753 case X86ISD::MOVSH:
40754 case X86ISD::MOVSS: {
40755 SDValue N0 = N.getOperand(0);
40756 SDValue N1 = N.getOperand(1);
40757
40758 // Canonicalize scalar FPOps:
40759 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
40760 // If commutable, allow OP(N1[0], N0[0]).
40761 unsigned Opcode1 = N1.getOpcode();
40762 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
40763 Opcode1 == ISD::FDIV) {
40764 SDValue N10 = N1.getOperand(0);
40765 SDValue N11 = N1.getOperand(1);
40766 if (N10 == N0 ||
40767 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
40768 if (N10 != N0)
40769 std::swap(N10, N11);
40770 MVT SVT = VT.getVectorElementType();
40771 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
40772 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
40773 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
40774 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
40775 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
40776 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
40777 }
40778 }
40779
40780 return SDValue();
40781 }
40782 case X86ISD::INSERTPS: {
40783 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
40784 SDValue Op0 = N.getOperand(0);
40785 SDValue Op1 = N.getOperand(1);
40786 unsigned InsertPSMask = N.getConstantOperandVal(2);
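// INSERTPS imm8 layout: bits [7:6] select the source element of Op1, bits
// [5:4] select the destination element in Op0, and bits [3:0] zero individual
// result elements.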
40787 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
40788 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
40789 unsigned ZeroMask = InsertPSMask & 0xF;
40790
40791 // If we zero out all elements from Op0 then we don't need to reference it.
40792 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
40793 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
40794 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40795
40796 // If we zero out the element from Op1 then we don't need to reference it.
40797 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
40798 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
40799 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40800
40801 // Attempt to merge insertps Op1 with an inner target shuffle node.
40802 SmallVector<int, 8> TargetMask1;
40803 SmallVector<SDValue, 2> Ops1;
40804 APInt KnownUndef1, KnownZero1;
40805 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
40806 KnownZero1)) {
40807 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
40808 // Zero/UNDEF insertion - zero out element and remove dependency.
40809 InsertPSMask |= (1u << DstIdx);
40810 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
40811 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40812 }
40813 // Update insertps mask srcidx and reference the source input directly.
40814 int M = TargetMask1[SrcIdx];
40815 assert(0 <= M && M < 8 && "Shuffle index out of range");
40816 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
40817 Op1 = Ops1[M < 4 ? 0 : 1];
40818 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
40819 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40820 }
40821
40822 // Attempt to merge insertps Op0 with an inner target shuffle node.
40823 SmallVector<int, 8> TargetMask0;
40824 SmallVector<SDValue, 2> Ops0;
40825 APInt KnownUndef0, KnownZero0;
40826 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
40827 KnownZero0)) {
40828 bool Updated = false;
40829 bool UseInput00 = false;
40830 bool UseInput01 = false;
40831 for (int i = 0; i != 4; ++i) {
40832 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
40833 // No change if element is already zero or the inserted element.
40834 continue;
40835 }
40836
40837 if (KnownUndef0[i] || KnownZero0[i]) {
40838 // If the target mask is undef/zero then we must zero the element.
40839 InsertPSMask |= (1u << i);
40840 Updated = true;
40841 continue;
40842 }
40843
40844 // The input vector element must stay in place (an identity shuffle index).
40845 int M = TargetMask0[i];
40846 if (M != i && M != (i + 4))
40847 return SDValue();
40848
40849 // Determine which inputs of the target shuffle we're using.
40850 UseInput00 |= (0 <= M && M < 4);
40851 UseInput01 |= (4 <= M);
40852 }
40853
40854 // If we're not using both inputs of the target shuffle then use the
40855 // referenced input directly.
40856 if (UseInput00 && !UseInput01) {
40857 Updated = true;
40858 Op0 = Ops0[0];
40859 } else if (!UseInput00 && UseInput01) {
40860 Updated = true;
40861 Op0 = Ops0[1];
40862 }
40863
40864 if (Updated)
40865 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
40866 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40867 }
40868
40869 // If we're inserting an element from a vbroadcast load, fold the
40870 // load into the X86insertps instruction. We need to convert the scalar
40871 // load to a vector and clear the source lane of the INSERTPS control.
40872 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
40873 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
40874 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
40875 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
40876 MemIntr->getBasePtr(),
40877 MemIntr->getMemOperand());
40878 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
40879 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
40880 Load),
40881 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
40882 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
40883 return Insert;
40884 }
40885 }
40886
40887 return SDValue();
40888 }
40889 default:
40890 return SDValue();
40891 }
40892
40893 // Nuke no-op shuffles that show up after combining.
40894 if (isNoopShuffleMask(Mask))
40895 return N.getOperand(0);
40896
40897 // Look for simplifications involving one or two shuffle instructions.
40898 SDValue V = N.getOperand(0);
40899 switch (N.getOpcode()) {
40900 default:
40901 break;
40902 case X86ISD::PSHUFLW:
40903 case X86ISD::PSHUFHW:
40904 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
40905
40906 // See if this reduces to a PSHUFD which is no more expensive and can
40907 // combine with more operations. Note that it has to at least flip the
40908 // dwords as otherwise it would have been removed as a no-op.
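// e.g. PSHUFLW with mask <2,3,0,1> swaps the two low dwords, which the
// equivalent PSHUFD below expresses directly on i32 elements.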
40909 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
40910 int DMask[] = {0, 1, 2, 3};
40911 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
40912 DMask[DOffset + 0] = DOffset + 1;
40913 DMask[DOffset + 1] = DOffset + 0;
40914 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
40915 V = DAG.getBitcast(DVT, V);
40916 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
40917 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
40918 return DAG.getBitcast(VT, V);
40919 }
40920
40921 // Look for shuffle patterns which can be implemented as a single unpack.
40922 // FIXME: This doesn't handle the location of the PSHUFD generically, and
40923 // only works when we have a PSHUFD followed by two half-shuffles.
40924 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
40925 (V.getOpcode() == X86ISD::PSHUFLW ||
40926 V.getOpcode() == X86ISD::PSHUFHW) &&
40927 V.getOpcode() != N.getOpcode() &&
40928 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
40929 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
40930 if (D.getOpcode() == X86ISD::PSHUFD) {
40931 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
40932 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
40933 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
40934 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
40935 int WordMask[8];
40936 for (int i = 0; i < 4; ++i) {
40937 WordMask[i + NOffset] = Mask[i] + NOffset;
40938 WordMask[i + VOffset] = VMask[i] + VOffset;
40939 }
40940 // Map the word mask through the DWord mask.
40941 int MappedMask[8];
40942 for (int i = 0; i < 8; ++i)
40943 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
40944 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
40945 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
40946 // We can replace all three shuffles with an unpack.
40947 V = DAG.getBitcast(VT, D.getOperand(0));
40948 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
40949 : X86ISD::UNPCKH,
40950 DL, VT, V, V);
40951 }
40952 }
40953 }
40954
40955 break;
40956
40957 case X86ISD::PSHUFD:
40958 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
40959 return NewN;
40960
40961 break;
40962 }
40963
40964 return SDValue();
40965}
40966
40967/// Checks if the shuffle mask takes subsequent elements
40968/// alternately from two vectors.
40969/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
40970static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
40971
40972 int ParitySrc[2] = {-1, -1};
40973 unsigned Size = Mask.size();
40974 for (unsigned i = 0; i != Size; ++i) {
40975 int M = Mask[i];
40976 if (M < 0)
40977 continue;
40978
40979 // Make sure we are using the matching element from the input.
40980 if ((M % Size) != i)
40981 return false;
40982
40983 // Make sure we use the same input for all elements of the same parity.
40984 int Src = M / Size;
40985 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
40986 return false;
40987 ParitySrc[i % 2] = Src;
40988 }
40989
40990 // Make sure each input is used.
40991 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
40992 return false;
40993
40994 Op0Even = ParitySrc[0] == 0;
40995 return true;
40996}
40997
40998 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
40999 /// operation. If true is returned then the operands of the ADDSUB(SUBADD) operation
41000 /// are written to the parameters \p Opnd0 and \p Opnd1.
41001 ///
41002 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle
41003 /// nodes so it is easier to generically match. We also insert dummy vector shuffle
41004 /// nodes for the operands which explicitly discard the lanes which are unused
41005 /// by this operation, so that the fact that they're unused flows through the
41006 /// rest of the combiner.
41007static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
41008 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
41009 bool &IsSubAdd) {
41010
41011 EVT VT = N->getValueType(0);
41012 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41013 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
41014 !VT.getSimpleVT().isFloatingPoint())
41015 return false;
41016
41017 // We only handle target-independent shuffles.
41018 // FIXME: It would be easy and harmless to use the target shuffle mask
41019 // extraction tool to support more.
41020 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41021 return false;
41022
41023 SDValue V1 = N->getOperand(0);
41024 SDValue V2 = N->getOperand(1);
41025
41026 // Make sure we have an FADD and an FSUB.
41027 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
41028 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
41029 V1.getOpcode() == V2.getOpcode())
41030 return false;
41031
41032 // If there are other uses of these operations we can't fold them.
41033 if (!V1->hasOneUse() || !V2->hasOneUse())
41034 return false;
41035
41036 // Ensure that both operations have the same operands. Note that we can
41037 // commute the FADD operands.
41038 SDValue LHS, RHS;
41039 if (V1.getOpcode() == ISD::FSUB) {
41040 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
41041 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
41042 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
41043 return false;
41044 } else {
41045 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
41046 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
41047 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
41048 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
41049 return false;
41050 }
41051
41052 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41053 bool Op0Even;
41054 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41055 return false;
41056
41057 // It's a subadd if the vector in the even parity is an FADD.
41058 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
41059 : V2->getOpcode() == ISD::FADD;
41060
41061 Opnd0 = LHS;
41062 Opnd1 = RHS;
41063 return true;
41064}
41065
41066/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
41067 static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
41068 const X86Subtarget &Subtarget,
41069 SelectionDAG &DAG) {
41070 // We only handle target-independent shuffles.
41071 // FIXME: It would be easy and harmless to use the target shuffle mask
41072 // extraction tool to support more.
41073 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41074 return SDValue();
41075
41076 MVT VT = N->getSimpleValueType(0);
41077 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41078 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
41079 return SDValue();
41080
41081 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
41082 SDValue Op0 = N->getOperand(0);
41083 SDValue Op1 = N->getOperand(1);
41084 SDValue FMAdd = Op0, FMSub = Op1;
41085 if (FMSub.getOpcode() != X86ISD::FMSUB)
41086 std::swap(FMAdd, FMSub);
41087
41088 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
41089 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
41090 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
41091 FMAdd.getOperand(2) != FMSub.getOperand(2))
41092 return SDValue();
41093
41094 // Check for correct shuffle mask.
41095 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41096 bool Op0Even;
41097 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41098 return SDValue();
41099
41100 // FMAddSub takes zeroth operand from FMSub node.
41101 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
41102 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41103 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
41104 FMAdd.getOperand(2));
41105}
41106
41107/// Try to combine a shuffle into a target-specific add-sub or
41108/// mul-add-sub node.
41109 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
41110 const X86Subtarget &Subtarget,
41111 SelectionDAG &DAG) {
41112 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
41113 return V;
41114
41115 SDValue Opnd0, Opnd1;
41116 bool IsSubAdd;
41117 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
41118 return SDValue();
41119
41120 MVT VT = N->getSimpleValueType(0);
41121
41122 // Try to generate X86ISD::FMADDSUB node here.
41123 SDValue Opnd2;
41124 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
41125 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41126 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
41127 }
41128
41129 if (IsSubAdd)
41130 return SDValue();
41131
41132 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
41133 // the ADDSUB idiom has been successfully recognized. There are no known
41134 // X86 targets with 512-bit ADDSUB instructions!
41135 if (VT.is512BitVector())
41136 return SDValue();
41137
41138 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
41139 // the ADDSUB idiom has been successfully recognized. There are no known
41140 // X86 targets with FP16 ADDSUB instructions!
41141 if (VT.getVectorElementType() == MVT::f16)
41142 return SDValue();
41143
41144 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
41145}
41146
41147 // We are looking for a shuffle where both sources are a half-width vector
41148 // concatenated with undef. AVX2 has VPERMD/Q, so
41149// if we can express this as a single-source shuffle, that's preferable.
41150 static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL,
41151 SelectionDAG &DAG,
41152 const X86Subtarget &Subtarget) {
41153 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
41154 return SDValue();
41155
41156 EVT VT = N->getValueType(0);
41157
41158 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
41159 if (!VT.is128BitVector() && !VT.is256BitVector())
41160 return SDValue();
41161
41162 if (VT.getVectorElementType() != MVT::i32 &&
41163 VT.getVectorElementType() != MVT::i64 &&
41164 VT.getVectorElementType() != MVT::f32 &&
41165 VT.getVectorElementType() != MVT::f64)
41166 return SDValue();
41167
41168 SDValue N0 = N->getOperand(0);
41169 SDValue N1 = N->getOperand(1);
41170
41171 // Check that both sources are concats with undef.
41172 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
41173 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
41174 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
41175 !N1.getOperand(1).isUndef())
41176 return SDValue();
41177
41178 // Construct the new shuffle mask. Elements from the first source retain their
41179 // index, but elements from the second source no longer need to skip an undef.
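// e.g. for v8i32, an element taken from index 9 (second source, element 1)
// becomes index 5 once both sources live in the single concat below.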
41180 SmallVector<int, 8> Mask;
41181 int NumElts = VT.getVectorNumElements();
41182
41183 auto *SVOp = cast<ShuffleVectorSDNode>(N);
41184 for (int Elt : SVOp->getMask())
41185 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
41186
41187 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
41188 N1.getOperand(0));
41189 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
41190}
41191
41192/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
41193/// low half of each source vector and does not set any high half elements in
41194/// the destination vector, narrow the shuffle to half its original size.
41195 static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
41196 EVT VT = Shuf->getValueType(0);
41197 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
41198 return SDValue();
41199 if (!VT.is256BitVector() && !VT.is512BitVector())
41200 return SDValue();
41201
41202 // See if we can ignore all of the high elements of the shuffle.
41203 ArrayRef<int> Mask = Shuf->getMask();
41204 if (!isUndefUpperHalf(Mask))
41205 return SDValue();
41206
41207 // Check if the shuffle mask accesses only the low half of each input vector
41208 // (half-index output is 0 or 2).
41209 int HalfIdx1, HalfIdx2;
41210 SmallVector<int, 8> HalfMask(Mask.size() / 2);
41211 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
41212 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
41213 return SDValue();
41214
41215 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
41216 // The trick is knowing that all of the insert/extract are actually free
41217 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
41218 // of narrow inputs into a narrow output, and that is always cheaper than
41219 // the wide shuffle that we started with.
41220 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
41221 Shuf->getOperand(1), HalfMask, HalfIdx1,
41222 HalfIdx2, false, DAG, /*UseConcat*/ true);
41223}
41224
41225 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
41226 TargetLowering::DAGCombinerInfo &DCI,
41227 const X86Subtarget &Subtarget) {
41228 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
41229 if (SDValue V = narrowShuffle(Shuf, DAG))
41230 return V;
41231
41232 // If we have legalized the vector types, look for blends of FADD and FSUB
41233 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
41234 SDLoc dl(N);
41235 EVT VT = N->getValueType(0);
41236 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41237 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
41238 if (SDValue AddSub =
41239 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
41240 return AddSub;
41241
41242 // Attempt to combine into a vector load/broadcast.
41243 if (SDValue LD = combineToConsecutiveLoads(
41244 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
41245 return LD;
41246
41247 // For AVX2, we sometimes want to combine
41248 // (vector_shuffle <mask> (concat_vectors t1, undef)
41249 // (concat_vectors t2, undef))
41250 // Into:
41251 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
41252 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
41253 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget))
41254 return ShufConcat;
41255
41256 if (isTargetShuffle(N->getOpcode())) {
41257 SDValue Op(N, 0);
41258 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
41259 return Shuffle;
41260
41261 // Try recursively combining arbitrary sequences of x86 shuffle
41262 // instructions into higher-order shuffles. We do this after combining
41263 // specific PSHUF instruction sequences into their minimal form so that we
41264 // can evaluate how many specialized shuffle instructions are involved in
41265 // a particular chain.
41266 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
41267 return Res;
41268
41269 // Simplify source operands based on shuffle mask.
41270 // TODO - merge this into combineX86ShufflesRecursively.
41271 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
41272 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
41273 return SDValue(N, 0);
41274
41275 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41276 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41277 // Perform this after other shuffle combines to allow inner shuffles to be
41278 // combined away first.
41279 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
41280 return BinOp;
41281 }
41282
41283 return SDValue();
41284}
41285
41286// Simplify variable target shuffle masks based on the demanded elements.
41287// TODO: Handle DemandedBits in mask indices as well?
41288 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
41289 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
41290 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
41291 // If we're demanding all elements don't bother trying to simplify the mask.
41292 unsigned NumElts = DemandedElts.getBitWidth();
41293 if (DemandedElts.isAllOnes())
41294 return false;
41295
41296 SDValue Mask = Op.getOperand(MaskIndex);
41297 if (!Mask.hasOneUse())
41298 return false;
41299
41300 // Attempt to generically simplify the variable shuffle mask.
41301 APInt MaskUndef, MaskZero;
41302 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
41303 Depth + 1))
41304 return true;
41305
41306 // Attempt to extract+simplify a (constant pool load) shuffle mask.
41307 // TODO: Support other types from getTargetShuffleMaskIndices?
41308 SDValue BC = peekThroughOneUseBitcasts(Mask);
41309 EVT BCVT = BC.getValueType();
41310 auto *Load = dyn_cast<LoadSDNode>(BC);
41311 if (!Load || !Load->getBasePtr().hasOneUse())
41312 return false;
41313
41314 const Constant *C = getTargetConstantFromNode(Load);
41315 if (!C)
41316 return false;
41317
41318 Type *CTy = C->getType();
41319 if (!CTy->isVectorTy() ||
41320 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
41321 return false;
41322
41323 // Handle scaling for i64 elements on 32-bit targets.
41324 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
41325 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
41326 return false;
41327 unsigned Scale = NumCstElts / NumElts;
41328
41329 // Simplify mask if we have an undemanded element that is not undef.
41330 bool Simplified = false;
41331 SmallVector<Constant *, 32> ConstVecOps;
41332 for (unsigned i = 0; i != NumCstElts; ++i) {
41333 Constant *Elt = C->getAggregateElement(i);
41334 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
41335 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
41336 Simplified = true;
41337 continue;
41338 }
41339 ConstVecOps.push_back(Elt);
41340 }
41341 if (!Simplified)
41342 return false;
41343
41344 // Generate new constant pool entry + legalize immediately for the load.
41345 SDLoc DL(Op);
41346 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
41347 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
41348 SDValue NewMask = TLO.DAG.getLoad(
41349 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
41350 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
41351 Load->getAlign());
41352 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
41353}
41354
41355 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
41356 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
41357 TargetLoweringOpt &TLO, unsigned Depth) const {
41358 int NumElts = DemandedElts.getBitWidth();
41359 unsigned Opc = Op.getOpcode();
41360 EVT VT = Op.getValueType();
41361
41362 // Handle special case opcodes.
41363 switch (Opc) {
41364 case X86ISD::PMULDQ:
41365 case X86ISD::PMULUDQ: {
41366 APInt LHSUndef, LHSZero;
41367 APInt RHSUndef, RHSZero;
41368 SDValue LHS = Op.getOperand(0);
41369 SDValue RHS = Op.getOperand(1);
41370 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41371 Depth + 1))
41372 return true;
41373 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41374 Depth + 1))
41375 return true;
41376 // Multiply by zero.
41377 KnownZero = LHSZero | RHSZero;
41378 break;
41379 }
41380 case X86ISD::VPMADDWD: {
41381 APInt LHSUndef, LHSZero;
41382 APInt RHSUndef, RHSZero;
41383 SDValue LHS = Op.getOperand(0);
41384 SDValue RHS = Op.getOperand(1);
41385 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
41386
41387 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
41388 Depth + 1))
41389 return true;
41390 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
41391 Depth + 1))
41392 return true;
41393
41394 // TODO: Multiply by zero.
41395
41396 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
41397 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
41398 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
41399 Depth + 1))
41400 return true;
41401 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
41402 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
41403 Depth + 1))
41404 return true;
41405 break;
41406 }
41407 case X86ISD::PSADBW: {
41408 SDValue LHS = Op.getOperand(0);
41409 SDValue RHS = Op.getOperand(1);
41410 assert(VT.getScalarType() == MVT::i64 &&
41411 LHS.getValueType() == RHS.getValueType() &&
41412 LHS.getValueType().getScalarType() == MVT::i8 &&
41413 "Unexpected PSADBW types");
41414
41415 // Aggressively peek through ops to get at the demanded elts.
41416 if (!DemandedElts.isAllOnes()) {
41417 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
41418 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
41419 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
41420 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41421 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
41422 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41423 if (NewLHS || NewRHS) {
41424 NewLHS = NewLHS ? NewLHS : LHS;
41425 NewRHS = NewRHS ? NewRHS : RHS;
41426 return TLO.CombineTo(
41427 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
41428 }
41429 }
41430 break;
41431 }
41432 case X86ISD::VSHL:
41433 case X86ISD::VSRL:
41434 case X86ISD::VSRA: {
41435 // We only need the bottom 64-bits of the (128-bit) shift amount.
41436 SDValue Amt = Op.getOperand(1);
41437 MVT AmtVT = Amt.getSimpleValueType();
41438 assert(AmtVT.is128BitVector() && "Unexpected value type");
41439
41440 // If we reuse the shift amount just for sse shift amounts then we know that
41441 // only the bottom 64-bits are ever used.
41442 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
41443 unsigned UseOpc = Use->getOpcode();
41444 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
41445 UseOpc == X86ISD::VSRA) &&
41446 Use->getOperand(0) != Amt;
41447 });
41448
41449 APInt AmtUndef, AmtZero;
41450 unsigned NumAmtElts = AmtVT.getVectorNumElements();
41451 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
41452 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
41453 Depth + 1, AssumeSingleUse))
41454 return true;
41455 [[fallthrough]];
41456 }
41457 case X86ISD::VSHLI:
41458 case X86ISD::VSRLI:
41459 case X86ISD::VSRAI: {
41460 SDValue Src = Op.getOperand(0);
41461 APInt SrcUndef;
41462 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
41463 Depth + 1))
41464 return true;
41465
41466 // Fold shift(0,x) -> 0
41467 if (DemandedElts.isSubsetOf(KnownZero))
41468 return TLO.CombineTo(
41469 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41470
41471 // Aggressively peek through ops to get at the demanded elts.
41472 if (!DemandedElts.isAllOnes())
41473 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
41474 Src, DemandedElts, TLO.DAG, Depth + 1))
41475 return TLO.CombineTo(
41476 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
41477 break;
41478 }
41479 case X86ISD::VPSHA:
41480 case X86ISD::VPSHL:
41481 case X86ISD::VSHLV:
41482 case X86ISD::VSRLV:
41483 case X86ISD::VSRAV: {
41484 APInt LHSUndef, LHSZero;
41485 APInt RHSUndef, RHSZero;
41486 SDValue LHS = Op.getOperand(0);
41487 SDValue RHS = Op.getOperand(1);
41488 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41489 Depth + 1))
41490 return true;
41491
41492 // Fold shift(0,x) -> 0
41493 if (DemandedElts.isSubsetOf(LHSZero))
41494 return TLO.CombineTo(
41495 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41496
41497 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41498 Depth + 1))
41499 return true;
41500
41501 KnownZero = LHSZero;
41502 break;
41503 }
41504 case X86ISD::PCMPEQ:
41505 case X86ISD::PCMPGT: {
41506 APInt LHSUndef, LHSZero;
41507 APInt RHSUndef, RHSZero;
41508 SDValue LHS = Op.getOperand(0);
41509 SDValue RHS = Op.getOperand(1);
41510 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41511 Depth + 1))
41512 return true;
41513 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41514 Depth + 1))
41515 return true;
41516 break;
41517 }
41518 case X86ISD::KSHIFTL: {
41519 SDValue Src = Op.getOperand(0);
41520 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
41521 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
41522 unsigned ShiftAmt = Amt->getZExtValue();
41523
41524 if (ShiftAmt == 0)
41525 return TLO.CombineTo(Op, Src);
41526
41527 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
41528 // single shift. We can do this if the bottom bits (which are shifted
41529 // out) are never demanded.
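// e.g. kshiftl (kshiftr X, 3), 5 --> kshiftl X, 2 when the low 5 result
// elements are not demanded.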
41530 if (Src.getOpcode() == X86ISD::KSHIFTR) {
41531 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
41532 unsigned C1 = Src.getConstantOperandVal(1);
41533 unsigned NewOpc = X86ISD::KSHIFTL;
41534 int Diff = ShiftAmt - C1;
41535 if (Diff < 0) {
41536 Diff = -Diff;
41537 NewOpc = X86ISD::KSHIFTR;
41538 }
41539
41540 SDLoc dl(Op);
41541 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
41542 return TLO.CombineTo(
41543 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
41544 }
41545 }
41546
41547 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
41548 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
41549 Depth + 1))
41550 return true;
41551
41552 KnownUndef <<= ShiftAmt;
41553 KnownZero <<= ShiftAmt;
41554 KnownZero.setLowBits(ShiftAmt);
41555 break;
41556 }
41557 case X86ISD::KSHIFTR: {
41558 SDValue Src = Op.getOperand(0);
41559 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
41560 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
41561 unsigned ShiftAmt = Amt->getZExtValue();
41562
41563 if (ShiftAmt == 0)
41564 return TLO.CombineTo(Op, Src);
41565
41566 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
41567 // single shift. We can do this if the top bits (which are shifted
41568 // out) are never demanded.
41569 if (Src.getOpcode() == X86ISD::KSHIFTL) {
41570 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
41571 unsigned C1 = Src.getConstantOperandVal(1);
41572 unsigned NewOpc = X86ISD::KSHIFTR;
41573 int Diff = ShiftAmt - C1;
41574 if (Diff < 0) {
41575 Diff = -Diff;
41576 NewOpc = X86ISD::KSHIFTL;
41577 }
41578
41579 SDLoc dl(Op);
41580 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
41581 return TLO.CombineTo(
41582 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
41583 }
41584 }
41585
41586 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
41587 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
41588 Depth + 1))
41589 return true;
41590
41591 KnownUndef.lshrInPlace(ShiftAmt);
41592 KnownZero.lshrInPlace(ShiftAmt);
41593 KnownZero.setHighBits(ShiftAmt);
41594 break;
41595 }
41596 case X86ISD::ANDNP: {
41597 // ANDNP = (~LHS & RHS);
41598 SDValue LHS = Op.getOperand(0);
41599 SDValue RHS = Op.getOperand(1);
41600
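// Given the constant bits of one operand, work out which bits/elements of
// the other operand are actually needed: since ANDNP = (~LHS & RHS), LHS
// only matters where RHS has set bits, and RHS only matters where LHS has
// clear bits; elements whose result the constant already forces to zero
// aren't demanded at all.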
41601 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
41602 APInt UndefElts;
41603 SmallVector<APInt> EltBits;
41604 int NumElts = VT.getVectorNumElements();
41605 int EltSizeInBits = VT.getScalarSizeInBits();
41606 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
41607 APInt OpElts = DemandedElts;
41608 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41609 EltBits)) {
41610 OpBits.clearAllBits();
41611 OpElts.clearAllBits();
41612 for (int I = 0; I != NumElts; ++I) {
41613 if (!DemandedElts[I])
41614 continue;
41615 if (UndefElts[I]) {
41616 // We can't assume an undef src element gives an undef dst - the
41617 // other src might be zero.
41618 OpBits.setAllBits();
41619 OpElts.setBit(I);
41620 } else if ((Invert && !EltBits[I].isAllOnes()) ||
41621 (!Invert && !EltBits[I].isZero())) {
41622 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
41623 OpElts.setBit(I);
41624 }
41625 }
41626 }
41627 return std::make_pair(OpBits, OpElts);
41628 };
41629 APInt BitsLHS, EltsLHS;
41630 APInt BitsRHS, EltsRHS;
41631 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
41632 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
41633
41634 APInt LHSUndef, LHSZero;
41635 APInt RHSUndef, RHSZero;
41636 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
41637 Depth + 1))
41638 return true;
41639 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
41640 Depth + 1))
41641 return true;
41642
41643 if (!DemandedElts.isAllOnes()) {
41644 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
41645 TLO.DAG, Depth + 1);
41646 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
41647 TLO.DAG, Depth + 1);
41648 if (NewLHS || NewRHS) {
41649 NewLHS = NewLHS ? NewLHS : LHS;
41650 NewRHS = NewRHS ? NewRHS : RHS;
41651 return TLO.CombineTo(
41652 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
41653 }
41654 }
41655 break;
41656 }
41657 case X86ISD::CVTSI2P:
41658 case X86ISD::CVTUI2P:
41659 case X86ISD::CVTPH2PS:
41660 case X86ISD::CVTPS2PH: {
41661 SDValue Src = Op.getOperand(0);
41662 MVT SrcVT = Src.getSimpleValueType();
41663 APInt SrcUndef, SrcZero;
41664 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
41665 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
41666 Depth + 1))
41667 return true;
41668 break;
41669 }
41670 case X86ISD::PACKSS:
41671 case X86ISD::PACKUS: {
41672 SDValue N0 = Op.getOperand(0);
41673 SDValue N1 = Op.getOperand(1);
41674
41675 APInt DemandedLHS, DemandedRHS;
41676 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
41677
41678 APInt LHSUndef, LHSZero;
41679 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
41680 Depth + 1))
41681 return true;
41682 APInt RHSUndef, RHSZero;
41683 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
41684 Depth + 1))
41685 return true;
41686
41687 // TODO - pass on known zero/undef.
41688
41689 // Aggressively peek through ops to get at the demanded elts.
41690 // TODO - we should do this for all target/faux shuffles ops.
41691 if (!DemandedElts.isAllOnes()) {
41692 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
41693 TLO.DAG, Depth + 1);
41694 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
41695 TLO.DAG, Depth + 1);
41696 if (NewN0 || NewN1) {
41697 NewN0 = NewN0 ? NewN0 : N0;
41698 NewN1 = NewN1 ? NewN1 : N1;
41699 return TLO.CombineTo(Op,
41700 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
41701 }
41702 }
41703 break;
41704 }
41705 case X86ISD::HADD:
41706 case X86ISD::HSUB:
41707 case X86ISD::FHADD:
41708 case X86ISD::FHSUB: {
41709 SDValue N0 = Op.getOperand(0);
41710 SDValue N1 = Op.getOperand(1);
41711
41712 APInt DemandedLHS, DemandedRHS;
41713 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
41714
41715 APInt LHSUndef, LHSZero;
41716 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
41717 Depth + 1))
41718 return true;
41719 APInt RHSUndef, RHSZero;
41720 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
41721 Depth + 1))
41722 return true;
41723
41724 // TODO - pass on known zero/undef.
41725
41726 // Aggressively peek through ops to get at the demanded elts.
41727 // TODO: Handle repeated operands.
41728 if (N0 != N1 && !DemandedElts.isAllOnes()) {
41729 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
41730 TLO.DAG, Depth + 1);
41731 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
41732 TLO.DAG, Depth + 1);
41733 if (NewN0 || NewN1) {
41734 NewN0 = NewN0 ? NewN0 : N0;
41735 NewN1 = NewN1 ? NewN1 : N1;
41736 return TLO.CombineTo(Op,
41737 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
41738 }
41739 }
41740 break;
41741 }
41742 case X86ISD::VTRUNC:
41743 case X86ISD::VTRUNCS:
41744 case X86ISD::VTRUNCUS: {
41745 SDValue Src = Op.getOperand(0);
41746 MVT SrcVT = Src.getSimpleValueType();
41747 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
41748 APInt SrcUndef, SrcZero;
41749 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
41750 Depth + 1))
41751 return true;
41752 KnownZero = SrcZero.zextOrTrunc(NumElts);
41753 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
41754 break;
41755 }
41756 case X86ISD::BLENDV: {
41757 APInt SelUndef, SelZero;
41758 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
41759 SelZero, TLO, Depth + 1))
41760 return true;
41761
41762 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
41763 APInt LHSUndef, LHSZero;
41764 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
41765 LHSZero, TLO, Depth + 1))
41766 return true;
41767
41768 APInt RHSUndef, RHSZero;
41769 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
41770 RHSZero, TLO, Depth + 1))
41771 return true;
41772
41773 KnownZero = LHSZero & RHSZero;
41774 KnownUndef = LHSUndef & RHSUndef;
41775 break;
41776 }
41777 case X86ISD::VZEXT_MOVL: {
41778 // If upper demanded elements are already zero then we have nothing to do.
41779 SDValue Src = Op.getOperand(0);
41780 APInt DemandedUpperElts = DemandedElts;
41781 DemandedUpperElts.clearLowBits(1);
41782 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
41783 return TLO.CombineTo(Op, Src);
41784 break;
41785 }
41786 case X86ISD::VZEXT_LOAD: {
41787 // If the upper elements are not demanded then simplify to a
41788 // scalar_to_vector(load()).
41789 MVT SVT = VT.getSimpleVT().getVectorElementType();
41790 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
41791 SDLoc DL(Op);
41792 auto *Mem = cast<MemSDNode>(Op);
41793 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
41794 Mem->getMemOperand());
41795 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
41796 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
41797 }
41798 break;
41799 }
41800 case X86ISD::VBROADCAST: {
41801 SDValue Src = Op.getOperand(0);
41802 MVT SrcVT = Src.getSimpleValueType();
41803 if (!SrcVT.isVector())
41804 break;
41805 // Don't bother broadcasting if we just need the 0'th element.
41806 if (DemandedElts == 1) {
41807 if (Src.getValueType() != VT)
41808 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
41809 SDLoc(Op));
41810 return TLO.CombineTo(Op, Src);
41811 }
41812 APInt SrcUndef, SrcZero;
41813 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
41814 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
41815 Depth + 1))
41816 return true;
41817 // Aggressively peek through src to get at the demanded elt.
41818 // TODO - we should do this for all target/faux shuffles ops.
41819 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
41820 Src, SrcElts, TLO.DAG, Depth + 1))
41821 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41822 break;
41823 }
41824 case X86ISD::VPERMV:
41825 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
41826 Depth))
41827 return true;
41828 break;
41829 case X86ISD::PSHUFB:
41830 case X86ISD::VPERMV3:
41831 case X86ISD::VPERMILPV:
41832 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
41833 Depth))
41834 return true;
41835 break;
41836 case X86ISD::VPPERM:
41837 case X86ISD::VPERMIL2:
41838 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
41839 Depth))
41840 return true;
41841 break;
41842 }
41843
41844 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
41845 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
41846 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
41847 if ((VT.is256BitVector() || VT.is512BitVector()) &&
41848 DemandedElts.lshr(NumElts / 2) == 0) {
41849 unsigned SizeInBits = VT.getSizeInBits();
41850 unsigned ExtSizeInBits = SizeInBits / 2;
41851
41852 // See if 512-bit ops only use the bottom 128-bits.
41853 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
41854 ExtSizeInBits = SizeInBits / 4;
41855
41856 switch (Opc) {
41857 // Scalar broadcast.
41858 case X86ISD::VBROADCAST: {
41859 SDLoc DL(Op);
41860 SDValue Src = Op.getOperand(0);
41861 if (Src.getValueSizeInBits() > ExtSizeInBits)
41862 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
41863 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
41864 ExtSizeInBits / VT.getScalarSizeInBits());
41865 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
41866 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
41867 TLO.DAG, DL, ExtSizeInBits));
41868 }
41869 case X86ISD::VBROADCAST_LOAD: {
41870 SDLoc DL(Op);
41871 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
41872 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
41873 ExtSizeInBits / VT.getScalarSizeInBits());
41874 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
41875 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
41876 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
41877 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
41878 MemIntr->getMemOperand());
41879 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
41880 Bcst.getValue(1));
41881 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
41882 TLO.DAG, DL, ExtSizeInBits));
41883 }
41884 // Subvector broadcast.
41885 case X86ISD::SUBV_BROADCAST_LOAD: {
41886 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
41887 EVT MemVT = MemIntr->getMemoryVT();
41888 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
41889 SDLoc DL(Op);
41890 SDValue Ld =
41891 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
41892 MemIntr->getBasePtr(), MemIntr->getMemOperand());
41893 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
41894 Ld.getValue(1));
41895 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
41896 TLO.DAG, DL, ExtSizeInBits));
41897 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
41898 SDLoc DL(Op);
41899 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
41900 ExtSizeInBits / VT.getScalarSizeInBits());
41901 if (SDValue BcstLd =
41902 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
41903 return TLO.CombineTo(Op,
41904 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
41905 TLO.DAG, DL, ExtSizeInBits));
41906 }
41907 break;
41908 }
41909 // Byte shifts by immediate.
41910 case X86ISD::VSHLDQ:
41911 case X86ISD::VSRLDQ:
41912 // Shift by uniform.
41913 case X86ISD::VSHL:
41914 case X86ISD::VSRL:
41915 case X86ISD::VSRA:
41916 // Shift by immediate.
41917 case X86ISD::VSHLI:
41918 case X86ISD::VSRLI:
41919 case X86ISD::VSRAI: {
41920 SDLoc DL(Op);
41921 SDValue Ext0 =
41922 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
41923 SDValue ExtOp =
41924 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
41925 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41926 SDValue Insert =
41927 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
41928 return TLO.CombineTo(Op, Insert);
41929 }
41930 case X86ISD::VPERMI: {
41931 // Simplify PERMPD/PERMQ to extract_subvector.
41932 // TODO: This should be done in shuffle combining.
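// e.g. with only the low half demanded and mask indices <2,3,?,?>, the result
// is just the source's upper 128 bits, i.e. an extract_subvector at element 2.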
41933 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
41934 SmallVector<int, 8> Mask;
41935 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
41936 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
41937 SDLoc DL(Op);
41938 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
41939 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41940 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
41941 return TLO.CombineTo(Op, Insert);
41942 }
41943 }
41944 break;
41945 }
41946 case X86ISD::VPERM2X128: {
41947 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
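// Only the low result half is demanded here, so just the low immediate nibble
// matters: bit 3 zeroes the half, bit 1 picks the source operand and bit 0
// picks that operand's 128-bit lane.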
41948 SDLoc DL(Op);
41949 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
41950 if (LoMask & 0x8)
41951 return TLO.CombineTo(
41952 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
41953 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
41954 unsigned SrcIdx = (LoMask & 0x2) >> 1;
41955 SDValue ExtOp =
41956 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
41957 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41958 SDValue Insert =
41959 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
41960 return TLO.CombineTo(Op, Insert);
41961 }
41962 // Zero upper elements.
41963 case X86ISD::VZEXT_MOVL:
41964 // Target unary shuffles by immediate:
41965 case X86ISD::PSHUFD:
41966 case X86ISD::PSHUFLW:
41967 case X86ISD::PSHUFHW:
41968 case X86ISD::VPERMILPI:
41969 // (Non-Lane Crossing) Target Shuffles.
41970 case X86ISD::VPERMILPV:
41971 case X86ISD::VPERMIL2:
41972 case X86ISD::PSHUFB:
41973 case X86ISD::UNPCKL:
41974 case X86ISD::UNPCKH:
41975 case X86ISD::BLENDI:
41976 // Integer ops.
41977 case X86ISD::PACKSS:
41978 case X86ISD::PACKUS:
41979 case X86ISD::PCMPEQ:
41980 case X86ISD::PCMPGT:
41981 case X86ISD::PMULUDQ:
41982 case X86ISD::PMULDQ:
41983 case X86ISD::VSHLV:
41984 case X86ISD::VSRLV:
41985 case X86ISD::VSRAV:
41986 // Float ops.
41987 case X86ISD::FMAX:
41988 case X86ISD::FMIN:
41989 case X86ISD::FMAXC:
41990 case X86ISD::FMINC:
41991 // Horizontal Ops.
41992 case X86ISD::HADD:
41993 case X86ISD::HSUB:
41994 case X86ISD::FHADD:
41995 case X86ISD::FHSUB: {
41996 SDLoc DL(Op);
41997 SmallVector<SDValue, 4> Ops;
41998 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
41999 SDValue SrcOp = Op.getOperand(i);
42000 EVT SrcVT = SrcOp.getValueType();
42001 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
42002 "Unsupported vector size");
42003 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
42004 ExtSizeInBits)
42005 : SrcOp);
42006 }
42007 MVT ExtVT = VT.getSimpleVT();
42008 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
42009 ExtSizeInBits / ExtVT.getScalarSizeInBits());
42010 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
42011 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42012 SDValue Insert =
42013 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42014 return TLO.CombineTo(Op, Insert);
42015 }
42016 }
42017 }
42018
42019 // For splats, unless we *only* demand the 0'th element,
42020 // stop attempts at simplification here; we aren't going to improve things,
42021 // and this is better than any potential shuffle.
42022 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
42023 return false;
42024
42025 // Get target/faux shuffle mask.
42026 APInt OpUndef, OpZero;
42027 SmallVector<int, 64> OpMask;
42028 SmallVector<SDValue, 2> OpInputs;
42029 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
42030 OpZero, TLO.DAG, Depth, false))
42031 return false;
42032
42033 // Shuffle inputs must be the same size as the result.
42034 if (OpMask.size() != (unsigned)NumElts ||
42035 llvm::any_of(OpInputs, [VT](SDValue V) {
42036 return VT.getSizeInBits() != V.getValueSizeInBits() ||
42037 !V.getValueType().isVector();
42038 }))
42039 return false;
42040
42041 KnownZero = OpZero;
42042 KnownUndef = OpUndef;
42043
42044 // Check if shuffle mask can be simplified to undef/zero/identity.
42045 int NumSrcs = OpInputs.size();
42046 for (int i = 0; i != NumElts; ++i)
42047 if (!DemandedElts[i])
42048 OpMask[i] = SM_SentinelUndef;
42049
42050 if (isUndefInRange(OpMask, 0, NumElts)) {
42051 KnownUndef.setAllBits();
42052 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
42053 }
42054 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
42055 KnownZero.setAllBits();
42056 return TLO.CombineTo(
42057 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42058 }
42059 for (int Src = 0; Src != NumSrcs; ++Src)
42060 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
42061 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
42062
42063 // Attempt to simplify inputs.
42064 for (int Src = 0; Src != NumSrcs; ++Src) {
42065 // TODO: Support inputs of different types.
42066 if (OpInputs[Src].getValueType() != VT)
42067 continue;
42068
42069 int Lo = Src * NumElts;
42070 APInt SrcElts = APInt::getZero(NumElts);
42071 for (int i = 0; i != NumElts; ++i)
42072 if (DemandedElts[i]) {
42073 int M = OpMask[i] - Lo;
42074 if (0 <= M && M < NumElts)
42075 SrcElts.setBit(M);
42076 }
42077
42078 // TODO - Propagate input undef/zero elts.
42079 APInt SrcUndef, SrcZero;
42080 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
42081 TLO, Depth + 1))
42082 return true;
42083 }
42084
42085 // If we don't demand all elements, then attempt to combine to a simpler
42086 // shuffle.
42087 // We need to convert the depth to something combineX86ShufflesRecursively
42088 // can handle - so pretend it's Depth == 0 again, and reduce the max depth
42089 // to match. This prevents combineX86ShuffleChain from returning a
42090 // combined shuffle that's the same as the original root, causing an
42091 // infinite loop.
42092 if (!DemandedElts.isAllOnes()) {
42093 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
42094
42095 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
42096 for (int i = 0; i != NumElts; ++i)
42097 if (DemandedElts[i])
42098 DemandedMask[i] = i;
42099
42100 SDValue NewShuffle = combineX86ShufflesRecursively(
42101 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
42102 /*HasVarMask*/ false,
42103 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
42104 Subtarget);
42105 if (NewShuffle)
42106 return TLO.CombineTo(Op, NewShuffle);
42107 }
42108
42109 return false;
42110}
42111
42112 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
42113 SDValue Op, const APInt &OriginalDemandedBits,
42114 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
42115 unsigned Depth) const {
42116 EVT VT = Op.getValueType();
42117 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
42118 unsigned Opc = Op.getOpcode();
42119 switch(Opc) {
42120 case X86ISD::VTRUNC: {
42121 KnownBits KnownOp;
42122 SDValue Src = Op.getOperand(0);
42123 MVT SrcVT = Src.getSimpleValueType();
42124
42125 // Simplify the input, using demanded bit information.
42126 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
42127 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
42128 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
42129 return true;
42130 break;
42131 }
42132 case X86ISD::PMULDQ:
42133 case X86ISD::PMULUDQ: {
42134 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
42135 KnownBits KnownLHS, KnownRHS;
42136 SDValue LHS = Op.getOperand(0);
42137 SDValue RHS = Op.getOperand(1);
42138
42139 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
42140 // FIXME: Can we bound this better?
42141 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
42142 APInt DemandedMaskLHS = APInt::getAllOnes(64);
42143 APInt DemandedMaskRHS = APInt::getAllOnes(64);
42144
42145 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
42146 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
42147 DemandedMaskLHS = DemandedMask;
42148 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
42149 DemandedMaskRHS = DemandedMask;
42150
42151 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
42152 KnownLHS, TLO, Depth + 1))
42153 return true;
42154 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
42155 KnownRHS, TLO, Depth + 1))
42156 return true;
42157
42158 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
42159 KnownRHS = KnownRHS.trunc(32);
42160 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
42161 KnownRHS.getConstant().isOne()) {
42162 SDLoc DL(Op);
42163 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
42164 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
42165 }
42166
42167 // Aggressively peek through ops to get at the demanded low bits.
42168 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
42169 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42170 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
42171 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42172 if (DemandedLHS || DemandedRHS) {
42173 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
42174 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
42175 return TLO.CombineTo(
42176 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
42177 }
42178 break;
42179 }
42180 case X86ISD::ANDNP: {
42181 KnownBits Known2;
42182 SDValue Op0 = Op.getOperand(0);
42183 SDValue Op1 = Op.getOperand(1);
42184
42185 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
42186 Known, TLO, Depth + 1))
42187 return true;
42188 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42189
42190 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
42191 OriginalDemandedElts, Known2, TLO, Depth + 1))
42192 return true;
42193 assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
42194
42195 // If the RHS is a constant, see if we can simplify it.
42196 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
42197 OriginalDemandedElts, TLO))
42198 return true;
42199
42200 // ANDNP = (~Op0 & Op1);
42201 Known.One &= Known2.Zero;
42202 Known.Zero |= Known2.One;
42203 break;
42204 }
42205 case X86ISD::VSHLI: {
42206 SDValue Op0 = Op.getOperand(0);
42207
42208 unsigned ShAmt = Op.getConstantOperandVal(1);
42209 if (ShAmt >= BitWidth)
42210 break;
42211
42212 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
42213
42214 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42215 // single shift. We can do this if the bottom bits (which are shifted
42216 // out) are never demanded.
42217 if (Op0.getOpcode() == X86ISD::VSRLI &&
42218 OriginalDemandedBits.countr_zero() >= ShAmt) {
42219 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
42220 if (Shift2Amt < BitWidth) {
42221 int Diff = ShAmt - Shift2Amt;
42222 if (Diff == 0)
42223 return TLO.CombineTo(Op, Op0.getOperand(0));
42224
42225 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
42226 SDValue NewShift = TLO.DAG.getNode(
42227 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
42228 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
42229 return TLO.CombineTo(Op, NewShift);
42230 }
42231 }
42232
42233 // If we are only demanding sign bits then we can use the shift source directly.
42234 unsigned NumSignBits =
42235 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
42236 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
42237 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42238 return TLO.CombineTo(Op, Op0);
42239
42240 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42241 TLO, Depth + 1))
42242 return true;
42243
42244 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42245 Known.Zero <<= ShAmt;
42246 Known.One <<= ShAmt;
42247
42248 // Low bits known zero.
42249 Known.Zero.setLowBits(ShAmt);
42250 return false;
42251 }
42252 case X86ISD::VSRLI: {
42253 unsigned ShAmt = Op.getConstantOperandVal(1);
42254 if (ShAmt >= BitWidth)
42255 break;
42256
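// For a logical right shift the source bits we need are the demanded result bits shifted back up by ShAmt; the bits shifted in from the top are known zero and are handled below.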
42257 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42258
42259 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
42260 OriginalDemandedElts, Known, TLO, Depth + 1))
42261 return true;
42262
42263 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42264 Known.Zero.lshrInPlace(ShAmt);
42265 Known.One.lshrInPlace(ShAmt);
42266
42267 // High bits known zero.
42268 Known.Zero.setHighBits(ShAmt);
42269 return false;
42270 }
42271 case X86ISD::VSRAI: {
42272 SDValue Op0 = Op.getOperand(0);
42273 SDValue Op1 = Op.getOperand(1);
42274
42275 unsigned ShAmt = Op1->getAsZExtVal();
42276 if (ShAmt >= BitWidth)
42277 break;
42278
42279 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42280
42281 // If we just want the sign bit then we don't need to shift it.
42282 if (OriginalDemandedBits.isSignMask())
42283 return TLO.CombineTo(Op, Op0);
42284
42285 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
42286 if (Op0.getOpcode() == X86ISD::VSHLI &&
42287 Op.getOperand(1) == Op0.getOperand(1)) {
42288 SDValue Op00 = Op0.getOperand(0);
42289 unsigned NumSignBits =
42290 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
42291 if (ShAmt < NumSignBits)
42292 return TLO.CombineTo(Op, Op00);
42293 }
42294
42295 // If any of the demanded bits are produced by the sign extension, we also
42296 // demand the input sign bit.
42297 if (OriginalDemandedBits.countl_zero() < ShAmt)
42298 DemandedMask.setSignBit();
42299
42300 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42301 TLO, Depth + 1))
42302 return true;
42303
42304 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42305 Known.Zero.lshrInPlace(ShAmt);
42306 Known.One.lshrInPlace(ShAmt);
42307
42308 // If the input sign bit is known to be zero, or if none of the top bits
42309 // are demanded, turn this into an unsigned shift right.
42310 if (Known.Zero[BitWidth - ShAmt - 1] ||
42311 OriginalDemandedBits.countl_zero() >= ShAmt)
42312 return TLO.CombineTo(
42313 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
42314
42315 // High bits are known one.
42316 if (Known.One[BitWidth - ShAmt - 1])
42317 Known.One.setHighBits(ShAmt);
42318 return false;
42319 }
42320 case X86ISD::BLENDV: {
42321 SDValue Sel = Op.getOperand(0);
42322 SDValue LHS = Op.getOperand(1);
42323 SDValue RHS = Op.getOperand(2);
42324
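// BLENDV selects each lane purely on the sign bit of Sel, so only the sign bit is demanded from the selector; the original demanded bits pass through unchanged to LHS/RHS.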
42325 APInt SignMask = APInt::getSignMask(BitWidth);
42326 SDValue NewSel = SimplifyMultipleUseDemandedBits(
42327 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
42328 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
42329 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42330 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
42331 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42332
42333 if (NewSel || NewLHS || NewRHS) {
42334 NewSel = NewSel ? NewSel : Sel;
42335 NewLHS = NewLHS ? NewLHS : LHS;
42336 NewRHS = NewRHS ? NewRHS : RHS;
42337 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
42338 NewSel, NewLHS, NewRHS));
42339 }
42340 break;
42341 }
42342 case X86ISD::PEXTRB:
42343 case X86ISD::PEXTRW: {
42344 SDValue Vec = Op.getOperand(0);
42345 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
42346 MVT VecVT = Vec.getSimpleValueType();
42347 unsigned NumVecElts = VecVT.getVectorNumElements();
42348
42349 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
42350 unsigned Idx = CIdx->getZExtValue();
42351 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
42352
42353 // If we demand no bits from the vector then we must have demanded
42354 // bits from the implicit zext - simplify to zero.
42355 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
42356 if (DemandedVecBits == 0)
42357 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42358
42359 APInt KnownUndef, KnownZero;
42360 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
42361 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
42362 KnownZero, TLO, Depth + 1))
42363 return true;
42364
42365 KnownBits KnownVec;
42366 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
42367 KnownVec, TLO, Depth + 1))
42368 return true;
42369
42370 if (SDValue V = SimplifyMultipleUseDemandedBits(
42371 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
42372 return TLO.CombineTo(
42373 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
42374
42375 Known = KnownVec.zext(BitWidth);
42376 return false;
42377 }
42378 break;
42379 }
42380 case X86ISD::PINSRB:
42381 case X86ISD::PINSRW: {
42382 SDValue Vec = Op.getOperand(0);
42383 SDValue Scl = Op.getOperand(1);
42384 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
42385 MVT VecVT = Vec.getSimpleValueType();
42386
42387 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
42388 unsigned Idx = CIdx->getZExtValue();
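// If the lane being written is never read, the insertion is a no-op and the original vector can be used directly.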
42389 if (!OriginalDemandedElts[Idx])
42390 return TLO.CombineTo(Op, Vec);
42391
42392 KnownBits KnownVec;
42393 APInt DemandedVecElts(OriginalDemandedElts);
42394 DemandedVecElts.clearBit(Idx);
42395 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
42396 KnownVec, TLO, Depth + 1))
42397 return true;
42398
42399 KnownBits KnownScl;
42400 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
42401 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
42402 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
42403 return true;
42404
42405 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
42406 Known = KnownVec.intersectWith(KnownScl);
42407 return false;
42408 }
42409 break;
42410 }
42411 case X86ISD::PACKSS:
42412 // PACKSS saturates to MIN/MAX integer values. So if we just want the
42413 // sign bit then we can just ask for the source operands' sign bits.
42414 // TODO - add known bits handling.
42415 if (OriginalDemandedBits.isSignMask()) {
42416 APInt DemandedLHS, DemandedRHS;
42417 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
42418
42419 KnownBits KnownLHS, KnownRHS;
42420 APInt SignMask = APInt::getSignMask(BitWidth * 2);
42421 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
42422 KnownLHS, TLO, Depth + 1))
42423 return true;
42424 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
42425 KnownRHS, TLO, Depth + 1))
42426 return true;
42427
42428 // Attempt to avoid multi-use ops if we don't need anything from them.
42429 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
42430 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
42431 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
42432 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
42433 if (DemandedOp0 || DemandedOp1) {
42434 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
42435 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
42436 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
42437 }
42438 }
42439 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
42440 break;
42441 case X86ISD::VBROADCAST: {
42442 SDValue Src = Op.getOperand(0);
42443 MVT SrcVT = Src.getSimpleValueType();
42444 APInt DemandedElts = APInt::getOneBitSet(
42445 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
42446 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
42447 TLO, Depth + 1))
42448 return true;
42449 // If we don't need the upper bits, attempt to narrow the broadcast source.
42450 // Don't attempt this on AVX512 as it might affect broadcast folding.
42451 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
42452 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
42453 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
42454 Src->hasOneUse()) {
42455 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
42456 SDValue NewSrc =
42457 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
42458 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
42459 SDValue NewBcst =
42460 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
42461 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
42462 }
42463 break;
42464 }
42465 case X86ISD::PCMPGT:
42466 // icmp sgt(0, R) == ashr(R, BitWidth-1).
42467 // iff we only need the sign bit then we can use R directly.
42468 if (OriginalDemandedBits.isSignMask() &&
42469 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
42470 return TLO.CombineTo(Op, Op.getOperand(1));
42471 break;
42472 case X86ISD::MOVMSK: {
42473 SDValue Src = Op.getOperand(0);
42474 MVT SrcVT = Src.getSimpleValueType();
42475 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42476 unsigned NumElts = SrcVT.getVectorNumElements();
42477
42478 // If we don't need the sign bits at all just return zero.
42479 if (OriginalDemandedBits.countr_zero() >= NumElts)
42480 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42481
42482 // See if we only demand bits from the lower 128-bit vector.
42483 if (SrcVT.is256BitVector() &&
42484 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
42485 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
42486 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42487 }
42488
42489 // Only demand the vector elements of the sign bits we need.
42490 APInt KnownUndef, KnownZero;
42491 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
42492 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
42493 TLO, Depth + 1))
42494 return true;
42495
42496 Known.Zero = KnownZero.zext(BitWidth);
42497 Known.Zero.setHighBits(BitWidth - NumElts);
42498
42499 // MOVMSK only uses the MSB from each vector element.
42500 KnownBits KnownSrc;
42501 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
42502 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
42503 Depth + 1))
42504 return true;
42505
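// KnownSrc is the intersection across the demanded elements, so a sign bit that is known there is known for every demanded bit of the mask.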
42506 if (KnownSrc.One[SrcBits - 1])
42507 Known.One.setLowBits(NumElts);
42508 else if (KnownSrc.Zero[SrcBits - 1])
42509 Known.Zero.setLowBits(NumElts);
42510
42511 // Attempt to avoid multi-use ops if we don't need anything from them.
42512 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
42513 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
42514 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42515 return false;
42516 }
42517 case X86ISD::TESTP: {
42518 SDValue Op0 = Op.getOperand(0);
42519 SDValue Op1 = Op.getOperand(1);
42520 MVT OpVT = Op0.getSimpleValueType();
42521 assert((OpVT.getVectorElementType() == MVT::f32 ||
42522 OpVT.getVectorElementType() == MVT::f64) &&
42523 "Illegal vector type for X86ISD::TESTP");
42524
42525 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
42526 KnownBits KnownSrc;
42527 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
42528 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
42529 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
42530 AssumeSingleUse) ||
42531 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
42532 AssumeSingleUse);
42533 }
42534 case X86ISD::BEXTR:
42535 case X86ISD::BEXTRI: {
42536 SDValue Op0 = Op.getOperand(0);
42537 SDValue Op1 = Op.getOperand(1);
42538
42539 // Only the bottom 16 bits of the control value are required.
42540 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
42541 // NOTE: SimplifyDemandedBits won't do this for constants.
42542 uint64_t Val1 = Cst1->getZExtValue();
42543 uint64_t MaskedVal1 = Val1 & 0xFFFF;
42544 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
42545 SDLoc DL(Op);
42546 return TLO.CombineTo(
42547 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
42548 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
42549 }
42550
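// BEXTR control layout: bits [7:0] hold the starting bit position, bits [15:8] hold the length of the extracted field.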
42551 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
42552 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
42553
42554 // If the length is 0, the result is 0.
42555 if (Length == 0) {
42556 Known.setAllZero();
42557 return false;
42558 }
42559
42560 if ((Shift + Length) <= BitWidth) {
42561 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
42562 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
42563 return true;
42564
42565 Known = Known.extractBits(Length, Shift);
42566 Known = Known.zextOrTrunc(BitWidth);
42567 return false;
42568 }
42569 } else {
42570 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
42571 KnownBits Known1;
42572 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
42573 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
42574 return true;
42575
42576 // If the length is 0, replace with 0.
42577 KnownBits LengthBits = Known1.extractBits(8, 8);
42578 if (LengthBits.isZero())
42579 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42580 }
42581
42582 break;
42583 }
42584 case X86ISD::PDEP: {
42585 SDValue Op0 = Op.getOperand(0);
42586 SDValue Op1 = Op.getOperand(1);
42587
42588 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
42589 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
42590
42591 // If the demanded bits have leading zeroes, we don't demand those from the
42592 // mask.
42593 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
42594 return true;
42595
42596 // The number of possible 1s in the mask determines the number of LSBs of
42597 // operand 0 used. Undemanded bits from the mask don't matter so filter
42598 // them before counting.
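// e.g. a mask with only 2 set bits can deposit at most the 2 lowest source bits, so only popcount(mask) low bits of operand 0 are demanded.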
42599 KnownBits Known2;
42600 uint64_t Count = (~Known.Zero & LoMask).popcount();
42601 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
42602 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
42603 return true;
42604
42605 // Zeroes are retained from the mask, but not ones.
42606 Known.One.clearAllBits();
42607 // The result will have at least as many trailing zeros as the non-mask
42608 // operand since bits can only map to the same or higher bit position.
42609 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
42610 return false;
42611 }
42612 }
42613
42614 return TargetLowering::SimplifyDemandedBitsForTargetNode(
42615 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
42616}
42617
42618SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
42619 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
42620 SelectionDAG &DAG, unsigned Depth) const {
42621 int NumElts = DemandedElts.getBitWidth();
42622 unsigned Opc = Op.getOpcode();
42623 EVT VT = Op.getValueType();
42624
42625 switch (Opc) {
42626 case X86ISD::PINSRB:
42627 case X86ISD::PINSRW: {
42628 // If we don't demand the inserted element, return the base vector.
42629 SDValue Vec = Op.getOperand(0);
42630 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
42631 MVT VecVT = Vec.getSimpleValueType();
42632 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
42633 !DemandedElts[CIdx->getZExtValue()])
42634 return Vec;
42635 break;
42636 }
42637 case X86ISD::VSHLI: {
42638 // If we are only demanding sign bits then we can use the shift source
42639 // directly.
42640 SDValue Op0 = Op.getOperand(0);
42641 unsigned ShAmt = Op.getConstantOperandVal(1);
42642 unsigned BitWidth = DemandedBits.getBitWidth();
42643 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
42644 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
42645 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42646 return Op0;
42647 break;
42648 }
42649 case X86ISD::VSRAI:
42650 // iff we only need the sign bit then we can use the source directly.
42651 // TODO: generalize where we only demand extended signbits.
42652 if (DemandedBits.isSignMask())
42653 return Op.getOperand(0);
42654 break;
42655 case X86ISD::PCMPGT:
42656 // icmp sgt(0, R) == ashr(R, BitWidth-1).
42657 // iff we only need the sign bit then we can use R directly.
42658 if (DemandedBits.isSignMask() &&
42659 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
42660 return Op.getOperand(1);
42661 break;
42662 case X86ISD::BLENDV: {
42663 // BLENDV: Cond (MSB) ? LHS : RHS
42664 SDValue Cond = Op.getOperand(0);
42665 SDValue LHS = Op.getOperand(1);
42666 SDValue RHS = Op.getOperand(2);
42667
42668 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
42669 if (CondKnown.isNegative())
42670 return LHS;
42671 if (CondKnown.isNonNegative())
42672 return RHS;
42673 break;
42674 }
42675 case X86ISD::ANDNP: {
42676 // ANDNP = (~LHS & RHS);
42677 SDValue LHS = Op.getOperand(0);
42678 SDValue RHS = Op.getOperand(1);
42679
42680 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
42681 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
42682
42683 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
42684 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
42685 // this context, so return RHS.
42686 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
42687 return RHS;
42688 break;
42689 }
42690 }
42691
42692 APInt ShuffleUndef, ShuffleZero;
42693 SmallVector<int, 16> ShuffleMask;
42694 SmallVector<SDValue, 16> ShuffleOps;
42695 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
42696 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
42697 // If all the demanded elts are from one operand and are inline,
42698 // then we can use the operand directly.
42699 int NumOps = ShuffleOps.size();
42700 if (ShuffleMask.size() == (unsigned)NumElts &&
42701 llvm::all_of(ShuffleOps, [VT](SDValue V) {
42702 return VT.getSizeInBits() == V.getValueSizeInBits();
42703 })) {
42704
42705 if (DemandedElts.isSubsetOf(ShuffleUndef))
42706 return DAG.getUNDEF(VT);
42707 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
42708 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
42709
42710 // Bitmask that indicates which ops have only been accessed 'inline'.
42711 APInt IdentityOp = APInt::getAllOnes(NumOps);
42712 for (int i = 0; i != NumElts; ++i) {
42713 int M = ShuffleMask[i];
42714 if (!DemandedElts[i] || ShuffleUndef[i])
42715 continue;
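// Shuffle mask indices address the concatenation of all source ops: OpIdx selects the op and EltIdx the lane within it.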
42716 int OpIdx = M / NumElts;
42717 int EltIdx = M % NumElts;
42718 if (M < 0 || EltIdx != i) {
42719 IdentityOp.clearAllBits();
42720 break;
42721 }
42722 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
42723 if (IdentityOp == 0)
42724 break;
42725 }
42726 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
42727 "Multiple identity shuffles detected");
42728
42729 if (IdentityOp != 0)
42730 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
42731 }
42732 }
42733
42734 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
42735 Op, DemandedBits, DemandedElts, DAG, Depth);
42736}
42737
42738bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
42739 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
42740 bool PoisonOnly, unsigned Depth) const {
42741 unsigned NumElts = DemandedElts.getBitWidth();
42742
42743 // TODO: Add more target shuffles.
42744 switch (Op.getOpcode()) {
42745 case X86ISD::PSHUFD:
42746 case X86ISD::VPERMILPI: {
42747 SmallVector<int, 8> Mask;
42748 SmallVector<SDValue, 2> Ops;
42749 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
42750 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
42751 APInt::getZero(NumElts));
42752 for (auto M : enumerate(Mask)) {
42753 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
42754 continue;
42755 if (M.value() == SM_SentinelUndef)
42756 return false;
42757 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
42758 "Shuffle mask index out of range");
42759 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
42760 }
42761 for (auto Op : enumerate(Ops))
42762 if (!DemandedSrcElts[Op.index()].isZero() &&
42763 !DAG.isGuaranteedNotToBeUndefOrPoison(
42764 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
42765 return false;
42766 return true;
42767 }
42768 break;
42769 }
42770 }
42771 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
42772 Op, DemandedElts, DAG, PoisonOnly, Depth);
42773}
42774
42775bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
42776 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
42777 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
42778
42779 // TODO: Add more target shuffles.
42780 switch (Op.getOpcode()) {
42781 case X86ISD::PSHUFD:
42782 case X86ISD::VPERMILPI:
42783 case X86ISD::UNPCKH:
42784 case X86ISD::UNPCKL:
42785 return false;
42786 }
42787 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
42788 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
42789}
42790
42791bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
42792 const APInt &DemandedElts,
42793 APInt &UndefElts,
42794 const SelectionDAG &DAG,
42795 unsigned Depth) const {
42796 unsigned NumElts = DemandedElts.getBitWidth();
42797 unsigned Opc = Op.getOpcode();
42798
42799 switch (Opc) {
42800 case X86ISD::VBROADCAST:
42801 case X86ISD::VBROADCAST_LOAD:
42802 UndefElts = APInt::getZero(NumElts);
42803 return true;
42804 }
42805
42806 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
42807 DAG, Depth);
42808}
42809
42810// Helper to peek through bitops/trunc/setcc to determine size of source vector.
42811// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
42812static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
42813 bool AllowTruncate) {
42814 switch (Src.getOpcode()) {
42815 case ISD::TRUNCATE:
42816 if (!AllowTruncate)
42817 return false;
42818 [[fallthrough]];
42819 case ISD::SETCC:
42820 return Src.getOperand(0).getValueSizeInBits() == Size;
42821 case ISD::AND:
42822 case ISD::XOR:
42823 case ISD::OR:
42824 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
42825 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
42826 case ISD::SELECT:
42827 case ISD::VSELECT:
42828 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
42829 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
42830 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
42831 case ISD::BUILD_VECTOR:
42832 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
42833 ISD::isBuildVectorAllOnes(Src.getNode());
42834 }
42835 return false;
42836}
42837
42838// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
42839static unsigned getAltBitOpcode(unsigned Opcode) {
42840 switch(Opcode) {
42841 // clang-format off
42842 case ISD::AND: return X86ISD::FAND;
42843 case ISD::OR: return X86ISD::FOR;
42844 case ISD::XOR: return X86ISD::FXOR;
42845 case X86ISD::ANDNP: return X86ISD::FANDN;
42846 // clang-format on
42847 }
42848 llvm_unreachable("Unknown bitwise opcode");
42849}
42850
42851// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
42852static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
42853 const SDLoc &DL) {
42854 EVT SrcVT = Src.getValueType();
42855 if (SrcVT != MVT::v4i1)
42856 return SDValue();
42857
42858 switch (Src.getOpcode()) {
42859 case ISD::SETCC:
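// A v4i32 'setlt X, 0' only reads the sign bits, which MOVMSKPS can read straight from the f32 bit pattern, so reuse an existing v4f32 value where possible.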
42860 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
42861 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
42862 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
42863 SDValue Op0 = Src.getOperand(0);
42864 if (ISD::isNormalLoad(Op0.getNode()))
42865 return DAG.getBitcast(MVT::v4f32, Op0);
42866 if (Op0.getOpcode() == ISD::BITCAST &&
42867 Op0.getOperand(0).getValueType() == MVT::v4f32)
42868 return Op0.getOperand(0);
42869 }
42870 break;
42871 case ISD::AND:
42872 case ISD::XOR:
42873 case ISD::OR: {
42874 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
42875 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
42876 if (Op0 && Op1)
42877 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
42878 Op1);
42879 break;
42880 }
42881 }
42882 return SDValue();
42883}
42884
42885// Helper to push sign extension of vXi1 SETCC result through bitops.
42886static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
42887 SDValue Src, const SDLoc &DL) {
42888 switch (Src.getOpcode()) {
42889 case ISD::SETCC:
42890 case ISD::TRUNCATE:
42891 case ISD::BUILD_VECTOR:
42892 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
42893 case ISD::AND:
42894 case ISD::XOR:
42895 case ISD::OR:
42896 return DAG.getNode(
42897 Src.getOpcode(), DL, SExtVT,
42898 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
42899 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
42900 case ISD::SELECT:
42901 case ISD::VSELECT:
42902 return DAG.getSelect(
42903 DL, SExtVT, Src.getOperand(0),
42904 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
42905 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
42906 }
42907 llvm_unreachable("Unexpected node type for vXi1 sign extension");
42908}
42909
42910// Try to match patterns such as
42911// (i16 bitcast (v16i1 x))
42912// ->
42913// (i16 movmsk (v16i8 sext (v16i1 x)))
42914// before the illegal vector is scalarized on subtargets that don't have legal
42915// vxi1 types.
42916static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
42917 const SDLoc &DL,
42918 const X86Subtarget &Subtarget) {
42919 EVT SrcVT = Src.getValueType();
42920 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
42921 return SDValue();
42922
42923 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
42924 // legalization destroys the v4i32 type.
42925 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
42926 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
42927 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
42928 DAG.getBitcast(MVT::v4f32, V));
42929 return DAG.getZExtOrTrunc(V, DL, VT);
42930 }
42931 }
42932
42933 // If the input is a truncate from v16i8 or v32i8, go ahead and use a
42934 // movmskb even with avx512. This will be better than truncating to vXi1 and
42935 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
42936 // vpcmpeqb/vpcmpgtb.
42937 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
42938 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
42939 Src.getOperand(0).getValueType() == MVT::v32i8 ||
42940 Src.getOperand(0).getValueType() == MVT::v64i8);
42941
42942 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
42943 // directly with vpmovmskb/vmovmskps/vmovmskpd.
42944 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
42945 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
42946 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
42947 EVT CmpVT = Src.getOperand(0).getValueType();
42948 EVT EltVT = CmpVT.getVectorElementType();
42949 if (CmpVT.getSizeInBits() <= 256 &&
42950 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
42951 PreferMovMsk = true;
42952 }
42953
42954 // With AVX512 vxi1 types are legal and we prefer using k-regs.
42955 // MOVMSK is supported in SSE2 or later.
42956 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
42957 return SDValue();
42958
42959 // If the upper ops of a concatenation are undef, then try to bitcast the
42960 // lower op and extend.
42961 SmallVector<SDValue, 4> SubSrcOps;
42962 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
42963 SubSrcOps.size() >= 2) {
42964 SDValue LowerOp = SubSrcOps[0];
42965 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
42966 if (LowerOp.getOpcode() == ISD::SETCC &&
42967 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
42968 EVT SubVT = VT.getIntegerVT(
42969 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
42970 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
42971 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
42972 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
42973 }
42974 }
42975 }
42976
42977 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
42978 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
42979 // v8i16 and v16i16.
42980 // For these two cases, we can shuffle the upper element bytes to a
42981 // consecutive sequence at the start of the vector and treat the results as
42982 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
42983 // for v16i16 this is not the case, because the shuffle is expensive, so we
42984 // avoid sign-extending to this type entirely.
42985 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
42986 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
42987 MVT SExtVT;
42988 bool PropagateSExt = false;
42989 switch (SrcVT.getSimpleVT().SimpleTy) {
42990 default:
42991 return SDValue();
42992 case MVT::v2i1:
42993 SExtVT = MVT::v2i64;
42994 break;
42995 case MVT::v4i1:
42996 SExtVT = MVT::v4i32;
42997 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
42998 // sign-extend to a 256-bit operation to avoid truncation.
42999 if (Subtarget.hasAVX() &&
43000 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
43001 SExtVT = MVT::v4i64;
43002 PropagateSExt = true;
43003 }
43004 break;
43005 case MVT::v8i1:
43006 SExtVT = MVT::v8i16;
43007 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
43008 // sign-extend to a 256-bit operation to match the compare.
43009 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
43010 // 256-bit because the shuffle is cheaper than sign extending the result of
43011 // the compare.
43012 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
43013 checkBitcastSrcVectorSize(Src, 512, true))) {
43014 SExtVT = MVT::v8i32;
43015 PropagateSExt = true;
43016 }
43017 break;
43018 case MVT::v16i1:
43019 SExtVT = MVT::v16i8;
43020 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
43021 // it is not profitable to sign-extend to 256-bit because this will
43022 // require an extra cross-lane shuffle which is more expensive than
43023 // truncating the result of the compare to 128-bits.
43024 break;
43025 case MVT::v32i1:
43026 SExtVT = MVT::v32i8;
43027 break;
43028 case MVT::v64i1:
43029 // If we have AVX512F but not AVX512BW, and the input is truncated from
43030 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
43031 if (Subtarget.hasAVX512()) {
43032 if (Subtarget.hasBWI())
43033 return SDValue();
43034 SExtVT = MVT::v64i8;
43035 break;
43036 }
43037 // Split if this is a <64 x i8> comparison result.
43038 if (checkBitcastSrcVectorSize(Src, 512, false)) {
43039 SExtVT = MVT::v64i8;
43040 break;
43041 }
43042 return SDValue();
43043 };
43044
43045 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
43046 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
43047
43048 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
43049 V = getPMOVMSKB(DL, V, DAG, Subtarget);
43050 } else {
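// Note: v8i16 has no MOVMSK flavor, so it is widened and truncated to v16i8 first; only the low 8 mask bits are kept by the final zext/trunc below.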
43051 if (SExtVT == MVT::v8i16) {
43052 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
43053 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
43054 }
43055 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
43056 }
43057
43058 EVT IntVT =
43059 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
43060 V = DAG.getZExtOrTrunc(V, DL, IntVT);
43061 return DAG.getBitcast(VT, V);
43062}
43063
43064// Convert a vXi1 constant build vector to the same width scalar integer.
43065static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
43066 EVT SrcVT = Op.getValueType();
43067 assert(SrcVT.getVectorElementType() == MVT::i1 &&
43068 "Expected a vXi1 vector");
43069 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
43070 "Expected a constant build vector");
43071
43072 APInt Imm(SrcVT.getVectorNumElements(), 0);
43073 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
43074 SDValue In = Op.getOperand(Idx);
43075 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
43076 Imm.setBit(Idx);
43077 }
43078 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
43079 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
43080}
43081
43082static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
43083 TargetLowering::DAGCombinerInfo &DCI,
43084 const X86Subtarget &Subtarget) {
43085 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
43086
43087 if (!DCI.isBeforeLegalizeOps())
43088 return SDValue();
43089
43090 // Only do this if we have k-registers.
43091 if (!Subtarget.hasAVX512())
43092 return SDValue();
43093
43094 EVT DstVT = N->getValueType(0);
43095 SDValue Op = N->getOperand(0);
43096 EVT SrcVT = Op.getValueType();
43097
43098 if (!Op.hasOneUse())
43099 return SDValue();
43100
43101 // Look for logic ops.
43102 if (Op.getOpcode() != ISD::AND &&
43103 Op.getOpcode() != ISD::OR &&
43104 Op.getOpcode() != ISD::XOR)
43105 return SDValue();
43106
43107 // Make sure we have a bitcast between mask registers and a scalar type.
43108 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43109 DstVT.isScalarInteger()) &&
43110 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
43111 SrcVT.isScalarInteger()))
43112 return SDValue();
43113
43114 SDValue LHS = Op.getOperand(0);
43115 SDValue RHS = Op.getOperand(1);
43116
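// If either operand is itself a bitcast from the destination type, hoist the logic op across that bitcast so the whole computation stays in one register class.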
43117 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
43118 LHS.getOperand(0).getValueType() == DstVT)
43119 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
43120 DAG.getBitcast(DstVT, RHS));
43121
43122 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
43123 RHS.getOperand(0).getValueType() == DstVT)
43124 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43125 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
43126
43127 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
43128 // Most of these have to move a constant from the scalar domain anyway.
43129 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
43130 RHS = combinevXi1ConstantToInteger(RHS, DAG);
43131 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43132 DAG.getBitcast(DstVT, LHS), RHS);
43133 }
43134
43135 return SDValue();
43136}
43137
43138static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
43139 const X86Subtarget &Subtarget) {
43140 SDLoc DL(BV);
43141 unsigned NumElts = BV->getNumOperands();
43142 SDValue Splat = BV->getSplatValue();
43143
43144 // Build MMX element from integer GPR or SSE float values.
43145 auto CreateMMXElement = [&](SDValue V) {
43146 if (V.isUndef())
43147 return DAG.getUNDEF(MVT::x86mmx);
43148 if (V.getValueType().isFloatingPoint()) {
43149 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
43150 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
43151 V = DAG.getBitcast(MVT::v2i64, V);
43152 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
43153 }
43154 V = DAG.getBitcast(MVT::i32, V);
43155 } else {
43156 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
43157 }
43158 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
43159 };
43160
43161 // Convert build vector ops to MMX data in the bottom elements.
43162 SmallVector<SDValue, 8> Ops;
43163
43164 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43165
43166 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
43167 if (Splat) {
43168 if (Splat.isUndef())
43169 return DAG.getUNDEF(MVT::x86mmx);
43170
43171 Splat = CreateMMXElement(Splat);
43172
43173 if (Subtarget.hasSSE1()) {
43174 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
43175 if (NumElts == 8)
43176 Splat = DAG.getNode(
43177 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43178 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
43179 TLI.getPointerTy(DAG.getDataLayout())),
43180 Splat, Splat);
43181
43182 // Use PSHUFW to repeat 16-bit elements.
43183 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
43184 return DAG.getNode(
43185 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43186 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
43187 TLI.getPointerTy(DAG.getDataLayout())),
43188 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
43189 }
43190 Ops.append(NumElts, Splat);
43191 } else {
43192 for (unsigned i = 0; i != NumElts; ++i)
43193 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
43194 }
43195
43196 // Use tree of PUNPCKLs to build up general MMX vector.
43197 while (Ops.size() > 1) {
43198 unsigned NumOps = Ops.size();
43199 unsigned IntrinOp =
43200 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
43201 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
43202 : Intrinsic::x86_mmx_punpcklbw));
43203 SDValue Intrin = DAG.getTargetConstant(
43204 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
43205 for (unsigned i = 0; i != NumOps; i += 2)
43206 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
43207 Ops[i], Ops[i + 1]);
43208 Ops.resize(NumOps / 2);
43209 }
43210
43211 return Ops[0];
43212}
43213
43214// Recursive function that attempts to find if a bool vector node was originally
43215// a vector/float/double that got truncated/extended/bitcast to/from a scalar
43216// integer. If so, replace the scalar ops with bool vector equivalents back down
43217// the chain.
43218static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
43219 SelectionDAG &DAG,
43220 const X86Subtarget &Subtarget) {
43221 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43222 unsigned Opc = V.getOpcode();
43223 switch (Opc) {
43224 case ISD::BITCAST: {
43225 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
43226 SDValue Src = V.getOperand(0);
43227 EVT SrcVT = Src.getValueType();
43228 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
43229 return DAG.getBitcast(VT, Src);
43230 break;
43231 }
43232 case ISD::TRUNCATE: {
43233 // If we find a suitable source, a truncated scalar becomes a subvector.
43234 SDValue Src = V.getOperand(0);
43235 EVT NewSrcVT =
43236 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
43237 if (TLI.isTypeLegal(NewSrcVT))
43238 if (SDValue N0 =
43239 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
43240 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
43241 DAG.getIntPtrConstant(0, DL));
43242 break;
43243 }
43244 case ISD::ANY_EXTEND:
43245 case ISD::ZERO_EXTEND: {
43246 // If we find a suitable source, an extended scalar becomes a subvector.
43247 SDValue Src = V.getOperand(0);
43248 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
43249 Src.getScalarValueSizeInBits());
43250 if (TLI.isTypeLegal(NewSrcVT))
43251 if (SDValue N0 =
43252 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
43253 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
43254 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
43255 : DAG.getConstant(0, DL, VT),
43256 N0, DAG.getIntPtrConstant(0, DL));
43257 break;
43258 }
43259 case ISD::OR: {
43260 // If we find suitable sources, we can just move an OR to the vector domain.
43261 SDValue Src0 = V.getOperand(0);
43262 SDValue Src1 = V.getOperand(1);
43263 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
43264 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
43265 return DAG.getNode(Opc, DL, VT, N0, N1);
43266 break;
43267 }
43268 case ISD::SHL: {
43269 // If we find a suitable source, a SHL becomes a KSHIFTL.
43270 SDValue Src0 = V.getOperand(0);
43271 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
43272 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
43273 break;
43274
43275 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
43276 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
43277 return DAG.getNode(
43278 X86ISD::KSHIFTL, DL, VT, N0,
43279 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
43280 break;
43281 }
43282 }
43283 return SDValue();
43284}
43285
43286static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
43287 TargetLowering::DAGCombinerInfo &DCI,
43288 const X86Subtarget &Subtarget) {
43289 SDValue N0 = N->getOperand(0);
43290 EVT VT = N->getValueType(0);
43291 EVT SrcVT = N0.getValueType();
43292 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43293
43294 // Try to match patterns such as
43295 // (i16 bitcast (v16i1 x))
43296 // ->
43297 // (i16 movmsk (v16i8 sext (v16i1 x)))
43298 // before the setcc result is scalarized on subtargets that don't have legal
43299 // vxi1 types.
43300 if (DCI.isBeforeLegalize()) {
43301 SDLoc dl(N);
43302 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
43303 return V;
43304
43305 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43306 // type, widen both sides to avoid a trip through memory.
43307 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
43308 Subtarget.hasAVX512()) {
43309 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
43310 N0 = DAG.getBitcast(MVT::v8i1, N0);
43311 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
43312 DAG.getIntPtrConstant(0, dl));
43313 }
43314
43315 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43316 // type, widen both sides to avoid a trip through memory.
43317 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
43318 Subtarget.hasAVX512()) {
43319 // Use zeros for the widening if we already have some zeroes. This can
43320 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
43321 // stream of this.
43322 // FIXME: It might make sense to detect a concat_vectors with a mix of
43323 // zeroes and undef and turn it into insert_subvector for i1 vectors as
43324 // a separate combine. What we can't do is canonicalize the operands of
43325 // such a concat or we'll get into a loop with SimplifyDemandedBits.
43326 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
43327 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
43328 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
43329 SrcVT = LastOp.getValueType();
43330 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43331 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
43332 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
43333 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43334 N0 = DAG.getBitcast(MVT::i8, N0);
43335 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43336 }
43337 }
43338
43339 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43340 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
43341 Ops[0] = N0;
43342 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43343 N0 = DAG.getBitcast(MVT::i8, N0);
43344 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43345 }
43346 } else {
43347 // If we're bitcasting from iX to vXi1, see if the integer originally
43348 // began as a vXi1 and whether we can remove the bitcast entirely.
43349 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
43350 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
43351 if (SDValue V =
43352 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
43353 return V;
43354 }
43355 }
43356
43357 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
43358 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
43359 // due to insert_subvector legalization on KNL. By promoting the copy to i16
43360 // we can help with known bits propagation from the vXi1 domain to the
43361 // scalar domain.
43362 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
43363 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43364 N0.getOperand(0).getValueType() == MVT::v16i1 &&
43365 isNullConstant(N0.getOperand(1)))
43366 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
43367 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
43368
43369 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
43370 // and the vbroadcast_load are both integer or both fp. In some cases this
43371 // will remove the bitcast entirely.
43372 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
43373 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
43374 auto *BCast = cast<MemIntrinsicSDNode>(N0);
43375 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
43376 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
43377 // Don't swap i8/i16 since we don't have fp types of that size.
43378 if (MemSize >= 32) {
43379 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
43380 : MVT::getIntegerVT(MemSize);
43381 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
43382 : MVT::getIntegerVT(SrcVTSize);
43383 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
43384
43385 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
43386 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
43387 SDValue ResNode =
43388 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
43389 MemVT, BCast->getMemOperand());
43390 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
43391 return DAG.getBitcast(VT, ResNode);
43392 }
43393 }
43394
43395 // Since MMX types are special and don't usually play with other vector types,
43396 // it's better to handle them early to be sure we emit efficient code by
43397 // avoiding store-load conversions.
43398 if (VT == MVT::x86mmx) {
43399 // Detect MMX constant vectors.
43400 APInt UndefElts;
43401 SmallVector<APInt, 1> EltBits;
43402 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
43403 /*AllowWholeUndefs*/ true,
43404 /*AllowPartialUndefs*/ true)) {
43405 SDLoc DL(N0);
43406 // Handle zero-extension of i32 with MOVD.
43407 if (EltBits[0].countl_zero() >= 32)
43408 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
43409 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
43410 // Else, bitcast to a double.
43411 // TODO - investigate supporting sext 32-bit immediates on x86_64.
43412 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
43413 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
43414 }
43415
43416 // Detect bitcasts to x86mmx low word.
43417 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
43418 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
43419 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
43420 bool LowUndef = true, AllUndefOrZero = true;
43421 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
43422 SDValue Op = N0.getOperand(i);
43423 LowUndef &= Op.isUndef() || (i >= e/2);
43424 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
43425 }
43426 if (AllUndefOrZero) {
43427 SDValue N00 = N0.getOperand(0);
43428 SDLoc dl(N00);
43429 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
43430 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
43431 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
43432 }
43433 }
43434
43435 // Detect bitcasts of 64-bit build vectors and convert to a
43436 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
43437 // lowest element.
43438 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
43439 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
43440 SrcVT == MVT::v8i8))
43441 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
43442
43443 // Detect bitcasts between element or subvector extraction to x86mmx.
43444 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
43445 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
43446 isNullConstant(N0.getOperand(1))) {
43447 SDValue N00 = N0.getOperand(0);
43448 if (N00.getValueType().is128BitVector())
43449 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
43450 DAG.getBitcast(MVT::v2i64, N00));
43451 }
43452
43453 // Detect bitcasts from FP_TO_SINT to x86mmx.
43454 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
43455 SDLoc DL(N0);
43456 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
43457 DAG.getUNDEF(MVT::v2i32));
43458 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
43459 DAG.getBitcast(MVT::v2i64, Res));
43460 }
43461 }
43462
43463 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
43464 // most of these to scalar anyway.
43465 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
43466 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43467 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
43468 return combinevXi1ConstantToInteger(N0, DAG);
43469 }
43470
43471 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
43472 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
43473 isa<ConstantSDNode>(N0)) {
43474 auto *C = cast<ConstantSDNode>(N0);
43475 if (C->isAllOnes())
43476 return DAG.getConstant(1, SDLoc(N0), VT);
43477 if (C->isZero())
43478 return DAG.getConstant(0, SDLoc(N0), VT);
43479 }
43480
43481 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
43482 // Turn it into a sign bit compare that produces a k-register. This avoids
43483 // a trip through a GPR.
43484 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
43485 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
43486 isPowerOf2_32(VT.getVectorNumElements())) {
43487 unsigned NumElts = VT.getVectorNumElements();
43488 SDValue Src = N0;
43489
43490 // Peek through truncate.
43491 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
43492 Src = N0.getOperand(0);
43493
43494 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
43495 SDValue MovmskIn = Src.getOperand(0);
43496 MVT MovmskVT = MovmskIn.getSimpleValueType();
43497 unsigned MovMskElts = MovmskVT.getVectorNumElements();
43498
43499 // We allow extra bits of the movmsk to be used since they are known zero.
43500 // We can't convert a VPMOVMSKB without avx512bw.
43501 if (MovMskElts <= NumElts &&
43502 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
43503 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
43504 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
43505 SDLoc dl(N);
43506 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
43507 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
43508 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
43509 if (EVT(CmpVT) == VT)
43510 return Cmp;
43511
43512 // Pad with zeroes up to original VT to replace the zeroes that were
43513 // being used from the MOVMSK.
43514 unsigned NumConcats = NumElts / MovMskElts;
43515 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
43516 Ops[0] = Cmp;
43517 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
43518 }
43519 }
43520 }
43521
43522 // Try to remove bitcasts from input and output of mask arithmetic to
43523 // remove GPR<->K-register crossings.
43524 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
43525 return V;
43526
43527 // Convert a bitcasted integer logic operation that has one bitcasted
43528 // floating-point operand into a floating-point logic operation. This may
43529 // create a load of a constant, but that is cheaper than materializing the
43530 // constant in an integer register and transferring it to an SSE register or
43531 // transferring the SSE operand to integer register and back.
43532 unsigned FPOpcode;
43533 switch (N0.getOpcode()) {
43534 // clang-format off
43535 case ISD::AND: FPOpcode = X86ISD::FAND; break;
43536 case ISD::OR: FPOpcode = X86ISD::FOR; break;
43537 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
43538 default: return SDValue();
43539 // clang-format on
43540 }
43541
43542 // Check if we have a bitcast from another integer type as well.
43543 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
43544 (Subtarget.hasSSE2() && VT == MVT::f64) ||
43545 (Subtarget.hasFP16() && VT == MVT::f16) ||
43546 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
43547 TLI.isTypeLegal(VT))))
43548 return SDValue();
43549
43550 SDValue LogicOp0 = N0.getOperand(0);
43551 SDValue LogicOp1 = N0.getOperand(1);
43552 SDLoc DL0(N0);
43553
43554 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
43555 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
43556 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
43557 LogicOp0.getOperand(0).getValueType() == VT &&
43558 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
43559 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
43560 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
43561 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
43562 }
43563 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
43564 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
43565 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
43566 LogicOp1.getOperand(0).getValueType() == VT &&
43567 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
43568 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
43569 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
43570 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
43571 }
43572
43573 return SDValue();
43574}
43575
43576// (mul (zext a), (sext b))
43577static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
43578 SDValue &Op1) {
43579 Op0 = Mul.getOperand(0);
43580 Op1 = Mul.getOperand(1);
43581
43582 // Operand 1 should be the sign-extended value.
43583 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
43584 std::swap(Op0, Op1);
43585
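// An extend from elements of at most 8 bits, or a constant build vector, can be truncated back to i8 for free.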
43586 auto IsFreeTruncation = [](SDValue &Op) -> bool {
43587 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
43588 Op.getOpcode() == ISD::SIGN_EXTEND) &&
43589 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
43590 return true;
43591
43592 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
43593 return (BV && BV->isConstant());
43594 };
43595
43596 // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
43597 // value, we need to check that Op0 is a zero-extended value. Op1 should be a
43598 // signed value, so we just check its significant bits.
43599 if ((IsFreeTruncation(Op0) &&
43600 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
43601 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
43602 return true;
43603
43604 return false;
43605}
43606
43607// Given an ABS node, detect the following pattern:
43608// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
43609// This is useful as it is the input into a SAD pattern.
43610static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
43611 SDValue AbsOp1 = Abs->getOperand(0);
43612 if (AbsOp1.getOpcode() != ISD::SUB)
43613 return false;
43614
43615 Op0 = AbsOp1.getOperand(0);
43616 Op1 = AbsOp1.getOperand(1);
43617
43618 // Check if the operands of the sub are zero-extended from vectors of i8.
43619 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
43620 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
43621 Op1.getOpcode() != ISD::ZERO_EXTEND ||
43622 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
43623 return false;
43624
43625 return true;
43626}
43627
43628static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
43629 unsigned &LogBias, const SDLoc &DL,
43630 const X86Subtarget &Subtarget) {
43631 // Extend or truncate to MVT::i8 first.
43632 MVT Vi8VT =
43633 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
43634 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
43635 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
43636
43637 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
43638 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
43639 // The src A, B element type is i8, but the dst C element type is i32.
43640 // When we calculate the reduction stages, we use the vXi8 src vector type,
43641 // so we need a log-bias of 2 to avoid 2 extra stages.
43642 LogBias = 2;
43643
43644 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
43645 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
43646 RegSize = std::max(512u, RegSize);
43647
43648 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
43649 // fill in the missing vector elements with 0.
43650 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
43651 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
43652 Ops[0] = LHS;
43653 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
43654 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43655 Ops[0] = RHS;
43656 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43657
43658 // Actually build the DotProduct, split as 256/512 bits for
43659 // AVXVNNI/AVX512VNNI.
43660 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43661 ArrayRef<SDValue> Ops) {
43662 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
43663 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
43664 };
43665 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
43666 SDValue Zero = DAG.getConstant(0, DL, DpVT);
43667
43668 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
43669 DpBuilder, false);
43670}
43671
43672// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
43673// to these zexts.
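// PSADBW computes, for each 64-bit lane, the sum of absolute differences of
// the corresponding 8 byte pairs, zero-extended to i64:
//   Result[i] = |A[8i+0]-B[8i+0]| + ... + |A[8i+7]-B[8i+7]|
// so a pair of v16i8 inputs produces a v2i64 result of two partial sums.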
43674static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
43675 const SDValue &Zext1, const SDLoc &DL,
43676 const X86Subtarget &Subtarget) {
43677 // Find the appropriate width for the PSADBW.
43678 EVT InVT = Zext0.getOperand(0).getValueType();
43679 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
43680
43681 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
43682 // fill in the missing vector elements with 0.
43683 unsigned NumConcat = RegSize / InVT.getSizeInBits();
43684 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
43685 Ops[0] = Zext0.getOperand(0);
43686 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
43687 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43688 Ops[0] = Zext1.getOperand(0);
43689 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43690
43691 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
43692 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43693 ArrayRef<SDValue> Ops) {
43694 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
43695 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
43696 };
43697 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
43698 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
43699 PSADBWBuilder);
43700}
43701
43702 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
43703// PHMINPOSUW.
43704 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
43705 const X86Subtarget &Subtarget) {
43706 // Bail without SSE41.
43707 if (!Subtarget.hasSSE41())
43708 return SDValue();
43709
43710 EVT ExtractVT = Extract->getValueType(0);
43711 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
43712 return SDValue();
43713
43714 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
43715 ISD::NodeType BinOp;
43716 SDValue Src = DAG.matchBinOpReduction(
43717 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
43718 if (!Src)
43719 return SDValue();
43720
43721 EVT SrcVT = Src.getValueType();
43722 EVT SrcSVT = SrcVT.getScalarType();
43723 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
43724 return SDValue();
43725
43726 SDLoc DL(Extract);
43727 SDValue MinPos = Src;
43728
43729 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
43730 while (SrcVT.getSizeInBits() > 128) {
43731 SDValue Lo, Hi;
43732 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
43733 SrcVT = Lo.getValueType();
43734 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
43735 }
43736 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
43737 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
43738 "Unexpected value type");
43739
43740 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
43741 // to flip the value accordingly.
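// The XOR trick below relies on these identities (shown for i16 elements):
//   UMAX: umax(X) == NOT(umin(NOT(X)))            (mask = all-ones)
//   SMIN: smin(X) == umin(X ^ 0x8000) ^ 0x8000    (flip the sign bit)
//   SMAX: smax(X) == umin(X ^ 0x7FFF) ^ 0x7FFF    (reverses the signed order)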
43742 SDValue Mask;
43743 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
43744 if (BinOp == ISD::SMAX)
43745 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
43746 else if (BinOp == ISD::SMIN)
43747 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
43748 else if (BinOp == ISD::UMAX)
43749 Mask = DAG.getAllOnesConstant(DL, SrcVT);
43750
43751 if (Mask)
43752 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
43753
43754 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
43755 // shuffling each upper element down and inserting zeros. This means that the
43756 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
43757 // ready for the PHMINPOS.
43758 if (ExtractVT == MVT::i8) {
43759 SDValue Upper = DAG.getVectorShuffle(
43760 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
43761 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
43762 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
43763 }
43764
43765 // Perform the PHMINPOS on a v8i16 vector.
43766 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
43767 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
43768 MinPos = DAG.getBitcast(SrcVT, MinPos);
43769
43770 if (Mask)
43771 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
43772
43773 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
43774 DAG.getIntPtrConstant(0, DL));
43775}
43776
43777// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
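// For example, for a v4i32 vector of compare results (all-ones/zero lanes):
//   all_of (AND-reduce)  -> MOVMSKPS(X) == 0xF
//   any_of (OR-reduce)   -> MOVMSKPS(X) != 0
//   parity (XOR-reduce)  -> PARITY(MOVMSKPS(X))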
43778 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
43779 const X86Subtarget &Subtarget) {
43780 // Bail without SSE2.
43781 if (!Subtarget.hasSSE2())
43782 return SDValue();
43783
43784 EVT ExtractVT = Extract->getValueType(0);
43785 unsigned BitWidth = ExtractVT.getSizeInBits();
43786 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
43787 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
43788 return SDValue();
43789
43790 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
43791 ISD::NodeType BinOp;
43792 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
43793 if (!Match && ExtractVT == MVT::i1)
43794 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
43795 if (!Match)
43796 return SDValue();
43797
43798 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
43799 // which we can't support here for now.
43800 if (Match.getScalarValueSizeInBits() != BitWidth)
43801 return SDValue();
43802
43803 SDValue Movmsk;
43804 SDLoc DL(Extract);
43805 EVT MatchVT = Match.getValueType();
43806 unsigned NumElts = MatchVT.getVectorNumElements();
43807 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
43808 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43809 LLVMContext &Ctx = *DAG.getContext();
43810
43811 if (ExtractVT == MVT::i1) {
43812 // Special case for (pre-legalization) vXi1 reductions.
43813 if (NumElts > 64 || !isPowerOf2_32(NumElts))
43814 return SDValue();
43815 if (Match.getOpcode() == ISD::SETCC) {
43816 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
43817 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
43818 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
43819 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
43820 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
43821 X86::CondCode X86CC;
43822 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
43823 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
43824 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
43825 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
43826 DAG, X86CC))
43827 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
43828 getSETCC(X86CC, V, DL, DAG));
43829 }
43830 }
43831 if (TLI.isTypeLegal(MatchVT)) {
43832 // If this is a legal AVX512 predicate type then we can just bitcast.
43833 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
43834 Movmsk = DAG.getBitcast(MovmskVT, Match);
43835 } else {
43836 // Use combineBitcastvxi1 to create the MOVMSK.
43837 while (NumElts > MaxElts) {
43838 SDValue Lo, Hi;
43839 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
43840 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
43841 NumElts /= 2;
43842 }
43843 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
43844 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
43845 }
43846 if (!Movmsk)
43847 return SDValue();
43848 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
43849 } else {
43850 // FIXME: Better handling of k-registers or 512-bit vectors?
43851 unsigned MatchSizeInBits = Match.getValueSizeInBits();
43852 if (!(MatchSizeInBits == 128 ||
43853 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
43854 return SDValue();
43855
43856 // Make sure this isn't a vector of 1 element. The perf win from using
43857 // MOVMSK diminishes with fewer elements in the reduction, but it is
43858 // generally better to get the comparison over to the GPRs as soon as
43859 // possible to reduce the number of vector ops.
43860 if (Match.getValueType().getVectorNumElements() < 2)
43861 return SDValue();
43862
43863 // Check that we are extracting a reduction of all sign bits.
43864 if (DAG.ComputeNumSignBits(Match) != BitWidth)
43865 return SDValue();
43866
43867 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
43868 SDValue Lo, Hi;
43869 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
43870 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
43871 MatchSizeInBits = Match.getValueSizeInBits();
43872 }
43873
43874 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
43875 MVT MaskSrcVT;
43876 if (64 == BitWidth || 32 == BitWidth)
43877 MaskSrcVT = MVT::getVectorVT(MVT::getIntegerVT(BitWidth),
43878 MatchSizeInBits / BitWidth);
43879 else
43880 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
43881
43882 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
43883 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
43884 NumElts = MaskSrcVT.getVectorNumElements();
43885 }
43886 assert((NumElts <= 32 || NumElts == 64) &&
43887 "Not expecting more than 64 elements");
43888
43889 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
43890 if (BinOp == ISD::XOR) {
43891 // parity -> (PARITY(MOVMSK X))
43892 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
43893 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
43894 }
43895
43896 SDValue CmpC;
43897 ISD::CondCode CondCode;
43898 if (BinOp == ISD::OR) {
43899 // any_of -> MOVMSK != 0
43900 CmpC = DAG.getConstant(0, DL, CmpVT);
43901 CondCode = ISD::CondCode::SETNE;
43902 } else {
43903 // all_of -> MOVMSK == ((1 << NumElts) - 1)
43904 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
43905 DL, CmpVT);
43906 CondCode = ISD::CondCode::SETEQ;
43907 }
43908
43909 // The setcc produces an i8 of 0/1, so extend that to the result width and
43910 // negate to get the final 0/-1 mask value.
43911 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
43912 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
43913 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
43914 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
43915 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
43916}
43917
43918 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
43919 const X86Subtarget &Subtarget) {
43920 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
43921 return SDValue();
43922
43923 EVT ExtractVT = Extract->getValueType(0);
43924 // Verify the type we're extracting is i32, as the output element type of
43925 // vpdpbusd is i32.
43926 if (ExtractVT != MVT::i32)
43927 return SDValue();
43928
43929 EVT VT = Extract->getOperand(0).getValueType();
43930 if (!isPowerOf2_32(VT.getVectorNumElements()))
43931 return SDValue();
43932
43933 // Match shuffle + add pyramid.
43934 ISD::NodeType BinOp;
43935 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
43936
43937 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
43938 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
43939 // before adding into the accumulator.
43940 // TODO:
43941 // We also need to verify that the multiply has at least 2x the number of bits
43942 // of the input. We shouldn't match
43943 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
43944 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
43945 // Root = Root.getOperand(0);
43946
43947 // If there was a match, we want Root to be a mul.
43948 if (!Root || Root.getOpcode() != ISD::MUL)
43949 return SDValue();
43950
43951 // Check whether we have an extend and mul pattern
43952 SDValue LHS, RHS;
43953 if (!detectExtMul(DAG, Root, LHS, RHS))
43954 return SDValue();
43955
43956 // Create the dot product instruction.
43957 SDLoc DL(Extract);
43958 unsigned StageBias;
43959 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
43960
43961 // If the original vector was wider than 4 elements, sum over the results
43962 // in the DP vector.
43963 unsigned Stages = Log2_32(VT.getVectorNumElements());
43964 EVT DpVT = DP.getValueType();
43965
43966 if (Stages > StageBias) {
43967 unsigned DpElems = DpVT.getVectorNumElements();
43968
43969 for (unsigned i = Stages - StageBias; i > 0; --i) {
43970 SmallVector<int, 16> Mask(DpElems, -1);
43971 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
43972 Mask[j] = MaskEnd + j;
43973
43974 SDValue Shuffle =
43975 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
43976 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
43977 }
43978 }
43979
43980 // Return the lowest ExtractSizeInBits bits.
43981 EVT ResVT =
43982 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
43983 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
43984 DP = DAG.getBitcast(ResVT, DP);
43985 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
43986 Extract->getOperand(1));
43987}
43988
43989 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
43990 const X86Subtarget &Subtarget) {
43991 // PSADBW is only supported on SSE2 and up.
43992 if (!Subtarget.hasSSE2())
43993 return SDValue();
43994
43995 EVT ExtractVT = Extract->getValueType(0);
43996 // Verify the type we're extracting is either i32 or i64.
43997 // FIXME: Could support other types, but this is what we have coverage for.
43998 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
43999 return SDValue();
44000
44001 EVT VT = Extract->getOperand(0).getValueType();
44002 if (!isPowerOf2_32(VT.getVectorNumElements()))
44003 return SDValue();
44004
44005 // Match shuffle + add pyramid.
44006 ISD::NodeType BinOp;
44007 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
44008
44009 // The operand is expected to be zero extended from i8
44010 // (verified in detectZextAbsDiff).
44011 // In order to convert to i64 and above, an additional any/zero/sign
44012 // extend is expected.
44013 // The zero extend from 32 bits has no mathematical effect on the result.
44014 // The sign extend is also effectively a zero extend
44015 // (it extends the sign bit, which is zero).
44016 // So it is correct to skip the sign/zero extend instruction.
44017 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
44018 Root.getOpcode() == ISD::ZERO_EXTEND ||
44019 Root.getOpcode() == ISD::ANY_EXTEND))
44020 Root = Root.getOperand(0);
44021
44022 // If there was a match, we want Root to be a select that is the root of an
44023 // abs-diff pattern.
44024 if (!Root || Root.getOpcode() != ISD::ABS)
44025 return SDValue();
44026
44027 // Check whether we have an abs-diff pattern feeding into the select.
44028 SDValue Zext0, Zext1;
44029 if (!detectZextAbsDiff(Root, Zext0, Zext1))
44030 return SDValue();
44031
44032 // Create the SAD instruction.
44033 SDLoc DL(Extract);
44034 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
44035
44036 // If the original vector was wider than 8 elements, sum over the results
44037 // in the SAD vector.
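// PSADBW already sums each group of 8 bytes into an i64 lane, covering
// log2(8) = 3 reduction stages, so only Stages - 3 shuffle+add steps remain.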
44038 unsigned Stages = Log2_32(VT.getVectorNumElements());
44039 EVT SadVT = SAD.getValueType();
44040 if (Stages > 3) {
44041 unsigned SadElems = SadVT.getVectorNumElements();
44042
44043 for(unsigned i = Stages - 3; i > 0; --i) {
44044 SmallVector<int, 16> Mask(SadElems, -1);
44045 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44046 Mask[j] = MaskEnd + j;
44047
44048 SDValue Shuffle =
44049 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
44050 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
44051 }
44052 }
44053
44054 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
44055 // Return the lowest ExtractSizeInBits bits.
44056 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44057 SadVT.getSizeInBits() / ExtractSizeInBits);
44058 SAD = DAG.getBitcast(ResVT, SAD);
44059 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
44060 Extract->getOperand(1));
44061}
44062
44063// If this extract is from a loaded vector value and will be used as an
44064// integer, that requires a potentially expensive XMM -> GPR transfer.
44065// Additionally, if we can convert to a scalar integer load, that will likely
44066// be folded into a subsequent integer op.
44067// Note: SrcVec might not have a VecVT type, but it must be the same size.
44068// Note: Unlike the related fold for this in DAGCombiner, this is not limited
44069// to a single-use of the loaded vector. For the reasons above, we
44070// expect this to be profitable even if it creates an extra load.
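// For example, extracting element 2 of a loaded v4i32 becomes a scalar i32
// load at byte offset 8 from the vector's base pointer, reusing the original
// load's chain, alignment and memory operand flags.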
44071static SDValue
44072 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
44073 const SDLoc &dl, SelectionDAG &DAG,
44074 TargetLowering::DAGCombinerInfo &DCI) {
44075 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44076 "Only EXTRACT_VECTOR_ELT supported so far");
44077
44078 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44079 EVT VT = N->getValueType(0);
44080
44081 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
44082 return Use->getOpcode() == ISD::STORE ||
44083 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
44084 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
44085 });
44086
44087 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
44088 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
44089 VecVT.getVectorElementType() == VT &&
44090 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
44091 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
44092 SDValue NewPtr = TLI.getVectorElementPointer(
44093 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
44094 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
44095 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
44096 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
44097 SDValue Load =
44098 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
44099 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
44100 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
44101 return Load;
44102 }
44103
44104 return SDValue();
44105}
44106
44107// Attempt to peek through a target shuffle and extract the scalar from the
44108// source.
44109 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
44110 TargetLowering::DAGCombinerInfo &DCI,
44111 const X86Subtarget &Subtarget) {
44112 if (DCI.isBeforeLegalizeOps())
44113 return SDValue();
44114
44115 SDLoc dl(N);
44116 SDValue Src = N->getOperand(0);
44117 SDValue Idx = N->getOperand(1);
44118
44119 EVT VT = N->getValueType(0);
44120 EVT SrcVT = Src.getValueType();
44121 EVT SrcSVT = SrcVT.getVectorElementType();
44122 unsigned SrcEltBits = SrcSVT.getSizeInBits();
44123 unsigned NumSrcElts = SrcVT.getVectorNumElements();
44124
44125 // Don't attempt this for boolean mask vectors or unknown extraction indices.
44126 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
44127 return SDValue();
44128
44129 const APInt &IdxC = N->getConstantOperandAPInt(1);
44130 if (IdxC.uge(NumSrcElts))
44131 return SDValue();
44132
44133 SDValue SrcBC = peekThroughBitcasts(Src);
44134
44135 // Handle extract(bitcast(broadcast(scalar_value))).
44136 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
44137 SDValue SrcOp = SrcBC.getOperand(0);
44138 EVT SrcOpVT = SrcOp.getValueType();
44139 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
44140 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
44141 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
44142 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
44143 // TODO support non-zero offsets.
44144 if (Offset == 0) {
44145 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
44146 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
44147 return SrcOp;
44148 }
44149 }
44150 }
44151
44152 // If we're extracting a single element from a broadcast load and there are
44153 // no other users, just create a single load.
44154 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
44155 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
44156 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
44157 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
44158 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
44159 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
44160 MemIntr->getBasePtr(),
44161 MemIntr->getPointerInfo(),
44162 MemIntr->getOriginalAlign(),
44163 MemIntr->getMemOperand()->getFlags());
44164 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
44165 return Load;
44166 }
44167 }
44168
44169 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
44170 // TODO: Move to DAGCombine?
44171 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
44172 SrcBC.getValueType().isInteger() &&
44173 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
44174 SrcBC.getScalarValueSizeInBits() ==
44175 SrcBC.getOperand(0).getValueSizeInBits()) {
44176 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
44177 if (IdxC.ult(Scale)) {
44178 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
44179 SDValue Scl = SrcBC.getOperand(0);
44180 EVT SclVT = Scl.getValueType();
44181 if (Offset) {
44182 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
44183 DAG.getShiftAmountConstant(Offset, SclVT, dl));
44184 }
44185 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
44186 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
44187 return Scl;
44188 }
44189 }
44190
44191 // Handle extract(truncate(x)) for 0'th index.
44192 // TODO: Treat this as a faux shuffle?
44193 // TODO: When can we use this for general indices?
44194 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
44195 (SrcVT.getSizeInBits() % 128) == 0) {
44196 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
44197 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
44198 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
44199 Idx);
44200 }
44201
44202 // We can only legally extract other elements from 128-bit vectors and in
44203 // certain circumstances, depending on SSE-level.
44204 // TODO: Investigate float/double extraction if it will be just stored.
44205 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
44206 unsigned Idx) {
44207 EVT VecSVT = VecVT.getScalarType();
44208 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
44209 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
44210 VecSVT == MVT::i64)) {
44211 unsigned EltSizeInBits = VecSVT.getSizeInBits();
44212 unsigned NumEltsPerLane = 128 / EltSizeInBits;
44213 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
44214 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
44215 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
44216 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
44217 Idx &= (NumEltsPerLane - 1);
44218 }
44219 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
44220 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
44221 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
44222 DAG.getBitcast(VecVT, Vec),
44223 DAG.getIntPtrConstant(Idx, dl));
44224 }
44225 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
44226 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
44227 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
44228 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
44229 DAG.getTargetConstant(Idx, dl, MVT::i8));
44230 }
44231 return SDValue();
44232 };
44233
44234 // Resolve the target shuffle inputs and mask.
44235 SmallVector<SDValue, 2> Ops;
44236 SmallVector<int, 16> Mask;
44237 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
44238 return SDValue();
44239
44240 // Shuffle inputs must be the same size as the result.
44241 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
44242 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
44243 }))
44244 return SDValue();
44245
44246 // Attempt to narrow/widen the shuffle mask to the correct size.
44247 if (Mask.size() != NumSrcElts) {
44248 if ((NumSrcElts % Mask.size()) == 0) {
44249 SmallVector<int, 16> ScaledMask;
44250 int Scale = NumSrcElts / Mask.size();
44251 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
44252 Mask = std::move(ScaledMask);
44253 } else if ((Mask.size() % NumSrcElts) == 0) {
44254 // Simplify Mask based on demanded element.
44255 int ExtractIdx = (int)IdxC.getZExtValue();
44256 int Scale = Mask.size() / NumSrcElts;
44257 int Lo = Scale * ExtractIdx;
44258 int Hi = Scale * (ExtractIdx + 1);
44259 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
44260 if (i < Lo || Hi <= i)
44261 Mask[i] = SM_SentinelUndef;
44262
44263 SmallVector<int, 16> WidenedMask;
44264 while (Mask.size() > NumSrcElts &&
44265 canWidenShuffleElements(Mask, WidenedMask))
44266 Mask = std::move(WidenedMask);
44267 }
44268 }
44269
44270 // If narrowing/widening failed, see if we can extract+zero-extend.
44271 int ExtractIdx;
44272 EVT ExtractVT;
44273 if (Mask.size() == NumSrcElts) {
44274 ExtractIdx = Mask[IdxC.getZExtValue()];
44275 ExtractVT = SrcVT;
44276 } else {
44277 unsigned Scale = Mask.size() / NumSrcElts;
44278 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
44279 return SDValue();
44280 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
44281 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
44282 return SDValue();
44283 ExtractIdx = Mask[ScaledIdx];
44284 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
44285 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
44286 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
44287 "Failed to widen vector type");
44288 }
44289
44290 // If the shuffle source element is undef/zero then we can just accept it.
44291 if (ExtractIdx == SM_SentinelUndef)
44292 return DAG.getUNDEF(VT);
44293
44294 if (ExtractIdx == SM_SentinelZero)
44295 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
44296 : DAG.getConstant(0, dl, VT);
44297
44298 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
44299 ExtractIdx = ExtractIdx % Mask.size();
44300 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
44301 return DAG.getZExtOrTrunc(V, dl, VT);
44302
44303 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
44304 if (SDValue V = combineExtractFromVectorLoad(
44305 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
44306 return V;
44307
44308 return SDValue();
44309}
44310
44311/// Extracting a scalar FP value from vector element 0 is free, so extract each
44312/// operand first, then perform the math as a scalar op.
44313 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
44314 const X86Subtarget &Subtarget) {
44315 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
44316 SDValue Vec = ExtElt->getOperand(0);
44317 SDValue Index = ExtElt->getOperand(1);
44318 EVT VT = ExtElt->getValueType(0);
44319 EVT VecVT = Vec.getValueType();
44320
44321 // TODO: If this is a unary/expensive/expand op, allow extraction from a
44322 // non-zero element because the shuffle+scalar op will be cheaper?
44323 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
44324 return SDValue();
44325
44326 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
44327 // extract, the condition code), so deal with those as a special-case.
44328 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
44329 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
44330 if (OpVT != MVT::f32 && OpVT != MVT::f64)
44331 return SDValue();
44332
44333 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
44334 SDLoc DL(ExtElt);
44335 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44336 Vec.getOperand(0), Index);
44337 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44338 Vec.getOperand(1), Index);
44339 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
44340 }
44341
44342 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
44343 VT != MVT::f64)
44344 return SDValue();
44345
44346 // Vector FP selects don't fit the pattern of FP math ops (because the
44347 // condition has a different type and we have to change the opcode), so deal
44348 // with those here.
44349 // FIXME: This is restricted to pre type legalization by ensuring the setcc
44350 // has i1 elements. If we loosen this we need to convert vector bool to a
44351 // scalar bool.
44352 if (Vec.getOpcode() == ISD::VSELECT &&
44353 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
44354 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
44355 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
44356 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
44357 SDLoc DL(ExtElt);
44358 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
44359 Vec.getOperand(0).getValueType().getScalarType(),
44360 Vec.getOperand(0), Index);
44361 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44362 Vec.getOperand(1), Index);
44363 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44364 Vec.getOperand(2), Index);
44365 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
44366 }
44367
44368 // TODO: This switch could include FNEG and the x86-specific FP logic ops
44369 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
44370 // missed load folding and fma+fneg combining.
44371 switch (Vec.getOpcode()) {
44372 case ISD::FMA: // Begin 3 operands
44373 case ISD::FMAD:
44374 case ISD::FADD: // Begin 2 operands
44375 case ISD::FSUB:
44376 case ISD::FMUL:
44377 case ISD::FDIV:
44378 case ISD::FREM:
44379 case ISD::FCOPYSIGN:
44380 case ISD::FMINNUM:
44381 case ISD::FMAXNUM:
44382 case ISD::FMINNUM_IEEE:
44383 case ISD::FMAXNUM_IEEE:
44384 case ISD::FMAXIMUM:
44385 case ISD::FMINIMUM:
44386 case X86ISD::FMAX:
44387 case X86ISD::FMIN:
44388 case ISD::FABS: // Begin 1 operand
44389 case ISD::FSQRT:
44390 case ISD::FRINT:
44391 case ISD::FCEIL:
44392 case ISD::FTRUNC:
44393 case ISD::FNEARBYINT:
44394 case ISD::FROUNDEVEN:
44395 case ISD::FROUND:
44396 case ISD::FFLOOR:
44397 case X86ISD::FRCP:
44398 case X86ISD::FRSQRT: {
44399 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
44400 SDLoc DL(ExtElt);
44401 SmallVector<SDValue, 4> ExtOps;
44402 for (SDValue Op : Vec->ops())
44403 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
44404 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
44405 }
44406 default:
44407 return SDValue();
44408 }
44409 llvm_unreachable("All opcodes should return within switch");
44410}
44411
44412/// Try to convert a vector reduction sequence composed of binops and shuffles
44413/// into horizontal ops.
44414 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
44415 const X86Subtarget &Subtarget) {
44416 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
44417
44418 // We need at least SSE2 to do anything here.
44419 if (!Subtarget.hasSSE2())
44420 return SDValue();
44421
44422 ISD::NodeType Opc;
44423 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
44424 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
44425 if (!Rdx)
44426 return SDValue();
44427
44428 SDValue Index = ExtElt->getOperand(1);
44430 "Reduction doesn't end in an extract from index 0");
44431
44432 EVT VT = ExtElt->getValueType(0);
44433 EVT VecVT = Rdx.getValueType();
44434 if (VecVT.getScalarType() != VT)
44435 return SDValue();
44436
44437 SDLoc DL(ExtElt);
44438 unsigned NumElts = VecVT.getVectorNumElements();
44439 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
44440
44441 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
44442 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
44443 if (V.getValueType() == MVT::v4i8) {
44444 if (ZeroExtend && Subtarget.hasSSE41()) {
44445 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
44446 DAG.getConstant(0, DL, MVT::v4i32),
44447 DAG.getBitcast(MVT::i32, V),
44448 DAG.getIntPtrConstant(0, DL));
44449 return DAG.getBitcast(MVT::v16i8, V);
44450 }
44451 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
44452 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
44453 : DAG.getUNDEF(MVT::v4i8));
44454 }
44455 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
44456 DAG.getUNDEF(MVT::v8i8));
44457 };
44458
44459 // vXi8 mul reduction - promote to vXi16 mul reduction.
44460 if (Opc == ISD::MUL) {
44461 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
44462 return SDValue();
44463 if (VecVT.getSizeInBits() >= 128) {
44464 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
44465 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
44466 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
44467 Lo = DAG.getBitcast(WideVT, Lo);
44468 Hi = DAG.getBitcast(WideVT, Hi);
44469 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
44470 while (Rdx.getValueSizeInBits() > 128) {
44471 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44472 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
44473 }
44474 } else {
44475 Rdx = WidenToV16I8(Rdx, false);
44476 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
44477 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
44478 }
44479 if (NumElts >= 8)
44480 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44481 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44482 {4, 5, 6, 7, -1, -1, -1, -1}));
44483 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44484 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44485 {2, 3, -1, -1, -1, -1, -1, -1}));
44486 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44487 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44488 {1, -1, -1, -1, -1, -1, -1, -1}));
44489 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44490 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44491 }
44492
44493 // vXi8 add reduction - sub-128-bit vector.
44494 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
44495 Rdx = WidenToV16I8(Rdx, true);
44496 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
44497 DAG.getConstant(0, DL, MVT::v16i8));
44498 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44499 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44500 }
44501
44502 // Must be a >=128-bit vector with pow2 elements.
44503 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
44504 return SDValue();
44505
44506 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
44507 if (VT == MVT::i8) {
44508 while (Rdx.getValueSizeInBits() > 128) {
44509 SDValue Lo, Hi;
44510 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44511 VecVT = Lo.getValueType();
44512 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
44513 }
44514 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
44515
44516 SDValue Hi = DAG.getVectorShuffle(
44517 MVT::v16i8, DL, Rdx, Rdx,
44518 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
44519 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
44520 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
44521 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
44522 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44523 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44524 }
44525
44526 // See if we can use vXi8 PSADBW add reduction for larger zext types.
44527 // If the source vector values are 0-255, then we can use PSADBW to
44528 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
44529 // TODO: See if it's worth avoiding vXi16/i32 truncations?
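// When every element is known to be in [0, 255] (checked below), |x - 0| == x,
// so a PSADBW against an all-zeros vector simply sums each group of 8 bytes
// into an i64 lane, giving the zero-extension to i64 for free.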
44530 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
44531 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
44532 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
44533 Subtarget.hasAVX512())) {
44534 if (Rdx.getValueType() == MVT::v8i16) {
44535 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
44536 DAG.getUNDEF(MVT::v8i16));
44537 } else {
44538 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
44539 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
44540 if (ByteVT.getSizeInBits() < 128)
44541 Rdx = WidenToV16I8(Rdx, true);
44542 }
44543
44544 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
44545 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44546 ArrayRef<SDValue> Ops) {
44547 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
44548 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
44549 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
44550 };
44551 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
44552 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
44553
44554 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
44555 while (Rdx.getValueSizeInBits() > 128) {
44556 SDValue Lo, Hi;
44557 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44558 VecVT = Lo.getValueType();
44559 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
44560 }
44561 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
44562
44563 if (NumElts > 8) {
44564 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
44565 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
44566 }
44567
44568 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
44569 Rdx = DAG.getBitcast(VecVT, Rdx);
44570 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44571 }
44572
44573 // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
44574 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
44575 return SDValue();
44576
44577 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
44578
44579 // 256-bit horizontal instructions operate on 128-bit chunks rather than
44580 // across the whole vector, so we need an extract + hop preliminary stage.
44581 // This is the only step where the operands of the hop are not the same value.
44582 // TODO: We could extend this to handle 512-bit or even longer vectors.
44583 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
44584 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
44585 unsigned NumElts = VecVT.getVectorNumElements();
44586 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
44587 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
44588 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
44589 VecVT = Rdx.getValueType();
44590 }
44591 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
44592 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
44593 return SDValue();
44594
44595 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
44596 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
44597 for (unsigned i = 0; i != ReductionSteps; ++i)
44598 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
44599
44600 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44601}
44602
44603/// Detect vector gather/scatter index generation and convert it from being a
44604/// bunch of shuffles and extracts into a somewhat faster sequence.
44605/// For i686, the best sequence is apparently storing the value and loading
44606/// scalars back, while for x64 we should use 64-bit extracts and shifts.
44607 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
44608 TargetLowering::DAGCombinerInfo &DCI,
44609 const X86Subtarget &Subtarget) {
44610 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
44611 return NewOp;
44612
44613 SDValue InputVector = N->getOperand(0);
44614 SDValue EltIdx = N->getOperand(1);
44615 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
44616
44617 EVT SrcVT = InputVector.getValueType();
44618 EVT VT = N->getValueType(0);
44619 SDLoc dl(InputVector);
44620 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
44621 unsigned NumSrcElts = SrcVT.getVectorNumElements();
44622 unsigned NumEltBits = VT.getScalarSizeInBits();
44623 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44624
44625 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
44626 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
44627
44628 // Integer Constant Folding.
44629 if (CIdx && VT.isInteger()) {
44630 APInt UndefVecElts;
44631 SmallVector<APInt, 16> EltBits;
44632 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
44633 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
44634 EltBits, /*AllowWholeUndefs*/ true,
44635 /*AllowPartialUndefs*/ false)) {
44636 uint64_t Idx = CIdx->getZExtValue();
44637 if (UndefVecElts[Idx])
44638 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
44639 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
44640 }
44641
44642 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
44643 // Improves lowering of bool masks on Rust, which splits them into a byte array.
44644 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
44645 SDValue Src = peekThroughBitcasts(InputVector);
44646 if (Src.getValueType().getScalarType() == MVT::i1 &&
44647 TLI.isTypeLegal(Src.getValueType())) {
44648 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
44649 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
44650 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
44651 return DAG.getBitcast(VT, Sub);
44652 }
44653 }
44654 }
44655
44656 if (IsPextr) {
44657 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
44658 DCI))
44659 return SDValue(N, 0);
44660
44661 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
44662 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
44663 InputVector.getOpcode() == X86ISD::PINSRW) &&
44664 InputVector.getOperand(2) == EltIdx) {
44665 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
44666 "Vector type mismatch");
44667 SDValue Scl = InputVector.getOperand(1);
44668 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
44669 return DAG.getZExtOrTrunc(Scl, dl, VT);
44670 }
44671
44672 // TODO - Remove this once we can handle the implicit zero-extension of
44673 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
44674 // combineBasicSADPattern.
44675 return SDValue();
44676 }
44677
44678 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
44679 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
44680 InputVector.getOpcode() == ISD::BITCAST &&
44681 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
44682 isNullConstant(EltIdx) && InputVector.hasOneUse())
44683 return DAG.getBitcast(VT, InputVector);
44684
44685 // Detect mmx to i32 conversion through a v2i32 elt extract.
44686 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
44687 InputVector.getOpcode() == ISD::BITCAST &&
44688 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
44689 isNullConstant(EltIdx) && InputVector.hasOneUse())
44690 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
44691 InputVector.getOperand(0));
44692
44693 // Check whether this extract is the root of a sum of absolute differences
44694 // pattern. This has to be done here because we really want it to happen
44695 // pre-legalization.
44696 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
44697 return SAD;
44698
44699 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
44700 return VPDPBUSD;
44701
44702 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
44703 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
44704 return Cmp;
44705
44706 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
44707 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
44708 return MinMax;
44709
44710 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
44711 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
44712 return V;
44713
44714 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
44715 return V;
44716
44717 if (CIdx)
44718 if (SDValue V = combineExtractFromVectorLoad(
44719 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
44720 dl, DAG, DCI))
44721 return V;
44722
44723 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
44724 // and then testing the relevant element.
44725 //
44726 // Note that we only combine extracts on the *same* result number, i.e.
44727 // t0 = merge_values a0, a1, a2, a3
44728 // i1 = extract_vector_elt t0, Constant:i64<2>
44729 // i1 = extract_vector_elt t0, Constant:i64<3>
44730 // but not
44731 // i1 = extract_vector_elt t0:1, Constant:i64<2>
44732 // since the latter would need its own MOVMSK.
44733 if (SrcVT.getScalarType() == MVT::i1) {
44734 bool IsVar = !CIdx;
44735 SmallVector<SDNode *, 16> BoolExtracts;
44736 unsigned ResNo = InputVector.getResNo();
44737 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
44738 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44739 Use->getOperand(0).getResNo() == ResNo &&
44740 Use->getValueType(0) == MVT::i1) {
44741 BoolExtracts.push_back(Use);
44742 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
44743 return true;
44744 }
44745 return false;
44746 };
44747 // TODO: Can we drop the oneuse check for constant extracts?
44748 if (all_of(InputVector->uses(), IsBoolExtract) &&
44749 (IsVar || BoolExtracts.size() > 1)) {
44750 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
44751 if (SDValue BC =
44752 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
44753 for (SDNode *Use : BoolExtracts) {
44754 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
44755 // Mask = 1 << MaskIdx
44756 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
44757 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
44758 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
44759 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
44760 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
44761 DCI.CombineTo(Use, Res);
44762 }
44763 return SDValue(N, 0);
44764 }
44765 }
44766 }
44767
44768 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
44769 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
44770 SDValue TruncSrc = InputVector.getOperand(0);
44771 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
44772 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
44773 SDValue NewExt =
44774 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
44775 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
44776 }
44777 }
44778
44779 return SDValue();
44780}
44781
44782// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
44783// This is more or less the reverse of combineBitcastvxi1.
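// For example, (v8i16 sign_extend (v8i1 bitcast (i8 X))) becomes: broadcast X
// into every i16 lane, AND lane i with (1 << i), then compare the result for
// equality with that same bit mask to give all-ones/zero lanes. For
// zero_extend, the result is additionally shifted right to leave 0/1 lanes.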
44784 static SDValue combineToExtendBoolVectorInReg(
44785 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
44786 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
44787 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
44788 Opcode != ISD::ANY_EXTEND)
44789 return SDValue();
44790 if (!DCI.isBeforeLegalizeOps())
44791 return SDValue();
44792 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
44793 return SDValue();
44794
44795 EVT SVT = VT.getScalarType();
44796 EVT InSVT = N0.getValueType().getScalarType();
44797 unsigned EltSizeInBits = SVT.getSizeInBits();
44798
44799 // Input type must be extending a bool vector (bit-casted from a scalar
44800 // integer) to legal integer types.
44801 if (!VT.isVector())
44802 return SDValue();
44803 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
44804 return SDValue();
44805 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
44806 return SDValue();
44807
44808 SDValue N00 = N0.getOperand(0);
44809 EVT SclVT = N00.getValueType();
44810 if (!SclVT.isScalarInteger())
44811 return SDValue();
44812
44813 SDValue Vec;
44814 SmallVector<int> ShuffleMask;
44815 unsigned NumElts = VT.getVectorNumElements();
44816 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
44817
44818 // Broadcast the scalar integer to the vector elements.
44819 if (NumElts > EltSizeInBits) {
44820 // If the scalar integer is greater than the vector element size, then we
44821 // must split it down into sub-sections for broadcasting. For example:
44822 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
44823 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
44824 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
44825 unsigned Scale = NumElts / EltSizeInBits;
44826 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
44827 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
44828 Vec = DAG.getBitcast(VT, Vec);
44829
44830 for (unsigned i = 0; i != Scale; ++i)
44831 ShuffleMask.append(EltSizeInBits, i);
44832 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
44833 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
44834 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
44835 // If we have register broadcast instructions, use the scalar size as the
44836 // element type for the shuffle. Then cast to the wider element type. The
44837 // widened bits won't be used, and this might allow the use of a broadcast
44838 // load.
44839 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
44840 unsigned Scale = EltSizeInBits / NumElts;
44841 EVT BroadcastVT =
44842 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
44843 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
44844 ShuffleMask.append(NumElts * Scale, 0);
44845 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
44846 Vec = DAG.getBitcast(VT, Vec);
44847 } else {
44848 // For smaller scalar integers, we can simply any-extend it to the vector
44849 // element size (we don't care about the upper bits) and broadcast it to all
44850 // elements.
44851 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
44852 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
44853 ShuffleMask.append(NumElts, 0);
44854 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
44855 }
44856
44857 // Now, mask the relevant bit in each element.
44858 SmallVector<SDValue, 32> Bits;
44859 for (unsigned i = 0; i != NumElts; ++i) {
44860 int BitIdx = (i % EltSizeInBits);
44861 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
44862 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
44863 }
44864 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
44865 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
44866
44867 // Compare against the bitmask and extend the result.
44868 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
44869 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
44870 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
44871
44872 // For SEXT, this is now done, otherwise shift the result down for
44873 // zero-extension.
44874 if (Opcode == ISD::SIGN_EXTEND)
44875 return Vec;
44876 return DAG.getNode(ISD::SRL, DL, VT, Vec,
44877 DAG.getConstant(EltSizeInBits - 1, DL, VT));
44878}
44879
44880/// If a vector select has an operand that is -1 or 0, try to simplify the
44881/// select to a bitwise logic operation.
44882/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
44883static SDValue
44884 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
44885 TargetLowering::DAGCombinerInfo &DCI,
44886 const X86Subtarget &Subtarget) {
44887 SDValue Cond = N->getOperand(0);
44888 SDValue LHS = N->getOperand(1);
44889 SDValue RHS = N->getOperand(2);
44890 EVT VT = LHS.getValueType();
44891 EVT CondVT = Cond.getValueType();
44892 SDLoc DL(N);
44893 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44894
44895 if (N->getOpcode() != ISD::VSELECT)
44896 return SDValue();
44897
44898 assert(CondVT.isVector() && "Vector select expects a vector selector!");
44899
44900 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
44901 // TODO: Can we assert that both operands are not zeros (because that should
44902 // get simplified at node creation time)?
44903 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
44904 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
44905
44906 // If both inputs are 0/undef, create a complete zero vector.
44907 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
44908 if (TValIsAllZeros && FValIsAllZeros) {
44909 if (VT.isFloatingPoint())
44910 return DAG.getConstantFP(0.0, DL, VT);
44911 return DAG.getConstant(0, DL, VT);
44912 }
44913
44914 // To use the condition operand as a bitwise mask, it must have elements that
44915 // are the same size as the select elements. I.e., the condition operand must
44916 // have already been promoted from the IR select condition type <N x i1>.
44917 // Don't check if the types themselves are equal because that excludes
44918 // vector floating-point selects.
44919 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
44920 return SDValue();
44921
44922 // Try to invert the condition if true value is not all 1s and false value is
44923 // not all 0s. Only do this if the condition has one use.
44924 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
44925 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
44926 // Check if the selector will be produced by CMPP*/PCMP*.
44927 Cond.getOpcode() == ISD::SETCC &&
44928 // Check if SETCC has already been promoted.
44929 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
44930 CondVT) {
44931 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
44932
44933 if (TValIsAllZeros || FValIsAllOnes) {
44934 SDValue CC = Cond.getOperand(2);
44935 ISD::CondCode NewCC = ISD::getSetCCInverse(
44936 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
44937 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
44938 NewCC);
44939 std::swap(LHS, RHS);
44940 TValIsAllOnes = FValIsAllOnes;
44941 FValIsAllZeros = TValIsAllZeros;
44942 }
44943 }
44944
44945 // Cond value must be 'sign splat' to be converted to a logical op.
44946 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
44947 return SDValue();
44948
44949 // vselect Cond, 111..., 000... -> Cond
44950 if (TValIsAllOnes && FValIsAllZeros)
44951 return DAG.getBitcast(VT, Cond);
44952
44953 if (!TLI.isTypeLegal(CondVT))
44954 return SDValue();
44955
44956 // vselect Cond, 111..., X -> or Cond, X
44957 if (TValIsAllOnes) {
44958 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
44959 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
44960 return DAG.getBitcast(VT, Or);
44961 }
44962
44963 // vselect Cond, X, 000... -> and Cond, X
44964 if (FValIsAllZeros) {
44965 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
44966 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
44967 return DAG.getBitcast(VT, And);
44968 }
44969
44970 // vselect Cond, 000..., X -> andn Cond, X
44971 if (TValIsAllZeros) {
44972 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
44973 SDValue AndN;
44974 // The canonical form differs for i1 vectors - X86ISD::ANDNP is not used.
44975 if (CondVT.getScalarType() == MVT::i1)
44976 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
44977 CastRHS);
44978 else
44979 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
44980 return DAG.getBitcast(VT, AndN);
44981 }
44982
44983 return SDValue();
44984}
44985
44986/// If both arms of a vector select are concatenated vectors, split the select,
44987/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
44988/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
44989/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
44990 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
44991 const X86Subtarget &Subtarget) {
44992 unsigned Opcode = N->getOpcode();
44993 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
44994 return SDValue();
44995
44996 // TODO: Split 512-bit vectors too?
44997 EVT VT = N->getValueType(0);
44998 if (!VT.is256BitVector())
44999 return SDValue();
45000
45001 // TODO: Split as long as any 2 of the 3 operands are concatenated?
45002 SDValue Cond = N->getOperand(0);
45003 SDValue TVal = N->getOperand(1);
45004 SDValue FVal = N->getOperand(2);
45005 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
45006 !isFreeToSplitVector(TVal.getNode(), DAG) ||
45007 !isFreeToSplitVector(FVal.getNode(), DAG))
45008 return SDValue();
45009
45010 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
45011 ArrayRef<SDValue> Ops) {
45012 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
45013 };
45014 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
45015 makeBlend, /*CheckBWI*/ false);
45016}
45017
45018 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
45019 SDValue Cond = N->getOperand(0);
45020 SDValue LHS = N->getOperand(1);
45021 SDValue RHS = N->getOperand(2);
45022 SDLoc DL(N);
45023
45024 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
45025 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
45026 if (!TrueC || !FalseC)
45027 return SDValue();
45028
45029 // Don't do this for crazy integer types.
45030 EVT VT = N->getValueType(0);
45031 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
45032 return SDValue();
45033
45034 // We're going to use the condition bit in math or logic ops. We could allow
45035 // this with a wider condition value (post-legalization it becomes an i8),
45036 // but if nothing is creating selects that late, it doesn't matter.
45037 if (Cond.getValueType() != MVT::i1)
45038 return SDValue();
45039
45040 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
45041 // 3, 5, or 9 with i32/i64, so those get transformed too.
45042 // TODO: For constants that overflow or do not differ by power-of-2 or small
45043 // multiplier, convert to 'and' + 'add'.
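  // e.g. "select i1 %c, i32 9, i32 4" has Diff == 5, so it becomes
  // (zext %c) * 5 + 4, and the multiply-by-5 plus add-of-4 fits a single LEA.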
45044 const APInt &TrueVal = TrueC->getAPIntValue();
45045 const APInt &FalseVal = FalseC->getAPIntValue();
45046
45047 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
45048 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
45049 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
45050 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45051 if (CC == ISD::SETEQ || CC == ISD::SETNE)
45052 return SDValue();
45053 }
45054
45055 bool OV;
45056 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
45057 if (OV)
45058 return SDValue();
45059
45060 APInt AbsDiff = Diff.abs();
45061 if (AbsDiff.isPowerOf2() ||
45062 ((VT == MVT::i32 || VT == MVT::i64) &&
45063 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
45064
45065 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
45066 // of the condition can usually be folded into a compare predicate, but even
45067 // without that, the sequence should be cheaper than a CMOV alternative.
45068 if (TrueVal.slt(FalseVal)) {
45069 Cond = DAG.getNOT(DL, Cond, MVT::i1);
45070 std::swap(TrueC, FalseC);
45071 }
45072
45073 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
45074 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
45075
45076 // Multiply condition by the difference if non-one.
45077 if (!AbsDiff.isOne())
45078 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
45079
45080 // Add the base if non-zero.
45081 if (!FalseC->isZero())
45082 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
45083
45084 return R;
45085 }
45086
45087 return SDValue();
45088}
45089
45090/// If this is a *dynamic* select (non-constant condition) and we can match
45091/// this node with one of the variable blend instructions, restructure the
45092/// condition so that blends can use the high (sign) bit of each element.
45093/// This function will also call SimplifyDemandedBits on already created
45094/// BLENDV to perform additional simplifications.
45095static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
45096 TargetLowering::DAGCombinerInfo &DCI,
45097 const X86Subtarget &Subtarget) {
45098 SDValue Cond = N->getOperand(0);
45099 if ((N->getOpcode() != ISD::VSELECT &&
45100 N->getOpcode() != X86ISD::BLENDV) ||
45101 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
45102 return SDValue();
45103
45104 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45105 unsigned BitWidth = Cond.getScalarValueSizeInBits();
45106 EVT VT = N->getValueType(0);
45107
45108 // We can only handle the cases where VSELECT is directly legal on the
45109 // subtarget. We custom lower VSELECT nodes with constant conditions and
45110 // this makes it hard to see whether a dynamic VSELECT will correctly
45111 // lower, so we both check the operation's status and explicitly handle the
45112 // cases where a *dynamic* blend will fail even though a constant-condition
45113 // blend could be custom lowered.
45114 // FIXME: We should find a better way to handle this class of problems.
45115 // Potentially, we should combine constant-condition vselect nodes
45116 // pre-legalization into shuffles and not mark as many types as custom
45117 // lowered.
45119 return SDValue();
45120 // FIXME: We don't support i16-element blends currently. We could and
45121 // should support them by making *all* the bits in the condition be set
45122 // rather than just the high bit and using an i8-element blend.
45123 if (VT.getVectorElementType() == MVT::i16)
45124 return SDValue();
45125 // Dynamic blending was only available from SSE4.1 onward.
45126 if (VT.is128BitVector() && !Subtarget.hasSSE41())
45127 return SDValue();
45128 // Byte blends are only available in AVX2
45129 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
45130 return SDValue();
45131 // There are no 512-bit blend instructions that use sign bits.
45132 if (VT.is512BitVector())
45133 return SDValue();
45134
45135 // Don't optimize before the condition has been transformed to a legal type
45136 // and don't ever optimize vector selects that map to AVX512 mask-registers.
45137 if (BitWidth < 8 || BitWidth > 64)
45138 return SDValue();
45139
45140 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
45141 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
45142 UI != UE; ++UI)
45143 if ((UI->getOpcode() != ISD::VSELECT &&
45144 UI->getOpcode() != X86ISD::BLENDV) ||
45145 UI.getOperandNo() != 0)
45146 return false;
45147
45148 return true;
45149 };
45150
45151 APInt DemandedBits(APInt::getSignMask(BitWidth));
45152
45153 if (OnlyUsedAsSelectCond(Cond)) {
45154 KnownBits Known;
45155 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
45156 !DCI.isBeforeLegalizeOps());
45157 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
45158 return SDValue();
45159
45160 // If we changed the computation somewhere in the DAG, this change will
45161 // affect all users of Cond. Update all the nodes so that we do not use
45162 // the generic VSELECT anymore. Otherwise, we may perform wrong
45163 // optimizations as we messed with the actual expectation for the vector
45164 // boolean values.
45165 for (SDNode *U : Cond->uses()) {
45166 if (U->getOpcode() == X86ISD::BLENDV)
45167 continue;
45168
45169 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
45170 Cond, U->getOperand(1), U->getOperand(2));
45171 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
45172 DCI.AddToWorklist(U);
45173 }
45174 DCI.CommitTargetLoweringOpt(TLO);
45175 return SDValue(N, 0);
45176 }
45177
45178 // Otherwise we can still at least try to simplify multiple use bits.
45179 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
45180 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
45181 N->getOperand(1), N->getOperand(2));
45182
45183 return SDValue();
45184}
45185
45186// Try to match:
45187// (or (and (M, (sub 0, X)), (pandn M, X)))
45188// which is a special case of:
45189// (select M, (sub 0, X), X)
45190// Per:
45191// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
45192// We know that, if fNegate is 0 or 1:
45193// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
45194//
45195// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
45196// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
45197// ( M ? -X : X) == ((X ^ M ) + (M & 1))
45198// This lets us transform our vselect to:
45199// (add (xor X, M), (and M, 1))
45200// And further to:
45201// (sub (xor X, M), M)
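// e.g. when M is all-ones: (X ^ M) - M == ~X + 1 == -X; when M is zero:
// (X ^ 0) - 0 == X, so the sub form covers both arms of the select.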
45202static SDValue combineLogicBlendIntoConditionalNegate(
45203 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
45204 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
45205 EVT MaskVT = Mask.getValueType();
45206 assert(MaskVT.isInteger() &&
45207 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
45208 "Mask must be zero/all-bits");
45209
45210 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
45211 return SDValue();
45212 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
45213 return SDValue();
45214
45215 auto IsNegV = [](SDNode *N, SDValue V) {
45216 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
45217 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
45218 };
45219
45220 SDValue V;
45221 if (IsNegV(Y.getNode(), X))
45222 V = X;
45223 else if (IsNegV(X.getNode(), Y))
45224 V = Y;
45225 else
45226 return SDValue();
45227
45228 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
45229 SDValue SubOp2 = Mask;
45230
45231 // If the negate was on the false side of the select, then
45232 // the operands of the SUB need to be swapped. PR 27251.
45233 // This is because the pattern being matched above is
45234 // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
45235 // but if the pattern matched was
45236 // (vselect M, X, (sub 0, X)), that is really a negation of the pattern
45237 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
45238 // pattern also needs to be a negation of the replacement pattern above.
45239 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
45240 // sub accomplishes the negation of the replacement pattern.
45241 if (V == Y)
45242 std::swap(SubOp1, SubOp2);
45243
45244 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
45245 return DAG.getBitcast(VT, Res);
45246}
45247
45248static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG,
45249 const X86Subtarget &Subtarget) {
45250 if (!Subtarget.hasAVX512())
45251 return SDValue();
45252 if (N->getOpcode() != ISD::VSELECT)
45253 return SDValue();
45254
45255 SDLoc DL(N);
45256 SDValue Cond = N->getOperand(0);
45257 SDValue LHS = N->getOperand(1);
45258 SDValue RHS = N->getOperand(2);
45259
45260 if (canCombineAsMaskOperation(LHS, Subtarget))
45261 return SDValue();
45262
45263 if (!canCombineAsMaskOperation(RHS, Subtarget))
45264 return SDValue();
45265
45266 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
45267 return SDValue();
45268
45269 // Commute LHS and RHS to create an opportunity to select a mask instruction.
45270 // (vselect M, L, R) -> (vselect ~M, R, L)
45271 ISD::CondCode NewCC =
45272 ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
45273 Cond.getOperand(0).getValueType());
45274 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0),
45275 Cond.getOperand(1), NewCC);
45276 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
45277}
45278
45279/// Do target-specific dag combines on SELECT and VSELECT nodes.
45280static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
45281 TargetLowering::DAGCombinerInfo &DCI,
45282 const X86Subtarget &Subtarget) {
45283 SDLoc DL(N);
45284 SDValue Cond = N->getOperand(0);
45285 SDValue LHS = N->getOperand(1);
45286 SDValue RHS = N->getOperand(2);
45287
45288 // Try simplification again because we use this function to optimize
45289 // BLENDV nodes that are not handled by the generic combiner.
45290 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
45291 return V;
45292
45293 // When AVX512 is available, the LHS operand of a select instruction can be
45294 // folded with a mask instruction, while the RHS operand can't. Commute the
45295 // LHS and RHS of the select instruction to create the opportunity for
45296 // folding.
45297 if (SDValue V = commuteSelect(N, DAG, Subtarget))
45298 return V;
45299
45300 EVT VT = LHS.getValueType();
45301 EVT CondVT = Cond.getValueType();
45302 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45303 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
45304
45305 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
45306 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
45307 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
45308 if (CondVT.isVector() && CondVT.isInteger() &&
45309 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
45310 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
45311 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
45312 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
45313 DL, DAG, Subtarget))
45314 return V;
45315
45316 // Convert vselects with constant condition into shuffles.
45317 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
45318 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
45319 SmallVector<int, 64> Mask;
45320 if (createShuffleMaskFromVSELECT(Mask, Cond,
45321 N->getOpcode() == X86ISD::BLENDV))
45322 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
45323 }
45324
45325 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
45326 // by forcing the unselected elements to zero.
45327 // TODO: Can we handle more shuffles with this?
45328 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
45329 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
45330 LHS.hasOneUse() && RHS.hasOneUse()) {
45331 MVT SimpleVT = VT.getSimpleVT();
45332 SmallVector<SDValue, 1> LHSOps, RHSOps;
45333 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
45334 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
45335 getTargetShuffleMask(LHS, true, LHSOps, LHSMask) &&
45336 getTargetShuffleMask(RHS, true, RHSOps, RHSMask)) {
45337 int NumElts = VT.getVectorNumElements();
45338 for (int i = 0; i != NumElts; ++i) {
45339 // getConstVector sets negative shuffle mask values as undef, so ensure
45340 // we hardcode SM_SentinelZero values to zero (0x80).
45341 if (CondMask[i] < NumElts) {
45342 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
45343 RHSMask[i] = 0x80;
45344 } else {
45345 LHSMask[i] = 0x80;
45346 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
45347 }
45348 }
45349 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
45350 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
45351 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
45352 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
45353 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
45354 }
45355 }
45356
45357 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
45358 // instructions match the semantics of the common C idiom x<y?x:y but not
45359 // x<=y?x:y, because of how they handle negative zero (which can be
45360 // ignored in unsafe-math mode).
45361 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
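  // e.g. the hardware min returns its second operand when the operands compare
  // equal or unordered: min(-0.0, +0.0) == +0.0 and min(x, NaN) == NaN, which
  // matches "x < y ? x : y" exactly but not "x <= y ? x : y".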
45362 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
45363 VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) &&
45364 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
45365 (Subtarget.hasSSE2() ||
45366 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
45367 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45368
45369 unsigned Opcode = 0;
45370 // Check for x CC y ? x : y.
45371 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
45372 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
45373 switch (CC) {
45374 default: break;
45375 case ISD::SETULT:
45376 // Converting this to a min would handle NaNs incorrectly, and swapping
45377 // the operands would cause it to handle comparisons between positive
45378 // and negative zero incorrectly.
45379 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45380 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45381 !(DAG.isKnownNeverZeroFloat(LHS) ||
45382 DAG.isKnownNeverZeroFloat(RHS)))
45383 break;
45384 std::swap(LHS, RHS);
45385 }
45386 Opcode = X86ISD::FMIN;
45387 break;
45388 case ISD::SETOLE:
45389 // Converting this to a min would handle comparisons between positive
45390 // and negative zero incorrectly.
45391 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45392 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45393 break;
45394 Opcode = X86ISD::FMIN;
45395 break;
45396 case ISD::SETULE:
45397 // Converting this to a min would handle both negative zeros and NaNs
45398 // incorrectly, but we can swap the operands to fix both.
45399 std::swap(LHS, RHS);
45400 [[fallthrough]];
45401 case ISD::SETOLT:
45402 case ISD::SETLT:
45403 case ISD::SETLE:
45404 Opcode = X86ISD::FMIN;
45405 break;
45406
45407 case ISD::SETOGE:
45408 // Converting this to a max would handle comparisons between positive
45409 // and negative zero incorrectly.
45410 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45411 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45412 break;
45413 Opcode = X86ISD::FMAX;
45414 break;
45415 case ISD::SETUGT:
45416 // Converting this to a max would handle NaNs incorrectly, and swapping
45417 // the operands would cause it to handle comparisons between positive
45418 // and negative zero incorrectly.
45419 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45420 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45421 !(DAG.isKnownNeverZeroFloat(LHS) ||
45422 DAG.isKnownNeverZeroFloat(RHS)))
45423 break;
45424 std::swap(LHS, RHS);
45425 }
45426 Opcode = X86ISD::FMAX;
45427 break;
45428 case ISD::SETUGE:
45429 // Converting this to a max would handle both negative zeros and NaNs
45430 // incorrectly, but we can swap the operands to fix both.
45431 std::swap(LHS, RHS);
45432 [[fallthrough]];
45433 case ISD::SETOGT:
45434 case ISD::SETGT:
45435 case ISD::SETGE:
45436 Opcode = X86ISD::FMAX;
45437 break;
45438 }
45439 // Check for x CC y ? y : x -- a min/max with reversed arms.
45440 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
45441 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
45442 switch (CC) {
45443 default: break;
45444 case ISD::SETOGE:
45445 // Converting this to a min would handle comparisons between positive
45446 // and negative zero incorrectly, and swapping the operands would
45447 // cause it to handle NaNs incorrectly.
45448 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45449 !(DAG.isKnownNeverZeroFloat(LHS) ||
45450 DAG.isKnownNeverZeroFloat(RHS))) {
45451 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45452 break;
45453 std::swap(LHS, RHS);
45454 }
45455 Opcode = X86ISD::FMIN;
45456 break;
45457 case ISD::SETUGT:
45458 // Converting this to a min would handle NaNs incorrectly.
45459 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45460 break;
45461 Opcode = X86ISD::FMIN;
45462 break;
45463 case ISD::SETUGE:
45464 // Converting this to a min would handle both negative zeros and NaNs
45465 // incorrectly, but we can swap the operands to fix both.
45466 std::swap(LHS, RHS);
45467 [[fallthrough]];
45468 case ISD::SETOGT:
45469 case ISD::SETGT:
45470 case ISD::SETGE:
45471 Opcode = X86ISD::FMIN;
45472 break;
45473
45474 case ISD::SETULT:
45475 // Converting this to a max would handle NaNs incorrectly.
45476 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45477 break;
45478 Opcode = X86ISD::FMAX;
45479 break;
45480 case ISD::SETOLE:
45481 // Converting this to a max would handle comparisons between positive
45482 // and negative zero incorrectly, and swapping the operands would
45483 // cause it to handle NaNs incorrectly.
45484 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45485 !DAG.isKnownNeverZeroFloat(LHS) &&
45486 !DAG.isKnownNeverZeroFloat(RHS)) {
45487 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45488 break;
45489 std::swap(LHS, RHS);
45490 }
45491 Opcode = X86ISD::FMAX;
45492 break;
45493 case ISD::SETULE:
45494 // Converting this to a max would handle both negative zeros and NaNs
45495 // incorrectly, but we can swap the operands to fix both.
45496 std::swap(LHS, RHS);
45497 [[fallthrough]];
45498 case ISD::SETOLT:
45499 case ISD::SETLT:
45500 case ISD::SETLE:
45501 Opcode = X86ISD::FMAX;
45502 break;
45503 }
45504 }
45505
45506 if (Opcode)
45507 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
45508 }
45509
45510 // Some mask scalar intrinsics rely on checking if only one bit is set
45511 // and implement it in C code like this:
45512 // A[0] = (U & 1) ? A[0] : W[0];
45513 // This creates some redundant instructions that break pattern matching.
45514 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
45515 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
45516 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
45517 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45518 SDValue AndNode = Cond.getOperand(0);
45519 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
45520 isNullConstant(Cond.getOperand(1)) &&
45521 isOneConstant(AndNode.getOperand(1))) {
45522 // LHS and RHS are swapped because the setcc outputs 1 when the AND
45523 // result is 0, and vice versa.
45524 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
45525 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
45526 }
45527 }
45528
45529 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
45530 // lowering on KNL. In this case we convert it to
45531 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
45532 // The same situation holds for all vectors of i8 and i16 without BWI.
45533 // Make sure we extend these even before type legalization gets a chance to
45534 // split wide vectors.
45535 // Since SKX, these selects have a proper lowering.
45536 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
45537 CondVT.getVectorElementType() == MVT::i1 &&
45538 (VT.getVectorElementType() == MVT::i8 ||
45539 VT.getVectorElementType() == MVT::i16)) {
45540 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
45541 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
45542 }
45543
45544 // AVX512 - Extend select with zero to merge with target shuffle.
45545 // select(mask, extract_subvector(shuffle(x)), zero) -->
45546 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
45547 // TODO - support non target shuffles as well.
45548 if (Subtarget.hasAVX512() && CondVT.isVector() &&
45549 CondVT.getVectorElementType() == MVT::i1) {
45550 auto SelectableOp = [&TLI](SDValue Op) {
45551 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45552 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
45553 isNullConstant(Op.getOperand(1)) &&
45554 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
45555 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
45556 };
45557
45558 bool SelectableLHS = SelectableOp(LHS);
45559 bool SelectableRHS = SelectableOp(RHS);
45560 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
45561 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
45562
45563 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
45564 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
45565 : RHS.getOperand(0).getValueType();
45566 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
45567 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
45568 VT.getSizeInBits());
45569 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
45570 VT.getSizeInBits());
45571 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
45572 DAG.getUNDEF(SrcCondVT), Cond,
45573 DAG.getIntPtrConstant(0, DL));
45574 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
45575 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
45576 }
45577 }
45578
45579 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
45580 return V;
45581
45582 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
45583 Cond.hasOneUse()) {
45584 EVT CondVT = Cond.getValueType();
45585 SDValue Cond0 = Cond.getOperand(0);
45586 SDValue Cond1 = Cond.getOperand(1);
45587 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45588
45589 // Canonicalize min/max:
45590 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
45591 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
45592 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
45593 // the need for an extra compare against zero. e.g.
45594 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
45595 // subl %esi, %edi
45596 // testl %edi, %edi
45597 // movl $0, %eax
45598 // cmovgl %edi, %eax
45599 // =>
45600 // xorl %eax, %eax
45601 // subl %esi, %edi
45602 // cmovsl %eax, %edi
45603 //
45604 // We can also canonicalize
45605 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
45606 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
45607 // This allows the use of a test instruction for the compare.
45608 if (LHS == Cond0 && RHS == Cond1) {
45609 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
45610 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
45611 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
45612 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
45613 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
45614 }
45615 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
45616 ISD::CondCode NewCC = ISD::SETUGE;
45617 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
45618 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
45619 }
45620 }
45621
45622 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
45623 // fold eq + gt/lt nested selects into ge/le selects
45624 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
45625 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
45626 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
45627 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
45628 // .. etc ..
45629 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
45630 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
45631 SDValue InnerSetCC = RHS.getOperand(0);
45632 ISD::CondCode InnerCC =
45633 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
45634 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
45635 Cond0 == InnerSetCC.getOperand(0) &&
45636 Cond1 == InnerSetCC.getOperand(1)) {
45637 ISD::CondCode NewCC;
45638 switch (CC == ISD::SETEQ ? InnerCC : CC) {
45639 // clang-format off
45640 case ISD::SETGT: NewCC = ISD::SETGE; break;
45641 case ISD::SETLT: NewCC = ISD::SETLE; break;
45642 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
45643 case ISD::SETULT: NewCC = ISD::SETULE; break;
45644 default: NewCC = ISD::SETCC_INVALID; break;
45645 // clang-format on
45646 }
45647 if (NewCC != ISD::SETCC_INVALID) {
45648 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
45649 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
45650 }
45651 }
45652 }
45653 }
45654
45655 // Check if the first operand is all zeros and Cond type is vXi1.
45656 // If this an avx512 target we can improve the use of zero masking by
45657 // swapping the operands and inverting the condition.
45658 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
45659 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
45660 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
45661 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
45662 // Invert the cond to not(cond) : xor(op,allones)=not(op)
45663 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
45664 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
45665 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
45666 }
45667
45668 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
45669 // get split by legalization.
45670 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
45671 CondVT.getVectorElementType() == MVT::i1 &&
45672 TLI.isTypeLegal(VT.getScalarType())) {
45673 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
45674 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
45675 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
45676 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
45677 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
45678 }
45679 }
45680
45681 // Early exit check
45682 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
45683 return SDValue();
45684
45685 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
45686 return V;
45687
45688 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
45689 return V;
45690
45691 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
45692 return V;
45693
45694 // select(~Cond, X, Y) -> select(Cond, Y, X)
45695 if (CondVT.getScalarType() != MVT::i1) {
45696 if (SDValue CondNot = IsNOT(Cond, DAG))
45697 return DAG.getNode(N->getOpcode(), DL, VT,
45698 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
45699
45700 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
45701 // signbit.
45702 if (Cond.getOpcode() == X86ISD::PCMPGT &&
45703 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
45704 Cond.hasOneUse()) {
45705 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
45706 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
45707 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
45708 }
45709 }
45710
45711 // Try to optimize vXi1 selects if both operands are either all constants or
45712 // bitcasts from scalar integer type. In that case we can convert the operands
45713 // to integer and use an integer select which will be converted to a CMOV.
45714 // We need to take a little bit of care to avoid creating an i64 type after
45715 // type legalization.
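  // e.g. (select i1 %c, (v8i1 bitcast i8 %a), (v8i1 bitcast i8 %b)) becomes a
  // bitcast of (select i1 %c, i8 %a, i8 %b), which can lower to a single CMOV.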
45716 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
45717 VT.getVectorElementType() == MVT::i1 &&
45718 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
45719 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
45720 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
45721 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
45722 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
45723
45724 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
45725 LHS.getOperand(0).getValueType() == IntVT)) &&
45726 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
45727 RHS.getOperand(0).getValueType() == IntVT))) {
45728 if (LHSIsConst)
45729 LHS = combinevXi1ConstantToInteger(LHS, DAG);
45730 else
45731 LHS = LHS.getOperand(0);
45732
45733 if (RHSIsConst)
45734 RHS = combinevXi1ConstantToInteger(RHS, DAG);
45735 else
45736 RHS = RHS.getOperand(0);
45737
45738 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
45739 return DAG.getBitcast(VT, Select);
45740 }
45741 }
45742 }
45743
45744 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
45745 // single bits, then invert the predicate and swap the select operands.
45746 // This can lower using a vector shift bit-hack rather than mask and compare.
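  // e.g. for a v4i32 lane whose mask constant is 0x10 (bit 4), the lane gets
  // shifted left by 31 - 4 == 27 so that bit lands in the sign bit, and the
  // blend then keys off the sign bit instead of a separate compare.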
45747 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
45748 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
45749 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
45750 Cond.getOperand(0).getOpcode() == ISD::AND &&
45751 isNullOrNullSplat(Cond.getOperand(1)) &&
45752 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
45753 Cond.getOperand(0).getValueType() == VT) {
45754 // The 'and' mask must be composed of power-of-2 constants.
45755 SDValue And = Cond.getOperand(0);
45756 auto *C = isConstOrConstSplat(And.getOperand(1));
45757 if (C && C->getAPIntValue().isPowerOf2()) {
45758 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
45759 SDValue NotCond =
45760 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
45761 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
45762 }
45763
45764 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
45765 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
45766 // 16-bit lacks a proper blendv.
45767 unsigned EltBitWidth = VT.getScalarSizeInBits();
45768 bool CanShiftBlend =
45769 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
45770 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
45771 (Subtarget.hasXOP()));
45772 if (CanShiftBlend &&
45773 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
45774 return C->getAPIntValue().isPowerOf2();
45775 })) {
45776 // Create a left-shift constant to get the mask bits over to the sign-bit.
45777 SDValue Mask = And.getOperand(1);
45778 SmallVector<int, 32> ShlVals;
45779 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
45780 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
45781 ShlVals.push_back(EltBitWidth - 1 -
45782 MaskVal->getAPIntValue().exactLogBase2());
45783 }
45784 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
45785 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
45786 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
45787 SDValue NewCond =
45788 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
45789 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
45790 }
45791 }
45792
45793 return SDValue();
45794}
45795
45796/// Combine:
45797/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
45798/// to:
45799/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
45800/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
45801/// Note that this is only legal for some op/cc combinations.
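/// e.g. "if (atomic_fetch_add(&v, 1) < 0)" can branch on the EFLAGS of
/// "lock addl $1, (v)" with JLE, since "old value < 0" is equivalent to
/// "incremented value <= 0" once overflow is accounted for by the flags.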
45802static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
45803 SelectionDAG &DAG,
45804 const X86Subtarget &Subtarget) {
45805 // This combine only operates on CMP-like nodes.
45806 if (!(Cmp.getOpcode() == X86ISD::CMP ||
45807 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
45808 return SDValue();
45809
45810 // Can't replace the cmp if it has more uses than the one we're looking at.
45811 // FIXME: We would like to be able to handle this, but would need to make sure
45812 // all uses were updated.
45813 if (!Cmp.hasOneUse())
45814 return SDValue();
45815
45816 // This only applies to variations of the common case:
45817 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
45818 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
45819 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
45820 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
45821 // Using the proper condcodes (see below), overflow is checked for.
45822
45823 // FIXME: We can generalize both constraints:
45824 // - XOR/OR/AND (if they were made to survive AtomicExpand)
45825 // - LHS != 1
45826 // if the result is compared.
45827
45828 SDValue CmpLHS = Cmp.getOperand(0);
45829 SDValue CmpRHS = Cmp.getOperand(1);
45830 EVT CmpVT = CmpLHS.getValueType();
45831
45832 if (!CmpLHS.hasOneUse())
45833 return SDValue();
45834
45835 unsigned Opc = CmpLHS.getOpcode();
45836 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
45837 return SDValue();
45838
45839 SDValue OpRHS = CmpLHS.getOperand(2);
45840 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
45841 if (!OpRHSC)
45842 return SDValue();
45843
45844 APInt Addend = OpRHSC->getAPIntValue();
45845 if (Opc == ISD::ATOMIC_LOAD_SUB)
45846 Addend = -Addend;
45847
45848 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
45849 if (!CmpRHSC)
45850 return SDValue();
45851
45852 APInt Comparison = CmpRHSC->getAPIntValue();
45853 APInt NegAddend = -Addend;
45854
45855 // See if we can adjust the CC to make the comparison match the negated
45856 // addend.
45857 if (Comparison != NegAddend) {
45858 APInt IncComparison = Comparison + 1;
45859 if (IncComparison == NegAddend) {
45860 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
45861 Comparison = IncComparison;
45862 CC = X86::COND_AE;
45863 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
45864 Comparison = IncComparison;
45865 CC = X86::COND_L;
45866 }
45867 }
45868 APInt DecComparison = Comparison - 1;
45869 if (DecComparison == NegAddend) {
45870 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
45871 Comparison = DecComparison;
45872 CC = X86::COND_A;
45873 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
45874 Comparison = DecComparison;
45875 CC = X86::COND_LE;
45876 }
45877 }
45878 }
45879
45880 // If the addend is the negation of the comparison value, then we can do
45881 // a full comparison by emitting the atomic arithmetic as a locked sub.
45882 if (Comparison == NegAddend) {
45883 // The CC is fine, but we need to rewrite the LHS of the comparison as an
45884 // atomic sub.
45885 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
45886 auto AtomicSub = DAG.getAtomic(
45887 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
45888 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
45889 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
45890 AN->getMemOperand());
45891 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
45892 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
45893 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
45894 return LockOp;
45895 }
45896
45897 // We can handle comparisons with zero in a number of cases by manipulating
45898 // the CC used.
45899 if (!Comparison.isZero())
45900 return SDValue();
45901
45902 if (CC == X86::COND_S && Addend == 1)
45903 CC = X86::COND_LE;
45904 else if (CC == X86::COND_NS && Addend == 1)
45905 CC = X86::COND_G;
45906 else if (CC == X86::COND_G && Addend == -1)
45907 CC = X86::COND_GE;
45908 else if (CC == X86::COND_LE && Addend == -1)
45909 CC = X86::COND_L;
45910 else
45911 return SDValue();
45912
45913 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
45914 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
45915 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
45916 return LockOp;
45917}
45918
45919// Check whether a boolean test is testing a boolean value generated by
45920// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
45921// code.
45922//
45923// Simplify the following patterns:
45924// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
45925// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
45926// to (Op EFLAGS Cond)
45927//
45928// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
45929// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
45930// to (Op EFLAGS !Cond)
45931//
45932// where Op could be BRCOND or CMOV.
45933//
45934static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
45935 // This combine only operates on CMP-like nodes.
45936 if (!(Cmp.getOpcode() == X86ISD::CMP ||
45937 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
45938 return SDValue();
45939
45940 // Quit if not used as a boolean value.
45941 if (CC != X86::COND_E && CC != X86::COND_NE)
45942 return SDValue();
45943
45944 // Check CMP operands. One of them should be 0 or 1 and the other should be
45945 // an SetCC or extended from it.
45946 SDValue Op1 = Cmp.getOperand(0);
45947 SDValue Op2 = Cmp.getOperand(1);
45948
45949 SDValue SetCC;
45950 const ConstantSDNode* C = nullptr;
45951 bool needOppositeCond = (CC == X86::COND_E);
45952 bool checkAgainstTrue = false; // Is it a comparison against 1?
45953
45954 if ((C = dyn_cast<ConstantSDNode>(Op1)))
45955 SetCC = Op2;
45956 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
45957 SetCC = Op1;
45958 else // Quit if neither operand is a constant.
45959 return SDValue();
45960
45961 if (C->getZExtValue() == 1) {
45962 needOppositeCond = !needOppositeCond;
45963 checkAgainstTrue = true;
45964 } else if (C->getZExtValue() != 0)
45965 // Quit if the constant is neither 0 nor 1.
45966 return SDValue();
45967
45968 bool truncatedToBoolWithAnd = false;
45969 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
45970 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
45971 SetCC.getOpcode() == ISD::TRUNCATE ||
45972 SetCC.getOpcode() == ISD::AND) {
45973 if (SetCC.getOpcode() == ISD::AND) {
45974 int OpIdx = -1;
45975 if (isOneConstant(SetCC.getOperand(0)))
45976 OpIdx = 1;
45977 if (isOneConstant(SetCC.getOperand(1)))
45978 OpIdx = 0;
45979 if (OpIdx < 0)
45980 break;
45981 SetCC = SetCC.getOperand(OpIdx);
45982 truncatedToBoolWithAnd = true;
45983 } else
45984 SetCC = SetCC.getOperand(0);
45985 }
45986
45987 switch (SetCC.getOpcode()) {
45988 case X86ISD::SETCC_CARRY:
45989 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
45990 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
45991 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
45992 // truncated to i1 using 'and'.
45993 if (checkAgainstTrue && !truncatedToBoolWithAnd)
45994 break;
45995 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
45996 "Invalid use of SETCC_CARRY!");
45997 [[fallthrough]];
45998 case X86ISD::SETCC:
45999 // Set the condition code or opposite one if necessary.
46000 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
46001 if (needOppositeCond)
46002 CC = X86::GetOppositeBranchCondition(CC);
46003 return SetCC.getOperand(1);
46004 case X86ISD::CMOV: {
46005 // Check whether false/true value has canonical one, i.e. 0 or 1.
46006 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
46007 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
46008 // Quit if true value is not a constant.
46009 if (!TVal)
46010 return SDValue();
46011 // Quit if false value is not a constant.
46012 if (!FVal) {
46013 SDValue Op = SetCC.getOperand(0);
46014 // Skip 'zext' or 'trunc' node.
46015 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
46016 Op.getOpcode() == ISD::TRUNCATE)
46017 Op = Op.getOperand(0);
46018 // A special case for rdrand/rdseed, where 0 is set if false cond is
46019 // found.
46020 if ((Op.getOpcode() != X86ISD::RDRAND &&
46021 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
46022 return SDValue();
46023 }
46024 // Quit if false value is not the constant 0 or 1.
46025 bool FValIsFalse = true;
46026 if (FVal && FVal->getZExtValue() != 0) {
46027 if (FVal->getZExtValue() != 1)
46028 return SDValue();
46029 // If FVal is 1, opposite cond is needed.
46030 needOppositeCond = !needOppositeCond;
46031 FValIsFalse = false;
46032 }
46033 // Quit if TVal is not the constant opposite of FVal.
46034 if (FValIsFalse && TVal->getZExtValue() != 1)
46035 return SDValue();
46036 if (!FValIsFalse && TVal->getZExtValue() != 0)
46037 return SDValue();
46038 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
46039 if (needOppositeCond)
46040 CC = X86::GetOppositeBranchCondition(CC);
46041 return SetCC.getOperand(3);
46042 }
46043 }
46044
46045 return SDValue();
46046}
46047
46048/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
46049/// Match:
46050/// (X86or (X86setcc) (X86setcc))
46051/// (X86cmp (and (X86setcc) (X86setcc)), 0)
46052static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
46053 X86::CondCode &CC1, SDValue &Flags,
46054 bool &isAnd) {
46055 if (Cond->getOpcode() == X86ISD::CMP) {
46056 if (!isNullConstant(Cond->getOperand(1)))
46057 return false;
46058
46059 Cond = Cond->getOperand(0);
46060 }
46061
46062 isAnd = false;
46063
46064 SDValue SetCC0, SetCC1;
46065 switch (Cond->getOpcode()) {
46066 default: return false;
46067 case ISD::AND:
46068 case X86ISD::AND:
46069 isAnd = true;
46070 [[fallthrough]];
46071 case ISD::OR:
46072 case X86ISD::OR:
46073 SetCC0 = Cond->getOperand(0);
46074 SetCC1 = Cond->getOperand(1);
46075 break;
46076 };
46077
46078 // Make sure we have SETCC nodes, using the same flags value.
46079 if (SetCC0.getOpcode() != X86ISD::SETCC ||
46080 SetCC1.getOpcode() != X86ISD::SETCC ||
46081 SetCC0->getOperand(1) != SetCC1->getOperand(1))
46082 return false;
46083
46084 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
46085 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
46086 Flags = SetCC0->getOperand(1);
46087 return true;
46088}
46089
46090// When legalizing carry, we create carries via add X, -1
46091// If that comes from an actual carry, via setcc, we use the
46092// carry directly.
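// e.g. (add X, -1) sets CF exactly when X is nonzero, so if X came from a
// setcc of the carry flag we can reuse that original flag directly.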
46093static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
46094 if (EFLAGS.getOpcode() == X86ISD::ADD) {
46095 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
46096 bool FoundAndLSB = false;
46097 SDValue Carry = EFLAGS.getOperand(0);
46098 while (Carry.getOpcode() == ISD::TRUNCATE ||
46099 Carry.getOpcode() == ISD::ZERO_EXTEND ||
46100 (Carry.getOpcode() == ISD::AND &&
46101 isOneConstant(Carry.getOperand(1)))) {
46102 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
46103 Carry = Carry.getOperand(0);
46104 }
46105 if (Carry.getOpcode() == X86ISD::SETCC ||
46106 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
46107 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
46108 uint64_t CarryCC = Carry.getConstantOperandVal(0);
46109 SDValue CarryOp1 = Carry.getOperand(1);
46110 if (CarryCC == X86::COND_B)
46111 return CarryOp1;
46112 if (CarryCC == X86::COND_A) {
46113 // Try to convert COND_A into COND_B in an attempt to facilitate
46114 // materializing "setb reg".
46115 //
46116 // Do not flip "e > c", where "c" is a constant, because Cmp
46117 // instruction cannot take an immediate as its first operand.
46118 //
46119 if (CarryOp1.getOpcode() == X86ISD::SUB &&
46120 CarryOp1.getNode()->hasOneUse() &&
46121 CarryOp1.getValueType().isInteger() &&
46122 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
46123 SDValue SubCommute =
46124 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
46125 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
46126 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
46127 }
46128 }
46129 // If this is a check of the z flag of an add with 1, switch to the
46130 // C flag.
46131 if (CarryCC == X86::COND_E &&
46132 CarryOp1.getOpcode() == X86ISD::ADD &&
46133 isOneConstant(CarryOp1.getOperand(1)))
46134 return CarryOp1;
46135 } else if (FoundAndLSB) {
46136 SDLoc DL(Carry);
46137 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
46138 if (Carry.getOpcode() == ISD::SRL) {
46139 BitNo = Carry.getOperand(1);
46140 Carry = Carry.getOperand(0);
46141 }
46142 return getBT(Carry, BitNo, DL, DAG);
46143 }
46144 }
46145 }
46146
46147 return SDValue();
46148}
46149
46150/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
46151/// to avoid the inversion.
46152static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
46153 SelectionDAG &DAG,
46154 const X86Subtarget &Subtarget) {
46155 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
46156 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
46157 EFLAGS.getOpcode() != X86ISD::TESTP)
46158 return SDValue();
46159
46160 // PTEST/TESTP sets EFLAGS as:
46161 // TESTZ: ZF = (Op0 & Op1) == 0
46162 // TESTC: CF = (~Op0 & Op1) == 0
46163 // TESTNZC: ZF == 0 && CF == 0
46164 MVT VT = EFLAGS.getSimpleValueType();
46165 SDValue Op0 = EFLAGS.getOperand(0);
46166 SDValue Op1 = EFLAGS.getOperand(1);
46167 MVT OpVT = Op0.getSimpleValueType();
46168 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46169
46170 // TEST*(~X,Y) == TEST*(X,Y)
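  // e.g. TESTZ(~X,Y) computes ZF = (~X & Y) == 0, which is exactly the CF that
  // TESTC(X,Y) computes, so the condition code is remapped testz <-> testc.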
46171 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
46172 X86::CondCode InvCC;
46173 switch (CC) {
46174 case X86::COND_B:
46175 // testc -> testz.
46176 InvCC = X86::COND_E;
46177 break;
46178 case X86::COND_AE:
46179 // !testc -> !testz.
46180 InvCC = X86::COND_NE;
46181 break;
46182 case X86::COND_E:
46183 // testz -> testc.
46184 InvCC = X86::COND_B;
46185 break;
46186 case X86::COND_NE:
46187 // !testz -> !testc.
46188 InvCC = X86::COND_AE;
46189 break;
46190 case X86::COND_A:
46191 case X86::COND_BE:
46192 // testnzc -> testnzc (no change).
46193 InvCC = CC;
46194 break;
46195 default:
46196 InvCC = X86::COND_INVALID;
46197 break;
46198 }
46199
46200 if (InvCC != X86::COND_INVALID) {
46201 CC = InvCC;
46202 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46203 DAG.getBitcast(OpVT, NotOp0), Op1);
46204 }
46205 }
46206
46207 if (CC == X86::COND_B || CC == X86::COND_AE) {
46208 // TESTC(X,~X) == TESTC(X,-1)
46209 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
46210 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
46211 SDLoc DL(EFLAGS);
46212 return DAG.getNode(
46213 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
46214 DAG.getBitcast(OpVT,
46215 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
46216 }
46217 }
46218 }
46219
46220 if (CC == X86::COND_E || CC == X86::COND_NE) {
46221 // TESTZ(X,~Y) == TESTC(Y,X)
46222 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
46223 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46224 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46225 DAG.getBitcast(OpVT, NotOp1), Op0);
46226 }
46227
46228 if (Op0 == Op1) {
46229 SDValue BC = peekThroughBitcasts(Op0);
46230 EVT BCVT = BC.getValueType();
46231
46232 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
46233 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
46234 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46235 DAG.getBitcast(OpVT, BC.getOperand(0)),
46236 DAG.getBitcast(OpVT, BC.getOperand(1)));
46237 }
46238
46239 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
46240 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
46241 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46242 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46243 DAG.getBitcast(OpVT, BC.getOperand(0)),
46244 DAG.getBitcast(OpVT, BC.getOperand(1)));
46245 }
46246
46247 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
46248 // to more efficiently extract the sign bits and compare that.
46249 // TODO: Handle TESTC with comparison inversion.
46250 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
46251 // TESTP/MOVMSK combines to make sure it's never worse than PTEST?
46252 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
46253 unsigned EltBits = BCVT.getScalarSizeInBits();
46254 if (DAG.ComputeNumSignBits(BC) == EltBits) {
46255 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
46256 APInt SignMask = APInt::getSignMask(EltBits);
46257 if (SDValue Res =
46258 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
46259 // For vXi16 cases we need to use pmovmskb and extract every other
46260 // sign bit.
46261 SDLoc DL(EFLAGS);
46262 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
46263 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
46264 MVT FloatVT =
46265 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
46266 Res = DAG.getBitcast(FloatVT, Res);
46267 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
46268 } else if (EltBits == 16) {
46269 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
46270 Res = DAG.getBitcast(MovmskVT, Res);
46271 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46272 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
46273 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46274 } else {
46275 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46276 }
46277 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
46278 DAG.getConstant(0, DL, MVT::i32));
46279 }
46280 }
46281 }
46282 }
46283
46284 // TESTZ(-1,X) == TESTZ(X,X)
46285 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
46286 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
46287
46288 // TESTZ(X,-1) == TESTZ(X,X)
46289 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
46290 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
46291
46292 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
46293 // TODO: Add COND_NE handling?
46294 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
46295 SDValue Src0 = peekThroughBitcasts(Op0);
46296 SDValue Src1 = peekThroughBitcasts(Op1);
46297 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
46298 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
46299 peekThroughBitcasts(Src0.getOperand(1)), true);
46300 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
46301 peekThroughBitcasts(Src1.getOperand(1)), true);
46302 if (Src0 && Src1) {
46303 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
46304 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46305 DAG.getBitcast(OpVT2, Src0),
46306 DAG.getBitcast(OpVT2, Src1));
46307 }
46308 }
46309 }
46310 }
46311
46312 return SDValue();
46313}
46314
46315// Attempt to simplify the MOVMSK input based on the comparison type.
46316static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
46317 SelectionDAG &DAG,
46318 const X86Subtarget &Subtarget) {
46319 // Handle eq/ne against zero (any_of).
46320 // Handle eq/ne against -1 (all_of).
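  // MOVMSK packs one sign bit per element, so comparing the result against 0
  // tests whether any sign bit is set, and comparing against the all-ones mask
  // (1 << NumElts) - 1 tests whether every sign bit is set.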
46321 if (!(CC == X86::COND_E || CC == X86::COND_NE))
46322 return SDValue();
46323 if (EFLAGS.getValueType() != MVT::i32)
46324 return SDValue();
46325 unsigned CmpOpcode = EFLAGS.getOpcode();
46326 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
46327 return SDValue();
46328 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
46329 if (!CmpConstant)
46330 return SDValue();
46331 const APInt &CmpVal = CmpConstant->getAPIntValue();
46332
46333 SDValue CmpOp = EFLAGS.getOperand(0);
46334 unsigned CmpBits = CmpOp.getValueSizeInBits();
46335 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
46336
46337 // Peek through any truncate.
46338 if (CmpOp.getOpcode() == ISD::TRUNCATE)
46339 CmpOp = CmpOp.getOperand(0);
46340
46341 // Bail if we don't find a MOVMSK.
46342 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
46343 return SDValue();
46344
46345 SDValue Vec = CmpOp.getOperand(0);
46346 MVT VecVT = Vec.getSimpleValueType();
46347 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
46348 "Unexpected MOVMSK operand");
46349 unsigned NumElts = VecVT.getVectorNumElements();
46350 unsigned NumEltBits = VecVT.getScalarSizeInBits();
46351
46352 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
46353 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
46354 NumElts <= CmpBits && CmpVal.isMask(NumElts);
46355 if (!IsAnyOf && !IsAllOf)
46356 return SDValue();
46357
46358 // TODO: Check more combining cases.
46359 // Here we check the number of uses of the cmp to decide whether to combine.
46360 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" combines
46361 // are restricted by this one-use constraint.
46362 bool IsOneUse = CmpOp.getNode()->hasOneUse();
46363
46364 // See if we can peek through to a vector with a wider element type, if the
46365 // signbits extend down to all the sub-elements as well.
46366 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
46367 // potential SimplifyDemandedBits/Elts cases.
46368 // If we looked through a truncate that discarded bits, we can't do this
46369 // transform.
46370 // FIXME: We could do this transform for truncates that discarded bits by
46371 // inserting an AND mask between the new MOVMSK and the CMP.
46372 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
46373 SDValue BC = peekThroughBitcasts(Vec);
46374 MVT BCVT = BC.getSimpleValueType();
46375 unsigned BCNumElts = BCVT.getVectorNumElements();
46376 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
46377 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
46378 BCNumEltBits > NumEltBits &&
46379 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
46380 SDLoc DL(EFLAGS);
46381 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
46382 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46383 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
46384 DAG.getConstant(CmpMask, DL, MVT::i32));
46385 }
46386 }
46387
46388 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
46389 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
46390 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
46391 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
46392 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
46393 SmallVector<SDValue> Ops;
46394 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
46395 Ops.size() == 2) {
46396 SDLoc DL(EFLAGS);
46397 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
46398 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
46399 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
46400 DAG.getBitcast(SubVT, Ops[0]),
46401 DAG.getBitcast(SubVT, Ops[1]));
46402 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
46403 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46404 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
46405 DAG.getConstant(CmpMask, DL, MVT::i32));
46406 }
46407 }
46408
46409 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
46410 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
46411 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
46412 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
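  // (All lanes of PCMPEQ(X,Y) are all-ones iff X == Y in every lane, which is
  // the same as XOR(X,Y) being all-zero, and PTESTZ(V,V) sets ZF iff V == 0.)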
46413 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
46414 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
46415 SDValue BC = peekThroughBitcasts(Vec);
46416 // Ensure MOVMSK was testing every signbit of BC.
46417 if (BC.getValueType().getVectorNumElements() <= NumElts) {
46418 if (BC.getOpcode() == X86ISD::PCMPEQ) {
46419 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
46420 BC.getOperand(0), BC.getOperand(1));
46421 V = DAG.getBitcast(TestVT, V);
46422 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46423 }
46424 // Check for 256-bit split vector cases.
46425 if (BC.getOpcode() == ISD::AND &&
46426 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
46427 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
46428 SDValue LHS = BC.getOperand(0);
46429 SDValue RHS = BC.getOperand(1);
46430 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
46431 LHS.getOperand(0), LHS.getOperand(1));
46432 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
46433 RHS.getOperand(0), RHS.getOperand(1));
46434 LHS = DAG.getBitcast(TestVT, LHS);
46435 RHS = DAG.getBitcast(TestVT, RHS);
46436 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
46437 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46438 }
46439 }
46440 }
46441
46442 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
46443 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
46444 // sign bits prior to the comparison with zero unless we know that
46445 // the vXi16 splats the sign bit down to the lower i8 half.
46446 // TODO: Handle all_of patterns.
46447 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
46448 SDValue VecOp0 = Vec.getOperand(0);
46449 SDValue VecOp1 = Vec.getOperand(1);
46450 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
46451 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
46452 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
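    // (On the v16i8 bitcast the i16 sign bits live in the odd, high bytes, so
    // the 0xAAAA mask keeps only those PMOVMSKB bits.)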
46453 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
46454 SDLoc DL(EFLAGS);
46455 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
46456 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46457 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
46458 if (!SignExt0) {
46459 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
46460 DAG.getConstant(0xAAAA, DL, MVT::i16));
46461 }
46462 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46463 DAG.getConstant(0, DL, MVT::i16));
46464 }
46465 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
46466 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
46467 if (CmpBits >= 16 && Subtarget.hasInt256() &&
46468 (IsAnyOf || (SignExt0 && SignExt1))) {
46469 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
46470 SDLoc DL(EFLAGS);
46471 SDValue Result = peekThroughBitcasts(Src);
46472 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
46473 Result.getValueType().getVectorNumElements() <= NumElts) {
46474 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
46475 Result.getOperand(0), Result.getOperand(1));
46476 V = DAG.getBitcast(MVT::v4i64, V);
46477 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46478 }
46479 Result = DAG.getBitcast(MVT::v32i8, Result);
46480 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46481 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
46482 if (!SignExt0 || !SignExt1) {
46483 assert(IsAnyOf &&
46484 "Only perform v16i16 signmasks for any_of patterns");
46485 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
46486 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46487 }
46488 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46489 DAG.getConstant(CmpMask, DL, MVT::i32));
46490 }
46491 }
46492 }
46493
46494 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
46495 // Since we peek through a bitcast, we need to be careful if the base vector
46496 // type has smaller elements than the MOVMSK type. In that case, even if
46497 // all the elements are demanded by the shuffle mask, only the "high"
46498 // elements which have highbits that align with highbits in the MOVMSK vec
46499 // elements are actually demanded. A simplification of spurious operations
46500 // on the "low" elements takes place during other simplifications.
46501 //
46502 // For example:
46503 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))) cannot be folded even though all
46504 // the elements are demanded, because swapping the elements can change the result.
46505 //
46506 // To address this, we check that we can scale the shuffle mask to MOVMSK
46507 // element width (this will ensure "high" elements match). It's slightly overly
46508 // conservative, but fine for an edge case fold.
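// (Illustrative: with X = <a,b,c,d> as v4i32, SHUF32 X,(1,0,3,2) gives <b,a,d,c>;
// MOVMSK64 of its bitcast reads the sign bits of a and c, whereas MOVMSK64 of
// the unshuffled bitcast reads the sign bits of b and d.)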
46509 SmallVector<int, 32> ShuffleMask, ScaledMaskUnused;
46510 SmallVector<SDValue, 2> ShuffleInputs;
46511 if (NumElts <= CmpBits &&
46512 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
46513 ShuffleMask, DAG) &&
46514 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
46515 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
46516 scaleShuffleElements(ShuffleMask, NumElts, ScaledMaskUnused)) {
46517 SDLoc DL(EFLAGS);
46518 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
46519 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46520 Result =
46521 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
46522 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
46523 }
46524
46525 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
46526 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
46527 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
46528 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
46529 // iff every element is referenced.
46530 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
46531 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
46532 (NumEltBits == 32 || NumEltBits == 64)) {
46533 SDLoc DL(EFLAGS);
46534 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
46535 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
46536 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
46537 SDValue LHS = Vec;
46538 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
46539 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46540 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
46541 DAG.getBitcast(FloatVT, LHS),
46542 DAG.getBitcast(FloatVT, RHS));
46543 }
46544
46545 return SDValue();
46546}
46547
46548/// Optimize an EFLAGS definition used according to the condition code \p CC
46549/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
46550/// uses of chain values.
46551 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
46552                                   SelectionDAG &DAG,
46553 const X86Subtarget &Subtarget) {
46554 if (CC == X86::COND_B)
46555 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
46556 return Flags;
46557
46558 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
46559 return R;
46560
46561 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
46562 return R;
46563
46564 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
46565 return R;
46566
46567 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
46568}
46569
46570/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
46571 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
46572                            TargetLowering::DAGCombinerInfo &DCI,
46573                            const X86Subtarget &Subtarget) {
46574 SDLoc DL(N);
46575
46576 SDValue FalseOp = N->getOperand(0);
46577 SDValue TrueOp = N->getOperand(1);
46578 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
46579 SDValue Cond = N->getOperand(3);
46580
46581 // cmov X, X, ?, ? --> X
46582 if (TrueOp == FalseOp)
46583 return TrueOp;
46584
46585 // Try to simplify the EFLAGS and condition code operands.
46586 // We can't always do this as FCMOV only supports a subset of X86 cond.
46587 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
46588 if (!(FalseOp.getValueType() == MVT::f80 ||
46589 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
46590 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
46591 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
46592 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
46593 Flags};
46594 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46595 }
46596 }
46597
46598 // If this is a select between two integer constants, try to do some
46599 // optimizations. Note that the operands are ordered the opposite of SELECT
46600 // operands.
46601 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
46602 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
46603 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
46604 // larger than FalseC (the false value).
46605 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
46606         CC = X86::GetOppositeBranchCondition(CC);
46607         std::swap(TrueC, FalseC);
46608 std::swap(TrueOp, FalseOp);
46609 }
46610
46611 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
46612 // This is efficient for any integer data type (including i8/i16) and
46613 // shift amount.
46614 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
46615 Cond = getSETCC(CC, Cond, DL, DAG);
46616
46617 // Zero extend the condition if needed.
46618 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
46619
46620 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
46621 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
46622 DAG.getConstant(ShAmt, DL, MVT::i8));
46623 return Cond;
46624 }
46625
46626       // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
46627 // for any integer data type, including i8/i16.
46628 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
46629 Cond = getSETCC(CC, Cond, DL, DAG);
46630
46631 // Zero extend the condition if needed.
46632         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
46633                            FalseC->getValueType(0), Cond);
46634 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
46635 SDValue(FalseC, 0));
46636 return Cond;
46637 }
46638
46639 // Optimize cases that will turn into an LEA instruction. This requires
46640 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
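      // (Illustrative: a cmov between the constants 41 and 50 has Diff == 9, a
      // fast multiplier, so it becomes zext(setcc) * 9 + 41, i.e. one setcc,
      // one LEA and one add instead of materializing both constants.)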
46641 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
46642 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
46643 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
46644 "Implicit constant truncation");
46645
46646 bool isFastMultiplier = false;
46647 if (Diff.ult(10)) {
46648 switch (Diff.getZExtValue()) {
46649 default: break;
46650 case 1: // result = add base, cond
46651 case 2: // result = lea base( , cond*2)
46652 case 3: // result = lea base(cond, cond*2)
46653 case 4: // result = lea base( , cond*4)
46654 case 5: // result = lea base(cond, cond*4)
46655 case 8: // result = lea base( , cond*8)
46656 case 9: // result = lea base(cond, cond*8)
46657 isFastMultiplier = true;
46658 break;
46659 }
46660 }
46661
46662 if (isFastMultiplier) {
46663 Cond = getSETCC(CC, Cond, DL ,DAG);
46664 // Zero extend the condition if needed.
46665 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
46666 Cond);
46667 // Scale the condition by the difference.
46668 if (Diff != 1)
46669 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
46670 DAG.getConstant(Diff, DL, Cond.getValueType()));
46671
46672 // Add the base if non-zero.
46673 if (FalseC->getAPIntValue() != 0)
46674 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
46675 SDValue(FalseC, 0));
46676 return Cond;
46677 }
46678 }
46679 }
46680 }
46681
46682 // Handle these cases:
46683 // (select (x != c), e, c) -> select (x != c), e, x),
46684 // (select (x == c), c, e) -> select (x == c), x, e)
46685 // where the c is an integer constant, and the "select" is the combination
46686 // of CMOV and CMP.
46687 //
46688 // The rationale for this change is that the conditional-move from a constant
46689   // needs two instructions, whereas a conditional-move from a register needs
46690 // only one instruction.
46691 //
46692 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
46693 // some instruction-combining opportunities. This opt needs to be
46694 // postponed as late as possible.
46695 //
46696 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
46697 // the DCI.xxxx conditions are provided to postpone the optimization as
46698 // late as possible.
46699
46700 ConstantSDNode *CmpAgainst = nullptr;
46701 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
46702 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
46703 !isa<ConstantSDNode>(Cond.getOperand(0))) {
46704
46705 if (CC == X86::COND_NE &&
46706 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
46707         CC = X86::COND_E;
46708         std::swap(TrueOp, FalseOp);
46709 }
46710
46711 if (CC == X86::COND_E &&
46712 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
46713 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
46714 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
46715 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46716 }
46717 }
46718 }
46719
46720 // Transform:
46721 //
46722 // (cmov 1 T (uge T 2))
46723 //
46724 // to:
46725 //
46726 // (adc T 0 (sub T 1))
46727 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
46728 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
46729 SDValue Cond0 = Cond.getOperand(0);
46730 if (Cond0.getOpcode() == ISD::TRUNCATE)
46731 Cond0 = Cond0.getOperand(0);
46732 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
46733 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
46734 EVT CondVT = Cond->getValueType(0);
46735 EVT OuterVT = N->getValueType(0);
46736 // Subtract 1 and generate a carry.
46737 SDValue NewSub =
46738 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
46739 DAG.getConstant(1, DL, CondVT));
46740 SDValue EFLAGS(NewSub.getNode(), 1);
46741 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
46742 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
46743 }
46744 }
46745
46746 // Fold and/or of setcc's to double CMOV:
46747 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
46748 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
46749 //
46750 // This combine lets us generate:
46751 // cmovcc1 (jcc1 if we don't have CMOV)
46752 // cmovcc2 (same)
46753 // instead of:
46754 // setcc1
46755 // setcc2
46756 // and/or
46757 // cmovne (jne if we don't have CMOV)
46758 // When we can't use the CMOV instruction, it might increase branch
46759 // mispredicts.
46760 // When we can use CMOV, or when there is no mispredict, this improves
46761 // throughput and reduces register pressure.
46762 //
46763 if (CC == X86::COND_NE) {
46764 SDValue Flags;
46765 X86::CondCode CC0, CC1;
46766 bool isAndSetCC;
46767 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
46768 if (isAndSetCC) {
46769 std::swap(FalseOp, TrueOp);
46770         CC0 = X86::GetOppositeBranchCondition(CC0);
46771         CC1 = X86::GetOppositeBranchCondition(CC1);
46772       }
46773
46774 SDValue LOps[] = {FalseOp, TrueOp,
46775 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
46776 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
46777 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
46778 Flags};
46779 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46780 return CMOV;
46781 }
46782 }
46783
46784 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
46785 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
46786 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
46787 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
46788 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
46789 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
46790 SDValue Add = TrueOp;
46791 SDValue Const = FalseOp;
46792 // Canonicalize the condition code for easier matching and output.
46793 if (CC == X86::COND_E)
46794 std::swap(Add, Const);
46795
46796 // We might have replaced the constant in the cmov with the LHS of the
46797 // compare. If so change it to the RHS of the compare.
46798 if (Const == Cond.getOperand(0))
46799 Const = Cond.getOperand(1);
46800
46801 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
46802 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
46803 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
46804 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
46805 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
46806 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
46807 EVT VT = N->getValueType(0);
46808 // This should constant fold.
46809 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
46810 SDValue CMov =
46811 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
46812 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
46813 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
46814 }
46815 }
46816
46817 return SDValue();
46818}
46819
46820/// Different mul shrinking modes.
46821 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
46822 
46823 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
46824 EVT VT = N->getOperand(0).getValueType();
46825 if (VT.getScalarSizeInBits() != 32)
46826 return false;
46827
46828 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
46829 unsigned SignBits[2] = {1, 1};
46830 bool IsPositive[2] = {false, false};
46831 for (unsigned i = 0; i < 2; i++) {
46832 SDValue Opd = N->getOperand(i);
46833
46834 SignBits[i] = DAG.ComputeNumSignBits(Opd);
46835 IsPositive[i] = DAG.SignBitIsZero(Opd);
46836 }
46837
46838 bool AllPositive = IsPositive[0] && IsPositive[1];
46839 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
46840 // When ranges are from -128 ~ 127, use MULS8 mode.
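  // (Illustrative: an i32 value in [-128, 127] has bits 31..7 all equal to the
  // sign bit, i.e. at least 25 sign bits, which is what the thresholds below
  // test for each mode.)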
46841 if (MinSignBits >= 25)
46842 Mode = ShrinkMode::MULS8;
46843 // When ranges are from 0 ~ 255, use MULU8 mode.
46844 else if (AllPositive && MinSignBits >= 24)
46845 Mode = ShrinkMode::MULU8;
46846 // When ranges are from -32768 ~ 32767, use MULS16 mode.
46847 else if (MinSignBits >= 17)
46848 Mode = ShrinkMode::MULS16;
46849 // When ranges are from 0 ~ 65535, use MULU16 mode.
46850 else if (AllPositive && MinSignBits >= 16)
46851 Mode = ShrinkMode::MULU16;
46852 else
46853 return false;
46854 return true;
46855}
46856
46857/// When the operands of vector mul are extended from smaller size values,
46858 /// like i8 and i16, the type of mul may be shrunk to generate more
46859/// efficient code. Two typical patterns are handled:
46860/// Pattern1:
46861/// %2 = sext/zext <N x i8> %1 to <N x i32>
46862/// %4 = sext/zext <N x i8> %3 to <N x i32>
46863 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
46864/// %5 = mul <N x i32> %2, %4
46865///
46866/// Pattern2:
46867/// %2 = zext/sext <N x i16> %1 to <N x i32>
46868/// %4 = zext/sext <N x i16> %3 to <N x i32>
46869/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
46870/// %5 = mul <N x i32> %2, %4
46871///
46872/// There are four mul shrinking modes:
46873/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
46874 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
46875/// generate pmullw+sext32 for it (MULS8 mode).
46876/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
46877/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
46878/// generate pmullw+zext32 for it (MULU8 mode).
46879/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
46880/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
46881/// generate pmullw+pmulhw for it (MULS16 mode).
46882/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
46883/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
46884/// generate pmullw+pmulhuw for it (MULU16 mode).
46885 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
46886                                const X86Subtarget &Subtarget) {
46887 // Check for legality
46888 // pmullw/pmulhw are not supported by SSE.
46889 if (!Subtarget.hasSSE2())
46890 return SDValue();
46891
46892 // Check for profitability
46893 // pmulld is supported since SSE41. It is better to use pmulld
46894 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
46895 // the expansion.
46896 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
46897 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
46898 return SDValue();
46899
46900 ShrinkMode Mode;
46901 if (!canReduceVMulWidth(N, DAG, Mode))
46902 return SDValue();
46903
46904 SDLoc DL(N);
46905 SDValue N0 = N->getOperand(0);
46906 SDValue N1 = N->getOperand(1);
46907 EVT VT = N->getOperand(0).getValueType();
46908 unsigned NumElts = VT.getVectorNumElements();
46909 if ((NumElts % 2) != 0)
46910 return SDValue();
46911
46912 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
46913
46914 // Shrink the operands of mul.
46915 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
46916 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
46917
46918 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
46919 // lower part is needed.
46920 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
46921 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
46922 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
46923                                                    : ISD::SIGN_EXTEND,
46924                        DL, VT, MulLo);
46925
46926 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
46927 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
46928 // the higher part is also needed.
46929 SDValue MulHi =
46930 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
46931 ReducedVT, NewN0, NewN1);
46932
46933 // Repack the lower part and higher part result of mul into a wider
46934 // result.
46935 // Generate shuffle functioning as punpcklwd.
46936 SmallVector<int, 16> ShuffleMask(NumElts);
46937 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
46938 ShuffleMask[2 * i] = i;
46939 ShuffleMask[2 * i + 1] = i + NumElts;
46940 }
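  // (Illustrative: for NumElts == 8 the mask above is {0,8,1,9,2,10,3,11},
  // i.e. the punpcklwd interleave of MulLo and MulHi.)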
46941 SDValue ResLo =
46942 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
46943 ResLo = DAG.getBitcast(ResVT, ResLo);
46944 // Generate shuffle functioning as punpckhwd.
46945 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
46946 ShuffleMask[2 * i] = i + NumElts / 2;
46947 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
46948 }
46949 SDValue ResHi =
46950 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
46951 ResHi = DAG.getBitcast(ResVT, ResHi);
46952 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
46953}
46954
46955 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
46956                                  EVT VT, const SDLoc &DL) {
46957
46958 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
46959 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
46960 DAG.getConstant(Mult, DL, VT));
46961 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
46962 DAG.getConstant(Shift, DL, MVT::i8));
46963 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
46964 N->getOperand(0));
46965 return Result;
46966 };
46967
46968 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
46969 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
46970 DAG.getConstant(Mul1, DL, VT));
46971 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
46972 DAG.getConstant(Mul2, DL, VT));
46973 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
46974 N->getOperand(0));
46975 return Result;
46976 };
46977
46978 switch (MulAmt) {
46979 default:
46980 break;
46981 case 11:
46982 // mul x, 11 => add ((shl (mul x, 5), 1), x)
46983 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
46984 case 21:
46985 // mul x, 21 => add ((shl (mul x, 5), 2), x)
46986 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
46987 case 41:
46988 // mul x, 41 => add ((shl (mul x, 5), 3), x)
46989 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
46990 case 22:
46991 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
46992 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
46993 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
46994 case 19:
46995 // mul x, 19 => add ((shl (mul x, 9), 1), x)
46996 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
46997 case 37:
46998 // mul x, 37 => add ((shl (mul x, 9), 2), x)
46999 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
47000 case 73:
47001 // mul x, 73 => add ((shl (mul x, 9), 3), x)
47002 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
47003 case 13:
47004 // mul x, 13 => add ((shl (mul x, 3), 2), x)
47005 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
47006 case 23:
47007 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
47008 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
47009 case 26:
47010 // mul x, 26 => add ((mul (mul x, 5), 5), x)
47011 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
47012 case 28:
47013 // mul x, 28 => add ((mul (mul x, 9), 3), x)
47014 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
47015 case 29:
47016 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
47017 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47018 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
47019 }
47020
47021   // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
47022 // by a single LEA.
47023   // First check if this is a sum of two powers of 2 because that's easy. Then
47024   // count how many trailing zeros there are up to the first set bit.
47025 // TODO: We can do this even without LEA at a cost of two shifts and an add.
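  // (Illustrative: MulAmt == 20 == 16 + 4 gives ScaleShift == 2 and
  // ShiftAmt == 4 below, producing (x << 4) + (x << 2).)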
47026 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
47027 unsigned ScaleShift = llvm::countr_zero(MulAmt);
47028 if (ScaleShift >= 1 && ScaleShift < 4) {
47029 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
47030 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47031 DAG.getConstant(ShiftAmt, DL, MVT::i8));
47032 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47033 DAG.getConstant(ScaleShift, DL, MVT::i8));
47034 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
47035 }
47036 }
47037
47038 return SDValue();
47039}
47040
47041 // If the upper 17 bits of either operand are zero and the other operand's upper
47042 // bits are all zero/sign bits, then we can use PMADDWD, which is at least as quick as
47043// PMULLD, except on KNL.
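// (Illustrative: treating each i32 lane as two i16 halves, PMADDWD computes
// lo(a)*lo(b) + hi(a)*hi(b). If one operand has its upper 17 bits clear, its hi
// half is zero and its lo half is non-negative, so when the other operand is a
// sign-extended i16 the single product lo(a)*lo(b) already equals a*b.)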
47044 static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
47045                                    const X86Subtarget &Subtarget) {
47046 if (!Subtarget.hasSSE2())
47047 return SDValue();
47048
47049 if (Subtarget.isPMADDWDSlow())
47050 return SDValue();
47051
47052 EVT VT = N->getValueType(0);
47053
47054 // Only support vXi32 vectors.
47055 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
47056 return SDValue();
47057
47058 // Make sure the type is legal or can split/widen to a legal type.
47059 // With AVX512 but without BWI, we would need to split v32i16.
47060 unsigned NumElts = VT.getVectorNumElements();
47061 if (NumElts == 1 || !isPowerOf2_32(NumElts))
47062 return SDValue();
47063
47064 // With AVX512 but without BWI, we would need to split v32i16.
47065 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
47066 return SDValue();
47067
47068 SDValue N0 = N->getOperand(0);
47069 SDValue N1 = N->getOperand(1);
47070
47071   // If we are zero/sign extending two steps without SSE4.1, it's better to
47072 // reduce the vmul width instead.
47073 if (!Subtarget.hasSSE41() &&
47074 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
47075 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47076 (N1.getOpcode() == ISD::ZERO_EXTEND &&
47077 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
47078 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
47079 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47080 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47081 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
47082 return SDValue();
47083
47084   // If we are sign extending a wide vector without SSE4.1, it's better to reduce
47085 // the vmul width instead.
47086 if (!Subtarget.hasSSE41() &&
47087 (N0.getOpcode() == ISD::SIGN_EXTEND &&
47088 N0.getOperand(0).getValueSizeInBits() > 128) &&
47089 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47090 N1.getOperand(0).getValueSizeInBits() > 128))
47091 return SDValue();
47092
47093 // Sign bits must extend down to the lowest i16.
47094 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
47095 DAG.ComputeMaxSignificantBits(N0) > 16)
47096 return SDValue();
47097
47098 // At least one of the elements must be zero in the upper 17 bits, or can be
47099 // safely made zero without altering the final result.
47100 auto GetZeroableOp = [&](SDValue Op) {
47101 APInt Mask17 = APInt::getHighBitsSet(32, 17);
47102 if (DAG.MaskedValueIsZero(Op, Mask17))
47103 return Op;
47104 // Mask off upper 16-bits of sign-extended constants.
47105     if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
47106       return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
47107 DAG.getConstant(0xFFFF, SDLoc(N), VT));
47108 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
47109 SDValue Src = Op.getOperand(0);
47110 // Convert sext(vXi16) to zext(vXi16).
47111 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
47112 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
47113 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
47114 // which will expand the extension.
47115 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
47116 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
47117 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
47118 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
47119 }
47120 }
47121 // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG.
47122 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
47123 N->isOnlyUserOf(Op.getNode())) {
47124 SDValue Src = Op.getOperand(0);
47125 if (Src.getScalarValueSizeInBits() == 16)
47126 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
47127 }
47128 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
47129 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
47130 N->isOnlyUserOf(Op.getNode())) {
47131 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
47132 Op.getOperand(1));
47133 }
47134 return SDValue();
47135 };
47136 SDValue ZeroN0 = GetZeroableOp(N0);
47137 SDValue ZeroN1 = GetZeroableOp(N1);
47138 if (!ZeroN0 && !ZeroN1)
47139 return SDValue();
47140 N0 = ZeroN0 ? ZeroN0 : N0;
47141 N1 = ZeroN1 ? ZeroN1 : N1;
47142
47143 // Use SplitOpsAndApply to handle AVX splitting.
47144 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47145 ArrayRef<SDValue> Ops) {
47146 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
47147 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
47148 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
47149 DAG.getBitcast(OpVT, Ops[0]),
47150 DAG.getBitcast(OpVT, Ops[1]));
47151 };
47152 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
47153 PMADDWDBuilder);
47154}
47155
47156 static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
47157                                   const X86Subtarget &Subtarget) {
47158 if (!Subtarget.hasSSE2())
47159 return SDValue();
47160
47161 EVT VT = N->getValueType(0);
47162
47163 // Only support vXi64 vectors.
47164 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
47165 VT.getVectorNumElements() < 2 ||
47166       !isPowerOf2_32(VT.getVectorNumElements()))
47167     return SDValue();
47168
47169 SDValue N0 = N->getOperand(0);
47170 SDValue N1 = N->getOperand(1);
47171
47172 // MULDQ returns the 64-bit result of the signed multiplication of the lower
47173 // 32-bits. We can lower with this if the sign bits stretch that far.
47174 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
47175 DAG.ComputeNumSignBits(N1) > 32) {
47176 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47177 ArrayRef<SDValue> Ops) {
47178 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
47179 };
47180 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
47181 PMULDQBuilder, /*CheckBWI*/false);
47182 }
47183
47184 // If the upper bits are zero we can use a single pmuludq.
47185 APInt Mask = APInt::getHighBitsSet(64, 32);
47186 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
47187 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47188 ArrayRef<SDValue> Ops) {
47189 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
47190 };
47191 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
47192 PMULUDQBuilder, /*CheckBWI*/false);
47193 }
47194
47195 return SDValue();
47196}
47197
47198 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
47199                           TargetLowering::DAGCombinerInfo &DCI,
47200                           const X86Subtarget &Subtarget) {
47201 EVT VT = N->getValueType(0);
47202
47203 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
47204 return V;
47205
47206 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
47207 return V;
47208
47209 if (DCI.isBeforeLegalize() && VT.isVector())
47210 return reduceVMULWidth(N, DAG, Subtarget);
47211
47212 // Optimize a single multiply with constant into two operations in order to
47213 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
47214   if (!MulConstantOptimization)
47215     return SDValue();
47216
47217 // An imul is usually smaller than the alternative sequence.
47218   if (DAG.getMachineFunction().getFunction().hasMinSize())
47219     return SDValue();
47220
47221 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
47222 return SDValue();
47223
47224 if (VT != MVT::i64 && VT != MVT::i32 &&
47225 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
47226 return SDValue();
47227
47228   ConstantSDNode *CNode = isConstOrConstSplat(
47229       N->getOperand(1), /*AllowUndefs*/ true, /*AllowTrunc*/ false);
47230 const APInt *C = nullptr;
47231 if (!CNode) {
47232 if (VT.isVector())
47233 if (auto *RawC = getTargetConstantFromNode(N->getOperand(1)))
47234 if (auto *SplatC = RawC->getSplatValue())
47235 C = &(SplatC->getUniqueInteger());
47236
47237 if (!C || C->getBitWidth() != VT.getScalarSizeInBits())
47238 return SDValue();
47239 } else {
47240 C = &(CNode->getAPIntValue());
47241 }
47242
47243 if (isPowerOf2_64(C->getZExtValue()))
47244 return SDValue();
47245
47246 int64_t SignMulAmt = C->getSExtValue();
47247 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
47248 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
47249
47250 SDLoc DL(N);
47251 SDValue NewMul = SDValue();
47252 if (VT == MVT::i64 || VT == MVT::i32) {
47253 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
47254 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47255 DAG.getConstant(AbsMulAmt, DL, VT));
47256 if (SignMulAmt < 0)
47257 NewMul =
47258 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
47259
47260 return NewMul;
47261 }
47262
47263 uint64_t MulAmt1 = 0;
47264 uint64_t MulAmt2 = 0;
47265 if ((AbsMulAmt % 9) == 0) {
47266 MulAmt1 = 9;
47267 MulAmt2 = AbsMulAmt / 9;
47268 } else if ((AbsMulAmt % 5) == 0) {
47269 MulAmt1 = 5;
47270 MulAmt2 = AbsMulAmt / 5;
47271 } else if ((AbsMulAmt % 3) == 0) {
47272 MulAmt1 = 3;
47273 MulAmt2 = AbsMulAmt / 3;
47274 }
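    // (Illustrative: AbsMulAmt == 45 factors as 9 * 5, giving two LEAs, while
    // AbsMulAmt == 40 factors as 5 * 8, giving an LEA plus a shift.)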
47275
47276 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
47277 if (MulAmt2 &&
47278 (isPowerOf2_64(MulAmt2) ||
47279 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
47280
47281 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
47282 N->use_begin()->getOpcode() == ISD::ADD))
47283         // If the second multiplier is pow2, issue it first. We want the multiply
47284 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
47285 // use is an add. Only do this for positive multiply amounts since the
47286 // negate would prevent it from being used as an address mode anyway.
47287 std::swap(MulAmt1, MulAmt2);
47288
47289 if (isPowerOf2_64(MulAmt1))
47290 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47291 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
47292 else
47293 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47294 DAG.getConstant(MulAmt1, DL, VT));
47295
47296 if (isPowerOf2_64(MulAmt2))
47297 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
47298 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
47299 else
47300 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
47301 DAG.getConstant(MulAmt2, DL, VT));
47302
47303 // Negate the result.
47304 if (SignMulAmt < 0)
47305 NewMul =
47306 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
47307 } else if (!Subtarget.slowLEA())
47308 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
47309 }
47310 if (!NewMul) {
47311 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
47312 assert(C->getZExtValue() != 0 &&
47313 C->getZExtValue() != maxUIntN(VT.getScalarSizeInBits()) &&
47314 "Both cases that could cause potential overflows should have "
47315 "already been handled.");
47316 if (isPowerOf2_64(AbsMulAmt - 1)) {
47317 // (mul x, 2^N + 1) => (add (shl x, N), x)
47318 NewMul = DAG.getNode(
47319 ISD::ADD, DL, VT, N->getOperand(0),
47320 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47321 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
47322 // To negate, subtract the number from zero
47323 if (SignMulAmt < 0)
47324 NewMul =
47325 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
47326 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
47327 // (mul x, 2^N - 1) => (sub (shl x, N), x)
47328 NewMul =
47329 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47330 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
47331 // To negate, reverse the operands of the subtract.
47332 if (SignMulAmt < 0)
47333 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
47334 else
47335 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
47336 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
47337 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
47338 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
47339 NewMul =
47340 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47341 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
47342 NewMul = DAG.getNode(
47343 ISD::ADD, DL, VT, NewMul,
47344 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47345 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
47346 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
47347 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
47348 NewMul =
47349 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47350 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
47351 NewMul = DAG.getNode(
47352 ISD::SUB, DL, VT, NewMul,
47353 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47354 } else if (SignMulAmt >= 0 && VT.isVector() &&
47355 Subtarget.fastImmVectorShift()) {
47356 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
47357 uint64_t ShiftAmt1;
47358 std::optional<unsigned> Opc;
47359 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
47360 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
47361 Opc = ISD::ADD;
47362 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
47363 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
47364 Opc = ISD::SUB;
47365 }
47366
47367 if (Opc) {
47368 SDValue Shift1 =
47369 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47370 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
47371 SDValue Shift2 =
47372 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47373 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
47374 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
47375 }
47376 }
47377 }
47378
47379 return NewMul;
47380}
47381
47382// Try to form a MULHU or MULHS node by looking for
47383// (srl (mul ext, ext), 16)
47384// TODO: This is X86 specific because we want to be able to handle wide types
47385// before type legalization. But we can only do it if the vector will be
47386// legalized via widening/splitting. Type legalization can't handle promotion
47387// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
47388// combiner.
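// (Illustrative: (srl (mul (zext v8i16 a), (zext v8i16 b)), 16) becomes
// (zext (mulhu a, b)), which lowers to PMULHUW; the sign-extended/SRA form
// maps to MULHS/PMULHW in the same way.)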
47389 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
47390                                    const X86Subtarget &Subtarget) {
47391 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
47392 "SRL or SRA node is required here!");
47393 SDLoc DL(N);
47394
47395 if (!Subtarget.hasSSE2())
47396 return SDValue();
47397
47398 // The operation feeding into the shift must be a multiply.
47399 SDValue ShiftOperand = N->getOperand(0);
47400 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
47401 return SDValue();
47402
47403 // Input type should be at least vXi32.
47404 EVT VT = N->getValueType(0);
47405 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
47406 return SDValue();
47407
47408 // Need a shift by 16.
47409 APInt ShiftAmt;
47410 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
47411 ShiftAmt != 16)
47412 return SDValue();
47413
47414 SDValue LHS = ShiftOperand.getOperand(0);
47415 SDValue RHS = ShiftOperand.getOperand(1);
47416
47417 unsigned ExtOpc = LHS.getOpcode();
47418 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
47419 RHS.getOpcode() != ExtOpc)
47420 return SDValue();
47421
47422 // Peek through the extends.
47423 LHS = LHS.getOperand(0);
47424 RHS = RHS.getOperand(0);
47425
47426 // Ensure the input types match.
47427 EVT MulVT = LHS.getValueType();
47428 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
47429 return SDValue();
47430
47431 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
47432 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
47433
47434 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
47435 return DAG.getNode(ExtOpc, DL, VT, Mulh);
47436}
47437
47438 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
47439   SDValue N0 = N->getOperand(0);
47440 SDValue N1 = N->getOperand(1);
47441 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
47442 EVT VT = N0.getValueType();
47443
47444 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
47445 // since the result of setcc_c is all zero's or all ones.
47446 if (VT.isInteger() && !VT.isVector() &&
47447 N1C && N0.getOpcode() == ISD::AND &&
47448 N0.getOperand(1).getOpcode() == ISD::Constant) {
47449 SDValue N00 = N0.getOperand(0);
47450 APInt Mask = N0.getConstantOperandAPInt(1);
47451 Mask <<= N1C->getAPIntValue();
47452 bool MaskOK = false;
47453 // We can handle cases concerning bit-widening nodes containing setcc_c if
47454 // we carefully interrogate the mask to make sure we are semantics
47455 // preserving.
47456 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
47457 // of the underlying setcc_c operation if the setcc_c was zero extended.
47458 // Consider the following example:
47459 // zext(setcc_c) -> i32 0x0000FFFF
47460 // c1 -> i32 0x0000FFFF
47461 // c2 -> i32 0x00000001
47462 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
47463 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
47464 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
47465 MaskOK = true;
47466 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
47467                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47468       MaskOK = true;
47469 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
47470 N00.getOpcode() == ISD::ANY_EXTEND) &&
47471                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47472       MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
47473 }
47474 if (MaskOK && Mask != 0) {
47475 SDLoc DL(N);
47476 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
47477 }
47478 }
47479
47480 return SDValue();
47481}
47482
47483 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
47484                                            const X86Subtarget &Subtarget) {
47485 SDValue N0 = N->getOperand(0);
47486 SDValue N1 = N->getOperand(1);
47487 EVT VT = N0.getValueType();
47488 unsigned Size = VT.getSizeInBits();
47489
47490 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
47491 return V;
47492
47493 APInt ShiftAmt;
47494 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA) &&
47495 N1.getOpcode() == ISD::UMIN &&
47496 ISD::isConstantSplatVector(N1.getOperand(1).getNode(), ShiftAmt) &&
47497 ShiftAmt == VT.getScalarSizeInBits() - 1) {
47498 SDValue ShrAmtVal = N1.getOperand(0);
47499 SDLoc DL(N);
47500 return DAG.getNode(X86ISD::VSRAV, DL, N->getVTList(), N0, ShrAmtVal);
47501 }
47502
47503 // fold (SRA (SHL X, ShlConst), SraConst)
47504 // into (SHL (sext_in_reg X), ShlConst - SraConst)
47505 // or (sext_in_reg X)
47506 // or (SRA (sext_in_reg X), SraConst - ShlConst)
47507 // depending on relation between SraConst and ShlConst.
47508 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
47509   // us to do the sext_in_reg from the corresponding bit.
47510
47511 // sexts in X86 are MOVs. The MOVs have the same code size
47512   // as the above SHIFTs (only a shift by 1 has smaller code size).
47513 // However the MOVs have 2 advantages to a SHIFT:
47514 // 1. MOVs can write to a register that differs from source
47515 // 2. MOVs accept memory operands
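  // (Illustrative, for i32: (sra (shl X, 24), 27) becomes
  // (sra (sext_in_reg X, i8), 3), and (sra (shl X, 24), 24) becomes just
  // (sext_in_reg X, i8).)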
47516
47517 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
47518 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
47519       N0.getOperand(1).getOpcode() != ISD::Constant)
47520     return SDValue();
47521
47522 SDValue N00 = N0.getOperand(0);
47523 SDValue N01 = N0.getOperand(1);
47524 APInt ShlConst = N01->getAsAPIntVal();
47525 APInt SraConst = N1->getAsAPIntVal();
47526 EVT CVT = N1.getValueType();
47527
47528 if (CVT != N01.getValueType())
47529 return SDValue();
47530 if (SraConst.isNegative())
47531 return SDValue();
47532
47533 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
47534 unsigned ShiftSize = SVT.getSizeInBits();
47535 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
47536 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
47537 continue;
47538 SDLoc DL(N);
47539 SDValue NN =
47540 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
47541 if (SraConst.eq(ShlConst))
47542 return NN;
47543 if (SraConst.ult(ShlConst))
47544 return DAG.getNode(ISD::SHL, DL, VT, NN,
47545 DAG.getConstant(ShlConst - SraConst, DL, CVT));
47546 return DAG.getNode(ISD::SRA, DL, VT, NN,
47547 DAG.getConstant(SraConst - ShlConst, DL, CVT));
47548 }
47549 return SDValue();
47550}
47551
47552 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
47553                                         TargetLowering::DAGCombinerInfo &DCI,
47554                                         const X86Subtarget &Subtarget) {
47555 SDValue N0 = N->getOperand(0);
47556 SDValue N1 = N->getOperand(1);
47557 EVT VT = N0.getValueType();
47558
47559 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
47560 return V;
47561
47562 // Only do this on the last DAG combine as it can interfere with other
47563 // combines.
47564 if (!DCI.isAfterLegalizeDAG())
47565 return SDValue();
47566
47567 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
47568 // TODO: This is a generic DAG combine that became an x86-only combine to
47569 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
47570 // and-not ('andn').
47571 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
47572 return SDValue();
47573
47574 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
47575 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
47576 if (!ShiftC || !AndC)
47577 return SDValue();
47578
47579 // If we can shrink the constant mask below 8-bits or 32-bits, then this
47580 // transform should reduce code size. It may also enable secondary transforms
47581 // from improved known-bits analysis or instruction selection.
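  // (Illustrative: (srl (and X, 0xFF00), 8) becomes (and (srl X, 8), 0xFF),
  // shrinking a 16-bit mask to an 8-bit immediate.)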
47582 APInt MaskVal = AndC->getAPIntValue();
47583
47584 // If this can be matched by a zero extend, don't optimize.
47585 if (MaskVal.isMask()) {
47586 unsigned TO = MaskVal.countr_one();
47587 if (TO >= 8 && isPowerOf2_32(TO))
47588 return SDValue();
47589 }
47590
47591 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
47592 unsigned OldMaskSize = MaskVal.getSignificantBits();
47593 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
47594 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
47595 (OldMaskSize > 32 && NewMaskSize <= 32)) {
47596 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
47597 SDLoc DL(N);
47598 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
47599 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
47600 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
47601 }
47602 return SDValue();
47603}
47604
47605 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
47606                                          const X86Subtarget &Subtarget) {
47607 unsigned Opcode = N->getOpcode();
47608 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
47609
47610 SDLoc DL(N);
47611 EVT VT = N->getValueType(0);
47612 SDValue N0 = N->getOperand(0);
47613 SDValue N1 = N->getOperand(1);
47614 EVT SrcVT = N0.getValueType();
47615
47616 SDValue BC0 =
47617 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
47618 SDValue BC1 =
47619 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
47620
47621 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
47622 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
47623 // truncation trees that help us avoid lane crossing shuffles.
47624 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
47625 // TODO: We don't handle vXf64 shuffles yet.
47626 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
47627 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
47628       SmallVector<SDValue> ShuffleOps;
47629       SmallVector<int> ShuffleMask, ScaledMask;
47630 SDValue Vec = peekThroughBitcasts(BCSrc);
47631 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
47632         resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
47633         // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
47634 // shuffle to a v4X64 width - we can probably relax this in the future.
47635 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
47636 ShuffleOps[0].getValueType().is256BitVector() &&
47637 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
47638 SDValue Lo, Hi;
47639 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
47640 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
47641 Lo = DAG.getBitcast(SrcVT, Lo);
47642 Hi = DAG.getBitcast(SrcVT, Hi);
47643 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
47644 Res = DAG.getBitcast(ShufVT, Res);
47645 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
47646 return DAG.getBitcast(VT, Res);
47647 }
47648 }
47649 }
47650 }
47651
47652 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
47653 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
47654 // If either/both ops are a shuffle that can scale to v2x64,
47655 // then see if we can perform this as a v4x32 post shuffle.
47656 SmallVector<SDValue> Ops0, Ops1;
47657 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
47658 bool IsShuf0 =
47659 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
47660 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
47661 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
47662 bool IsShuf1 =
47663 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
47664 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
47665 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
47666 if (IsShuf0 || IsShuf1) {
47667 if (!IsShuf0) {
47668 Ops0.assign({BC0});
47669 ScaledMask0.assign({0, 1});
47670 }
47671 if (!IsShuf1) {
47672 Ops1.assign({BC1});
47673 ScaledMask1.assign({0, 1});
47674 }
47675
47676 SDValue LHS, RHS;
47677 int PostShuffle[4] = {-1, -1, -1, -1};
47678 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
47679 if (M < 0)
47680 return true;
47681 Idx = M % 2;
47682 SDValue Src = Ops[M / 2];
47683 if (!LHS || LHS == Src) {
47684 LHS = Src;
47685 return true;
47686 }
47687 if (!RHS || RHS == Src) {
47688 Idx += 2;
47689 RHS = Src;
47690 return true;
47691 }
47692 return false;
47693 };
47694 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
47695 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
47696 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
47697 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
47698 LHS = DAG.getBitcast(SrcVT, LHS);
47699 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
47700 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
47701 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
47702 Res = DAG.getBitcast(ShufVT, Res);
47703 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
47704 return DAG.getBitcast(VT, Res);
47705 }
47706 }
47707 }
47708
47709 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
47710 if (VT.is256BitVector() && Subtarget.hasInt256()) {
47711 SmallVector<int> Mask0, Mask1;
47712 SmallVector<SDValue> Ops0, Ops1;
47713 SmallVector<int, 2> ScaledMask0, ScaledMask1;
47714 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
47715 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
47716 !Ops0.empty() && !Ops1.empty() &&
47717 all_of(Ops0,
47718 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47719 all_of(Ops1,
47720 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47721 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
47722 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
47723 SDValue Op00 = peekThroughBitcasts(Ops0.front());
47724 SDValue Op10 = peekThroughBitcasts(Ops1.front());
47725 SDValue Op01 = peekThroughBitcasts(Ops0.back());
47726 SDValue Op11 = peekThroughBitcasts(Ops1.back());
47727 if ((Op00 == Op11) && (Op01 == Op10)) {
47728 std::swap(Op10, Op11);
47729         ShuffleVectorSDNode::commuteMask(ScaledMask1);
47730       }
47731 if ((Op00 == Op10) && (Op01 == Op11)) {
47732 const int Map[4] = {0, 2, 1, 3};
47733 SmallVector<int, 4> ShuffleMask(
47734 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
47735 Map[ScaledMask1[1]]});
47736 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
47737 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
47738 DAG.getBitcast(SrcVT, Op01));
47739 Res = DAG.getBitcast(ShufVT, Res);
47740 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
47741 return DAG.getBitcast(VT, Res);
47742 }
47743 }
47744 }
47745
47746 return SDValue();
47747}
47748
47749 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
47750                                  TargetLowering::DAGCombinerInfo &DCI,
47751                                  const X86Subtarget &Subtarget) {
47752 unsigned Opcode = N->getOpcode();
47753 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
47754 "Unexpected pack opcode");
47755
47756 EVT VT = N->getValueType(0);
47757 SDValue N0 = N->getOperand(0);
47758 SDValue N1 = N->getOperand(1);
47759 unsigned NumDstElts = VT.getVectorNumElements();
47760 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
47761 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
47762 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
47763 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
47764 "Unexpected PACKSS/PACKUS input type");
47765
47766 bool IsSigned = (X86ISD::PACKSS == Opcode);
47767
47768 // Constant Folding.
47769 APInt UndefElts0, UndefElts1;
47770 SmallVector<APInt, 32> EltBits0, EltBits1;
47771 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
47772 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
47773 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
47774 /*AllowWholeUndefs*/ true,
47775 /*AllowPartialUndefs*/ true) &&
47776 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
47777 /*AllowWholeUndefs*/ true,
47778 /*AllowPartialUndefs*/ true)) {
47779 unsigned NumLanes = VT.getSizeInBits() / 128;
47780 unsigned NumSrcElts = NumDstElts / 2;
47781 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
47782 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
47783
47784 APInt Undefs(NumDstElts, 0);
47785 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
47786 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
47787 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
47788 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
47789 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
47790 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
47791
47792 if (UndefElts[SrcIdx]) {
47793 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
47794 continue;
47795 }
47796
47797 APInt &Val = EltBits[SrcIdx];
47798 if (IsSigned) {
47799 // PACKSS: Truncate signed value with signed saturation.
47800 // Source values less than dst minint are saturated to minint.
47801 // Source values greater than dst maxint are saturated to maxint.
47802 Val = Val.truncSSat(DstBitsPerElt);
47803 } else {
47804 // PACKUS: Truncate signed value with unsigned saturation.
47805 // Source values less than zero are saturated to zero.
47806 // Source values greater than dst maxuint are saturated to maxuint.
47807 // NOTE: This is different from APInt::truncUSat.
47808 if (Val.isIntN(DstBitsPerElt))
47809 Val = Val.trunc(DstBitsPerElt);
47810 else if (Val.isNegative())
47811 Val = APInt::getZero(DstBitsPerElt);
47812 else
47813 Val = APInt::getAllOnes(DstBitsPerElt);
47814 }
47815 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
47816 }
47817 }
47818
47819 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
47820 }
47821
47822 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
47823 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
47824 return V;
47825
47826 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
47827 // Currently limit this to allsignbits cases only.
47828 if (IsSigned &&
47829 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
47830 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
47831 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
47832 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
47833 if (Not0 && Not1) {
47834 SDLoc DL(N);
47835 MVT SrcVT = N0.getSimpleValueType();
47836 SDValue Pack =
47837 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
47838 DAG.getBitcast(SrcVT, Not1));
47839 return DAG.getNOT(DL, Pack, VT);
47840 }
47841 }
47842
47843 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
47844 // truncate to create a larger truncate.
47845 if (Subtarget.hasAVX512() &&
47846 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
47847 N0.getOperand(0).getValueType() == MVT::v8i32) {
47848 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
47849 (!IsSigned &&
47850 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
47851 if (Subtarget.hasVLX())
47852 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
47853
47854 // Widen input to v16i32 so we can truncate that.
47855 SDLoc dl(N);
47856 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
47857 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
47858 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
47859 }
47860 }
47861
47862 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
47863 if (VT.is128BitVector()) {
47864 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
47865 SDValue Src0, Src1;
47866 if (N0.getOpcode() == ExtOpc &&
47867         N0.getOperand(0).getValueType().is64BitVector() &&
47868         N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
47869 Src0 = N0.getOperand(0);
47870 }
47871 if (N1.getOpcode() == ExtOpc &&
47872         N1.getOperand(0).getValueType().is64BitVector() &&
47873         N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
47874 Src1 = N1.getOperand(0);
47875 }
47876 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
47877 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
47878 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
47879 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
47880 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
47881 }
47882
47883 // Try again with pack(*_extend_vector_inreg, undef).
47884 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
47885                                       : ISD::ZERO_EXTEND_VECTOR_INREG;
47886     if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
47887 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
47888 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
47889 DAG);
47890 }
47891
47892 // Attempt to combine as shuffle.
47893 SDValue Op(N, 0);
47894 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47895 return Res;
47896
47897 return SDValue();
47898}
47899
47900 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
47901                                     TargetLowering::DAGCombinerInfo &DCI,
47902                                     const X86Subtarget &Subtarget) {
47903 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
47904 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
47905 "Unexpected horizontal add/sub opcode");
47906
47907 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
47908 MVT VT = N->getSimpleValueType(0);
47909 SDValue LHS = N->getOperand(0);
47910 SDValue RHS = N->getOperand(1);
47911
47912 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
47913 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
47914 LHS.getOpcode() == RHS.getOpcode() &&
47915 LHS.getValueType() == RHS.getValueType() &&
47916 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
47917 SDValue LHS0 = LHS.getOperand(0);
47918 SDValue LHS1 = LHS.getOperand(1);
47919 SDValue RHS0 = RHS.getOperand(0);
47920 SDValue RHS1 = RHS.getOperand(1);
47921 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
47922 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
47923 SDLoc DL(N);
47924 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
47925 LHS0.isUndef() ? LHS1 : LHS0,
47926 RHS0.isUndef() ? RHS1 : RHS0);
47927 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
47928 Res = DAG.getBitcast(ShufVT, Res);
47929 SDValue NewLHS =
47930 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
47931 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
47932 SDValue NewRHS =
47933 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
47934 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
47935 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
47936 DAG.getBitcast(VT, NewRHS));
47937 }
47938 }
47939 }
47940
47941 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
47942 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
47943 return V;
47944
47945 return SDValue();
47946}
47947
47948 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
47949                                      TargetLowering::DAGCombinerInfo &DCI,
47950                                      const X86Subtarget &Subtarget) {
47951 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
47952 X86ISD::VSRL == N->getOpcode()) &&
47953 "Unexpected shift opcode");
47954 EVT VT = N->getValueType(0);
47955 SDValue N0 = N->getOperand(0);
47956 SDValue N1 = N->getOperand(1);
47957
47958 // Shift zero -> zero.
47959 if (ISD::isBuildVectorAllZeros(N0.getNode()))
47960 return DAG.getConstant(0, SDLoc(N), VT);
47961
47962 // Detect constant shift amounts.
47963 APInt UndefElts;
47964 SmallVector<APInt, 32> EltBits;
47965 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
47966 /*AllowWholeUndefs*/ true,
47967 /*AllowPartialUndefs*/ false)) {
47968 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
47969 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
47970 EltBits[0].getZExtValue(), DAG);
47971 }
47972
47973 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47974 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
47975 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
47976 return SDValue(N, 0);
47977
47978 return SDValue();
47979}
47980
47981 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
47982 TargetLowering::DAGCombinerInfo &DCI,
47983 const X86Subtarget &Subtarget) {
47984 unsigned Opcode = N->getOpcode();
47985 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
47986 X86ISD::VSRLI == Opcode) &&
47987 "Unexpected shift opcode");
47988 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
47989 EVT VT = N->getValueType(0);
47990 SDValue N0 = N->getOperand(0);
47991 SDValue N1 = N->getOperand(1);
47992 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
47993 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
47994 "Unexpected value type");
47995 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
47996
47997 // (shift undef, X) -> 0
47998 if (N0.isUndef())
47999 return DAG.getConstant(0, SDLoc(N), VT);
48000
48001 // Out of range logical bit shifts are guaranteed to be zero.
48002 // Out of range arithmetic bit shifts splat the sign bit.
48003 unsigned ShiftVal = N->getConstantOperandVal(1);
48004 if (ShiftVal >= NumBitsPerElt) {
48005 if (LogicalShift)
48006 return DAG.getConstant(0, SDLoc(N), VT);
48007 ShiftVal = NumBitsPerElt - 1;
48008 }
48009
48010 // (shift X, 0) -> X
48011 if (!ShiftVal)
48012 return N0;
48013
48014 // (shift 0, C) -> 0
48015 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48016 // N0 is all zeros or undef. We guarantee that the bits shifted into the
48017 // result are all zeros, not undef.
48018 return DAG.getConstant(0, SDLoc(N), VT);
48019
48020 // (VSRAI -1, C) -> -1
48021 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
48022 // N0 is all ones or undef. We guarantee that the bits shifted into the
48023 // result are all ones, not undef.
48024 return DAG.getConstant(-1, SDLoc(N), VT);
48025
48026 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
48027 unsigned NewShiftVal = Amt0 + Amt1;
48028 if (NewShiftVal >= NumBitsPerElt) {
48029 // Out of range logical bit shifts are guaranteed to be zero.
48030 // Out of range arithmetic bit shifts splat the sign bit.
48031 if (LogicalShift)
48032 return DAG.getConstant(0, SDLoc(N), VT);
48033 NewShiftVal = NumBitsPerElt - 1;
48034 }
48035 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
48036 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
48037 };
48038
48039 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
48040 if (Opcode == N0.getOpcode())
48041 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
48042
48043 // (shl (add X, X), C) -> (shl X, (C + 1))
48044 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
48045 N0.getOperand(0) == N0.getOperand(1))
48046 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
48047
48048 // We can decode 'whole byte' logical bit shifts as shuffles.
48049 if (LogicalShift && (ShiftVal % 8) == 0) {
48050 SDValue Op(N, 0);
48051 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48052 return Res;
48053 }
48054
48055 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
48056 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
48057 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
48058 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
48059 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
48060 N0.getOpcode() == X86ISD::PSHUFD &&
48061 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
48062 N0->hasOneUse()) {
48063 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
48064 if (BC.getOpcode() == X86ISD::VSHLI &&
48065 BC.getScalarValueSizeInBits() == 64 &&
48066 BC.getConstantOperandVal(1) == 63) {
48067 SDLoc DL(N);
48068 SDValue Src = BC.getOperand(0);
48069 Src = DAG.getBitcast(VT, Src);
48070 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
48071 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
48072 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
48073 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
48074 return Src;
48075 }
48076 }
48077
48078 auto TryConstantFold = [&](SDValue V) {
48079 APInt UndefElts;
48080 SmallVector<APInt, 32> EltBits;
48081 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
48082 /*AllowWholeUndefs*/ true,
48083 /*AllowPartialUndefs*/ true))
48084 return SDValue();
48085 assert(EltBits.size() == VT.getVectorNumElements() &&
48086 "Unexpected shift value type");
48087 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
48088 // created an undef input due to no input bits being demanded, but user
48089 // still expects 0 in other bits.
48090 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
48091 APInt &Elt = EltBits[i];
48092 if (UndefElts[i])
48093 Elt = 0;
48094 else if (X86ISD::VSHLI == Opcode)
48095 Elt <<= ShiftVal;
48096 else if (X86ISD::VSRAI == Opcode)
48097 Elt.ashrInPlace(ShiftVal);
48098 else
48099 Elt.lshrInPlace(ShiftVal);
48100 }
48101 // Reset undef elements since they were zeroed above.
48102 UndefElts = 0;
48103 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
48104 };
48105
48106 // Constant Folding.
48107 if (N->isOnlyUserOf(N0.getNode())) {
48108 if (SDValue C = TryConstantFold(N0))
48109 return C;
48110
48111 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
48112 // Don't break NOT patterns.
48113 SDValue BC = peekThroughOneUseBitcasts(N0);
48114 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
48115 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
48116 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
48117 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
48118 SDLoc DL(N);
48119 SDValue LHS = DAG.getNode(Opcode, DL, VT,
48120 DAG.getBitcast(VT, BC.getOperand(0)), N1);
48121 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
48122 }
48123 }
48124 }
48125
48126 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48127 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
48128 DCI))
48129 return SDValue(N, 0);
48130
48131 return SDValue();
48132}
48133
48134 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
48135 TargetLowering::DAGCombinerInfo &DCI,
48136 const X86Subtarget &Subtarget) {
48137 EVT VT = N->getValueType(0);
48138 unsigned Opcode = N->getOpcode();
48139 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
48140 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
48141 Opcode == ISD::INSERT_VECTOR_ELT) &&
48142 "Unexpected vector insertion");
48143
48144 SDValue Vec = N->getOperand(0);
48145 SDValue Scl = N->getOperand(1);
48146 SDValue Idx = N->getOperand(2);
48147
48148 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
48149 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
48150 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
48151
48152 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
48153 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48154 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48155 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
48156 APInt::getAllOnes(NumBitsPerElt), DCI))
48157 return SDValue(N, 0);
48158 }
48159
48160 // Attempt to combine insertion patterns to a shuffle.
48161 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
48162 SDValue Op(N, 0);
48163 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48164 return Res;
48165 }
48166
48167 return SDValue();
48168}
48169
48170/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
48171/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
48172/// OR -> CMPNEQSS.
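// In other words, and(setcc(COND_E, FCMP), setcc(COND_NP, FCMP)) becomes a single
// CMPEQSS-style FSETCC, and or(setcc(COND_NE, FCMP), setcc(COND_P, FCMP)) becomes
// CMPNEQSS; the all-ones/all-zeros result is then masked down to a single bit.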
48173 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
48174 TargetLowering::DAGCombinerInfo &DCI,
48175 const X86Subtarget &Subtarget) {
48176 unsigned opcode;
48177
48178 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
48179 // we're requiring SSE2 for both.
48180 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
48181 SDValue N0 = N->getOperand(0);
48182 SDValue N1 = N->getOperand(1);
48183 SDValue CMP0 = N0.getOperand(1);
48184 SDValue CMP1 = N1.getOperand(1);
48185 SDLoc DL(N);
48186
48187 // The SETCCs should both refer to the same CMP.
48188 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
48189 return SDValue();
48190
48191 SDValue CMP00 = CMP0->getOperand(0);
48192 SDValue CMP01 = CMP0->getOperand(1);
48193 EVT VT = CMP00.getValueType();
48194
48195 if (VT == MVT::f32 || VT == MVT::f64 ||
48196 (VT == MVT::f16 && Subtarget.hasFP16())) {
48197 bool ExpectingFlags = false;
48198 // Check for any users that want flags:
48199 for (const SDNode *U : N->uses()) {
48200 if (ExpectingFlags)
48201 break;
48202
48203 switch (U->getOpcode()) {
48204 default:
48205 case ISD::BR_CC:
48206 case ISD::BRCOND:
48207 case ISD::SELECT:
48208 ExpectingFlags = true;
48209 break;
48210 case ISD::CopyToReg:
48211 case ISD::SIGN_EXTEND:
48212 case ISD::ZERO_EXTEND:
48213 case ISD::ANY_EXTEND:
48214 break;
48215 }
48216 }
48217
48218 if (!ExpectingFlags) {
48219 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
48220 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
48221
48222 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
48223 X86::CondCode tmp = cc0;
48224 cc0 = cc1;
48225 cc1 = tmp;
48226 }
48227
48228 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
48229 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
48230 // FIXME: need symbolic constants for these magic numbers.
48231 // See X86ATTInstPrinter.cpp:printSSECC().
48232 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
48233 if (Subtarget.hasAVX512()) {
48234 SDValue FSetCC =
48235 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
48236 DAG.getTargetConstant(x86cc, DL, MVT::i8));
48237 // Need to fill with zeros to ensure the bitcast will produce zeroes
48238 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
48239 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
48240 DAG.getConstant(0, DL, MVT::v16i1),
48241 FSetCC, DAG.getIntPtrConstant(0, DL));
48242 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
48243 N->getSimpleValueType(0));
48244 }
48245 SDValue OnesOrZeroesF =
48246 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
48247 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
48248
48249 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
48250 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
48251
48252 if (is64BitFP && !Subtarget.is64Bit()) {
48253 // On a 32-bit target, we cannot bitcast the 64-bit float to a
48254 // 64-bit integer, since that's not a legal type. Since
48255 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
48256 // bits, but can do this little dance to extract the lowest 32 bits
48257 // and work with those going forward.
48258 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
48259 OnesOrZeroesF);
48260 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
48261 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
48262 Vector32, DAG.getIntPtrConstant(0, DL));
48263 IntVT = MVT::i32;
48264 }
48265
48266 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
48267 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
48268 DAG.getConstant(1, DL, IntVT));
48269 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
48270 ANDed);
48271 return OneBitOfTruth;
48272 }
48273 }
48274 }
48275 }
48276 return SDValue();
48277}
48278
48279/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
48280 static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
48281 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48282
48283 MVT VT = N->getSimpleValueType(0);
48284 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
48285 return SDValue();
48286
48287 SDValue X, Y;
48288 SDValue N0 = N->getOperand(0);
48289 SDValue N1 = N->getOperand(1);
48290
48291 if (SDValue Not = IsNOT(N0, DAG)) {
48292 X = Not;
48293 Y = N1;
48294 } else if (SDValue Not = IsNOT(N1, DAG)) {
48295 X = Not;
48296 Y = N0;
48297 } else
48298 return SDValue();
48299
48300 X = DAG.getBitcast(VT, X);
48301 Y = DAG.getBitcast(VT, Y);
48302 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
48303}
48304
48305/// Try to fold:
48306/// and (vector_shuffle<Z,...,Z>
48307/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
48308/// ->
48309/// andnp (vector_shuffle<Z,...,Z>
48310/// (insert_vector_elt undef, X, Z), undef), Y
48311 static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
48312 const X86Subtarget &Subtarget) {
48313 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48314
48315 EVT VT = N->getValueType(0);
48316 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
48317 // value and require extra moves.
48318 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
48319 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
48320 return SDValue();
48321
48322 auto GetNot = [&DAG](SDValue V) {
48323 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
48324 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
48325 // end-users are ISD::AND including cases
48326 // (and(extract_vector_element(SVN), Y)).
48327 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
48328 !SVN->getOperand(1).isUndef()) {
48329 return SDValue();
48330 }
48331 SDValue IVEN = SVN->getOperand(0);
48332 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
48333 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
48334 return SDValue();
48335 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
48336 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
48337 return SDValue();
48338 SDValue Src = IVEN.getOperand(1);
48339 if (SDValue Not = IsNOT(Src, DAG)) {
48340 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
48341 SDValue NotIVEN =
48342 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(SVN), IVEN.getValueType(),
48343 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
48344 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
48345 SVN->getOperand(1), SVN->getMask());
48346 }
48347 return SDValue();
48348 };
48349
48350 SDValue X, Y;
48351 SDValue N0 = N->getOperand(0);
48352 SDValue N1 = N->getOperand(1);
48353 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48354
48355 if (SDValue Not = GetNot(N0)) {
48356 X = Not;
48357 Y = N1;
48358 } else if (SDValue Not = GetNot(N1)) {
48359 X = Not;
48360 Y = N0;
48361 } else
48362 return SDValue();
48363
48364 X = DAG.getBitcast(VT, X);
48365 Y = DAG.getBitcast(VT, Y);
48366 SDLoc DL(N);
48367
48368 // We do not split for SSE at all, but we need to split vectors for AVX1 and
48369 // AVX2.
48370 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
48371 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
48372 SDValue LoX, HiX;
48373 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
48374 SDValue LoY, HiY;
48375 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
48376 EVT SplitVT = LoX.getValueType();
48377 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
48378 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
48379 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
48380 }
48381
48382 if (TLI.isTypeLegal(VT))
48383 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
48384
48385 return SDValue();
48386}
48387
48388// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
48389// logical operations, like in the example below.
48390 // or (and (truncate x), (truncate y)),
48391 // (xor (truncate z), build_vector (constants))
48392 // Given a target type \p VT, we generate
48393 // or (and x, y), (xor z, zext(build_vector (constants)))
48394 // given that x, y and z are of type \p VT. We can do so if each operand is
48395 // either a truncate from VT, a (foldable) build_vector of constants, or can
48396 // be recursively promoted.
48397 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
48398 SelectionDAG &DAG, unsigned Depth) {
48399 // Limit recursion to avoid excessive compile times.
48400 if (Depth >= SelectionDAG::MaxRecursionDepth)
48401 return SDValue();
48402
48403 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
48404 return SDValue();
48405
48406 SDValue N0 = N.getOperand(0);
48407 SDValue N1 = N.getOperand(1);
48408
48409 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48410 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
48411 return SDValue();
48412
48413 if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
48414 N0 = NN0;
48415 else {
48416 // The left side has to be a trunc.
48417 if (N0.getOpcode() != ISD::TRUNCATE)
48418 return SDValue();
48419
48420 // The type of the truncated inputs.
48421 if (N0.getOperand(0).getValueType() != VT)
48422 return SDValue();
48423
48424 N0 = N0.getOperand(0);
48425 }
48426
48427 if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
48428 N1 = NN1;
48429 else {
48430 // The right side has to be a 'trunc' or a (foldable) constant.
48431 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
48432 N1.getOperand(0).getValueType() == VT;
48433 if (RHSTrunc)
48434 N1 = N1.getOperand(0);
48435 else if (SDValue Cst =
48436 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
48437 N1 = Cst;
48438 else
48439 return SDValue();
48440 }
48441
48442 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
48443}
48444
48445// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
48446// register. In most cases we actually compare or select YMM-sized registers
48447// and mixing the two types creates horrible code. This method optimizes
48448// some of the transition sequences.
48449// Even with AVX-512 this is still useful for removing casts around logical
48450// operations on vXi1 mask types.
48451 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
48452 SelectionDAG &DAG,
48453 const X86Subtarget &Subtarget) {
48454 EVT VT = N.getValueType();
48455 assert(VT.isVector() && "Expected vector type");
48456 assert((N.getOpcode() == ISD::ANY_EXTEND ||
48457 N.getOpcode() == ISD::ZERO_EXTEND ||
48458 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
48459
48460 SDValue Narrow = N.getOperand(0);
48461 EVT NarrowVT = Narrow.getValueType();
48462
48463 // Generate the wide operation.
48464 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0);
48465 if (!Op)
48466 return SDValue();
48467 switch (N.getOpcode()) {
48468 default: llvm_unreachable("Unexpected opcode");
48469 case ISD::ANY_EXTEND:
48470 return Op;
48471 case ISD::ZERO_EXTEND:
48472 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
48473 case ISD::SIGN_EXTEND:
48474 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
48475 Op, DAG.getValueType(NarrowVT));
48476 }
48477}
48478
48479static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
48480 unsigned FPOpcode;
48481 switch (Opcode) {
48482 // clang-format off
48483 default: llvm_unreachable("Unexpected input node for FP logic conversion");
48484 case ISD::AND: FPOpcode = X86ISD::FAND; break;
48485 case ISD::OR: FPOpcode = X86ISD::FOR; break;
48486 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
48487 // clang-format on
48488 }
48489 return FPOpcode;
48490}
48491
48492/// If both input operands of a logic op are being cast from floating-point
48493/// types or FP compares, try to convert this into a floating-point logic node
48494/// to avoid unnecessary moves from SSE to integer registers.
48495 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
48496 TargetLowering::DAGCombinerInfo &DCI,
48497 const X86Subtarget &Subtarget) {
48498 EVT VT = N->getValueType(0);
48499 SDValue N0 = N->getOperand(0);
48500 SDValue N1 = N->getOperand(1);
48501 SDLoc DL(N);
48502
48503 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
48504 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
48505 return SDValue();
48506
48507 SDValue N00 = N0.getOperand(0);
48508 SDValue N10 = N1.getOperand(0);
48509 EVT N00Type = N00.getValueType();
48510 EVT N10Type = N10.getValueType();
48511
48512 // Ensure that both types are the same and are legal scalar fp types.
48513 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
48514 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
48515 (Subtarget.hasFP16() && N00Type == MVT::f16)))
48516 return SDValue();
48517
48518 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
48519 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
48520 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
48521 return DAG.getBitcast(VT, FPLogic);
48522 }
48523
48524 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
48525 !N1.hasOneUse())
48526 return SDValue();
48527
48528 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
48529 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
48530
48531 // The vector ISA for FP predicates is incomplete before AVX, so converting
48532 // COMIS* to CMPS* may not be a win before AVX.
48533 if (!Subtarget.hasAVX() &&
48534 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
48535 return SDValue();
48536
48537 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
48538 // and vector logic:
48539 // logic (setcc N00, N01), (setcc N10, N11) -->
48540 // extelt (logic (setcc (s2v N00), (s2v N01)), (setcc (s2v N10), (s2v N11))), 0
48541 unsigned NumElts = 128 / N00Type.getSizeInBits();
48542 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
48543 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
48544 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
48545 SDValue N01 = N0.getOperand(1);
48546 SDValue N11 = N1.getOperand(1);
48547 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
48548 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
48549 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
48550 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
48551 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
48552 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
48553 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
48554 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
48555}
48556
48557// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
48558// to reduce XMM->GPR traffic.
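// e.g. (or (movmsk X), (movmsk Y)) -> (movmsk (or X, Y)): the bitwise op stays
// in the vector domain and only a single MOVMSK transfer to a GPR remains.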
48559 static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
48560 unsigned Opc = N->getOpcode();
48561 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48562 "Unexpected bit opcode");
48563
48564 SDValue N0 = N->getOperand(0);
48565 SDValue N1 = N->getOperand(1);
48566
48567 // Both operands must be single use MOVMSK.
48568 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
48569 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
48570 return SDValue();
48571
48572 SDValue Vec0 = N0.getOperand(0);
48573 SDValue Vec1 = N1.getOperand(0);
48574 EVT VecVT0 = Vec0.getValueType();
48575 EVT VecVT1 = Vec1.getValueType();
48576
48577 // Both MOVMSK operands must be from vectors of the same size and same element
48578 // size, but it's OK for an fp/int difference.
48579 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
48580 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
48581 return SDValue();
48582
48583 SDLoc DL(N);
48584 unsigned VecOpc =
48585 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
48586 SDValue Result =
48587 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
48588 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48589}
48590
48591// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
48592// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
48593// handles in InstCombine.
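// e.g. (xor (vsrli X, 4), (vsrli Y, 4)) -> (vsrli (xor X, Y), 4), trading two
// shifts plus a bitop for a single bitop plus a single shift.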
48594 static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
48595 unsigned Opc = N->getOpcode();
48596 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48597 "Unexpected bit opcode");
48598
48599 SDValue N0 = N->getOperand(0);
48600 SDValue N1 = N->getOperand(1);
48601 EVT VT = N->getValueType(0);
48602
48603 // Both operands must be single use.
48604 if (!N0.hasOneUse() || !N1.hasOneUse())
48605 return SDValue();
48606
48607 // Search for matching shifts.
48608 SDValue BC0 = peekThroughOneUseBitcasts(N0);
48609 SDValue BC1 = peekThroughOneUseBitcasts(N1);
48610
48611 unsigned BCOpc = BC0.getOpcode();
48612 EVT BCVT = BC0.getValueType();
48613 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
48614 return SDValue();
48615
48616 switch (BCOpc) {
48617 case X86ISD::VSHLI:
48618 case X86ISD::VSRLI:
48619 case X86ISD::VSRAI: {
48620 if (BC0.getOperand(1) != BC1.getOperand(1))
48621 return SDValue();
48622
48623 SDLoc DL(N);
48624 SDValue BitOp =
48625 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
48626 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
48627 return DAG.getBitcast(VT, Shift);
48628 }
48629 }
48630
48631 return SDValue();
48632}
48633
48634// Attempt to fold:
48635// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
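// This is only applied when every PACKSS input is known to be all-signbits, so
// packing after the bitwise op produces the same lanes as the bitwise op of the
// packed results.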
48636 // TODO: Add PACKUS handling.
48637 static SDValue combineBitOpWithPACK(SDNode *N, SelectionDAG &DAG) {
48638 unsigned Opc = N->getOpcode();
48639 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48640 "Unexpected bit opcode");
48641
48642 SDValue N0 = N->getOperand(0);
48643 SDValue N1 = N->getOperand(1);
48644 EVT VT = N->getValueType(0);
48645
48646 // Both operands must be single use.
48647 if (!N0.hasOneUse() || !N1.hasOneUse())
48648 return SDValue();
48649
48650 // Search for matching packs.
48651 N0 = peekThroughOneUseBitcasts(N0);
48652 N1 = peekThroughOneUseBitcasts(N1);
48653
48654 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
48655 return SDValue();
48656
48657 MVT DstVT = N0.getSimpleValueType();
48658 if (DstVT != N1.getSimpleValueType())
48659 return SDValue();
48660
48661 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
48662 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
48663
48664 // Limit to allsignbits packing.
48665 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
48666 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
48667 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
48668 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
48669 return SDValue();
48670
48671 SDLoc DL(N);
48672 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
48673 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
48674 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
48675}
48676
48677 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
48678 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
48679 /// with a shift-right to eliminate loading the vector constant mask value.
48680 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
48681 const X86Subtarget &Subtarget) {
48682 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
48683 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
48684 EVT VT = Op0.getValueType();
48685 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
48686 return SDValue();
48687
48688 // Try to convert an "is positive" signbit masking operation into arithmetic
48689 // shift and "andn". This saves a materialization of a -1 vector constant.
48690 // The "is negative" variant should be handled more generally because it only
48691 // requires "and" rather than "andn":
48692 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
48693 //
48694 // This is limited to the original type to avoid producing even more bitcasts.
48695 // If the bitcasts can't be eliminated, then it is unlikely that this fold
48696 // will be profitable.
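// Note that (pcmpgt X, -1) is all-ones exactly when X is non-negative, which is
// the complement of the sign-splat (vsrai X, BitWidth - 1), hence the ANDN form.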
48697 if (N->getValueType(0) == VT &&
48698 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
48699 SDValue X, Y;
48700 if (Op1.getOpcode() == X86ISD::PCMPGT &&
48701 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
48702 X = Op1.getOperand(0);
48703 Y = Op0;
48704 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
48705 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
48706 X = Op0.getOperand(0);
48707 Y = Op1;
48708 }
48709 if (X && Y) {
48710 SDLoc DL(N);
48711 SDValue Sra =
48712 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
48713 VT.getScalarSizeInBits() - 1, DAG);
48714 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
48715 }
48716 }
48717
48718 APInt SplatVal;
48719 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
48720 return SDValue();
48721
48722 // Don't prevent creation of ANDN.
48723 if (isBitwiseNot(Op0))
48724 return SDValue();
48725
48726 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
48727 return SDValue();
48728
48729 unsigned EltBitWidth = VT.getScalarSizeInBits();
48730 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
48731 return SDValue();
48732
48733 SDLoc DL(N);
48734 unsigned ShiftVal = SplatVal.countr_one();
48735 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
48736 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
48737 return DAG.getBitcast(N->getValueType(0), Shift);
48738}
48739
48740// Get the index node from the lowered DAG of a GEP IR instruction with one
48741// indexing dimension.
48742 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
48743 if (Ld->isIndexed())
48744 return SDValue();
48745
48746 SDValue Base = Ld->getBasePtr();
48747
48748 if (Base.getOpcode() != ISD::ADD)
48749 return SDValue();
48750
48751 SDValue ShiftedIndex = Base.getOperand(0);
48752
48753 if (ShiftedIndex.getOpcode() != ISD::SHL)
48754 return SDValue();
48755
48756 return ShiftedIndex.getOperand(0);
48757
48758}
48759
48760static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
48761 return Subtarget.hasBMI2() &&
48762 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
48763}
48764
48765 // This function recognizes cases where the X86 BZHI instruction can replace an
48766 // 'and-load' sequence.
48767 // In the case of loading an integer value from an array of constants defined
48768 // as follows:
48769 //
48770 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
48771 //
48772 // and then applying a bitwise AND of the result with another input, this is
48773 // equivalent to performing BZHI (zero high bits) on that input, using the
48774 // same index as the load.
48775 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
48776 const X86Subtarget &Subtarget) {
48777 MVT VT = Node->getSimpleValueType(0);
48778 SDLoc dl(Node);
48779
48780 // Check if subtarget has BZHI instruction for the node's type
48781 if (!hasBZHI(Subtarget, VT))
48782 return SDValue();
48783
48784 // Try matching the pattern for both operands.
48785 for (unsigned i = 0; i < 2; i++) {
48786 SDValue N = Node->getOperand(i);
48787 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
48788
48789 // continue if the operand is not a load instruction
48790 if (!Ld)
48791 return SDValue();
48792
48793 const Value *MemOp = Ld->getMemOperand()->getValue();
48794
48795 if (!MemOp)
48796 return SDValue();
48797
48798 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
48799 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
48800 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
48801
48802 Constant *Init = GV->getInitializer();
48803 Type *Ty = Init->getType();
48804 if (!isa<ConstantDataArray>(Init) ||
48805 !Ty->getArrayElementType()->isIntegerTy() ||
48806 Ty->getArrayElementType()->getScalarSizeInBits() !=
48807 VT.getSizeInBits() ||
48808 Ty->getArrayNumElements() >
48809 Ty->getArrayElementType()->getScalarSizeInBits())
48810 continue;
48811
48812 // Check if the array's constant elements are suitable to our case.
48813 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
48814 bool ConstantsMatch = true;
48815 for (uint64_t j = 0; j < ArrayElementCount; j++) {
48816 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
48817 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
48818 ConstantsMatch = false;
48819 break;
48820 }
48821 }
48822 if (!ConstantsMatch)
48823 continue;
48824
48825 // Do the transformation (For 32-bit type):
48826 // -> (and (load arr[idx]), inp)
48827 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
48828 // that will be replaced with one bzhi instruction.
48829 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
48830 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
48831
48832 // Get the Node which indexes into the array.
48833 SDValue Index = getIndexFromUnindexedLoad(Ld);
48834 if (!Index)
48835 return SDValue();
48836 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
48837
48838 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
48839 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
48840
48841 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
48842 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
48843
48844 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
48845 }
48846 }
48847 }
48848 }
48849 return SDValue();
48850}
48851
48852// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
48853// Where C is a mask containing the same number of bits as the setcc and
48854// where the setcc will freely 0 upper bits of k-register. We can replace the
48855// undef in the concat with 0s and remove the AND. This mainly helps with
48856 // v2i1/v4i1 setcc being cast to scalar.
48857 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
48858 const X86Subtarget &Subtarget) {
48859 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
48860
48861 EVT VT = N->getValueType(0);
48862
48863 // Make sure this is an AND with constant. We will check the value of the
48864 // constant later.
48865 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
48866 if (!C1)
48867 return SDValue();
48868
48869 // This is implied by the ConstantSDNode.
48870 assert(!VT.isVector() && "Expected scalar VT!");
48871
48872 SDValue Src = N->getOperand(0);
48873 if (!Src.hasOneUse())
48874 return SDValue();
48875
48876 // (Optionally) peek through any_extend().
48877 if (Src.getOpcode() == ISD::ANY_EXTEND) {
48878 if (!Src.getOperand(0).hasOneUse())
48879 return SDValue();
48880 Src = Src.getOperand(0);
48881 }
48882
48883 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
48884 return SDValue();
48885
48886 Src = Src.getOperand(0);
48887 EVT SrcVT = Src.getValueType();
48888
48889 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48890 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
48891 !TLI.isTypeLegal(SrcVT))
48892 return SDValue();
48893
48894 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
48895 return SDValue();
48896
48897 // We only care about the first subvector of the concat, we expect the
48898 // other subvectors to be ignored due to the AND if we make the change.
48899 SDValue SubVec = Src.getOperand(0);
48900 EVT SubVecVT = SubVec.getValueType();
48901
48902 // The RHS of the AND should be a mask with as many bits as SubVec.
48903 if (!TLI.isTypeLegal(SubVecVT) ||
48904 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
48905 return SDValue();
48906
48907 // First subvector should be a setcc with a legal result type or a
48908 // AND containing at least one setcc with a legal result type.
48909 auto IsLegalSetCC = [&](SDValue V) {
48910 if (V.getOpcode() != ISD::SETCC)
48911 return false;
48912 EVT SetccVT = V.getOperand(0).getValueType();
48913 if (!TLI.isTypeLegal(SetccVT) ||
48914 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
48915 return false;
48916 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
48917 return false;
48918 return true;
48919 };
48920 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
48921 (IsLegalSetCC(SubVec.getOperand(0)) ||
48922 IsLegalSetCC(SubVec.getOperand(1))))))
48923 return SDValue();
48924
48925 // We passed all the checks. Rebuild the concat_vectors with zeroes
48926 // and cast it back to VT.
48927 SDLoc dl(N);
48928 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
48929 DAG.getConstant(0, dl, SubVecVT));
48930 Ops[0] = SubVec;
48931 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
48932 Ops);
48933 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
48934 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
48935}
48936
48937static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
48938 SDValue OpMustEq, SDValue Op, unsigned Depth) {
48939 // We don't want to go crazy with the recursion here. This isn't a super
48940 // important optimization.
48941 static constexpr unsigned kMaxDepth = 2;
48942
48943 // Only do this re-ordering if op has one use.
48944 if (!Op.hasOneUse())
48945 return SDValue();
48946
48947 SDLoc DL(Op);
48948 // If we hit another associative op, recurse further.
48949 if (Op.getOpcode() == Opc) {
48950 // Done recursing.
48951 if (Depth++ >= kMaxDepth)
48952 return SDValue();
48953
48954 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
48955 if (SDValue R =
48956 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
48957 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
48958 Op.getOperand(1 - OpIdx));
48959
48960 } else if (Op.getOpcode() == ISD::SUB) {
48961 if (Opc == ISD::AND) {
48962 // BLSI: (and x, (sub 0, x))
48963 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
48964 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
48965 }
48966 // Opc must be ISD::AND or ISD::XOR
48967 // BLSR: (and x, (sub x, 1))
48968 // BLSMSK: (xor x, (sub x, 1))
48969 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
48970 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
48971
48972 } else if (Op.getOpcode() == ISD::ADD) {
48973 // Opc must be ISD::AND or ISD::XOR
48974 // BLSR: (and x, (add x, -1))
48975 // BLSMSK: (xor x, (add x, -1))
48976 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
48977 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
48978 }
48979 return SDValue();
48980}
48981
48982 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
48983 const X86Subtarget &Subtarget) {
48984 EVT VT = N->getValueType(0);
48985 // Make sure this node is a candidate for BMI instructions.
48986 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
48987 (VT != MVT::i32 && VT != MVT::i64))
48988 return SDValue();
48989
48990 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
48991
48992 // Try and match LHS and RHS.
48993 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
48994 if (SDValue OpMatch =
48995 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
48996 N->getOperand(1 - OpIdx), 0))
48997 return OpMatch;
48998 return SDValue();
48999}
49000
49001 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
49002 TargetLowering::DAGCombinerInfo &DCI,
49003 const X86Subtarget &Subtarget) {
49004 SDValue N0 = N->getOperand(0);
49005 SDValue N1 = N->getOperand(1);
49006 EVT VT = N->getValueType(0);
49007 SDLoc dl(N);
49008 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49009
49010 // If this is SSE1 only convert to FAND to avoid scalarization.
49011 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49012 return DAG.getBitcast(MVT::v4i32,
49013 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
49014 DAG.getBitcast(MVT::v4f32, N0),
49015 DAG.getBitcast(MVT::v4f32, N1)));
49016 }
49017
49018 // Use a 32-bit and+zext if upper bits known zero.
49019 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
49020 APInt HiMask = APInt::getHighBitsSet(64, 32);
49021 if (DAG.MaskedValueIsZero(N1, HiMask) ||
49022 DAG.MaskedValueIsZero(N0, HiMask)) {
49023 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
49024 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
49025 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
49026 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
49027 }
49028 }
49029
49030 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
49031 // TODO: Support multiple SrcOps.
49032 if (VT == MVT::i1) {
49033 SmallVector<SDValue, 2> SrcOps;
49034 SmallVector<APInt, 2> SrcPartials;
49035 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
49036 SrcOps.size() == 1) {
49037 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49038 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49039 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49040 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49041 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49042 if (Mask) {
49043 assert(SrcPartials[0].getBitWidth() == NumElts &&
49044 "Unexpected partial reduction mask");
49045 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49046 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49047 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
49048 }
49049 }
49050 }
49051
49052 // InstCombine converts:
49053 // `(-x << C0) & C1`
49054 // to
49055 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
49056 // This saves an IR instruction but on x86 the neg/shift version is preferable
49057 // so undo the transform.
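// e.g. with C0 == 1 and C1 == 0xF0, InstCombine emits (and (mul x, 0xFE), 0xF0);
// this is rebuilt here as (and (shl (sub 0, x), 1), 0xF0).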
49058
49059 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
49060 // TODO: We don't actually need a splat for this, we just need the checks to
49061 // hold for each element.
49062 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
49063 /*AllowTruncation*/ false);
49064 ConstantSDNode *N01C =
49065 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
49066 /*AllowTruncation*/ false);
49067 if (N1C && N01C) {
49068 const APInt &MulC = N01C->getAPIntValue();
49069 const APInt &AndC = N1C->getAPIntValue();
49070 APInt MulCLowBit = MulC & (-MulC);
49071 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
49072 (MulCLowBit + MulC).isPowerOf2()) {
49073 SDValue Neg = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT),
49074 N0.getOperand(0));
49075 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
49076 assert(MulCLowBitLog != -1 &&
49077 "Isolated lowbit is somehow not a power of 2!");
49078 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
49079 DAG.getConstant(MulCLowBitLog, dl, VT));
49080 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
49081 }
49082 }
49083 }
49084
49085 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
49086 return V;
49087
49088 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49089 return R;
49090
49091 if (SDValue R = combineBitOpWithShift(N, DAG))
49092 return R;
49093
49094 if (SDValue R = combineBitOpWithPACK(N, DAG))
49095 return R;
49096
49097 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49098 return FPLogic;
49099
49100 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
49101 return R;
49102
49103 if (DCI.isBeforeLegalizeOps())
49104 return SDValue();
49105
49106 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49107 return R;
49108
49109 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
49110 return R;
49111
49112 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
49113 return ShiftRight;
49114
49115 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
49116 return R;
49117
49118 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
49119 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
49120 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
49121 if (VT.isVector() && getTargetConstantFromNode(N1)) {
49122 unsigned Opc0 = N0.getOpcode();
49123 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
49124 getTargetConstantFromNode(N0.getOperand(1)) &&
49125 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
49126 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
49127 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
49128 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
49129 }
49130 }
49131
49132 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
49133 // avoids slow variable shift (moving shift amount to ECX etc.)
49134 if (isOneConstant(N1) && N0->hasOneUse()) {
49135 SDValue Src = N0;
49136 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
49137 Src.getOpcode() == ISD::TRUNCATE) &&
49138 Src.getOperand(0)->hasOneUse())
49139 Src = Src.getOperand(0);
49140 bool ContainsNOT = false;
49141 X86::CondCode X86CC = X86::COND_B;
49142 // Peek through AND(NOT(SRL(X,Y)),1).
49143 if (isBitwiseNot(Src)) {
49144 Src = Src.getOperand(0);
49145 X86CC = X86::COND_AE;
49146 ContainsNOT = true;
49147 }
49148 if (Src.getOpcode() == ISD::SRL &&
49149 !isa<ConstantSDNode>(Src.getOperand(1))) {
49150 SDValue BitNo = Src.getOperand(1);
49151 Src = Src.getOperand(0);
49152 // Peek through AND(SRL(NOT(X),Y),1).
49153 if (isBitwiseNot(Src)) {
49154 Src = Src.getOperand(0);
49155 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
49156 ContainsNOT = true;
49157 }
49158 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
49159 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
49160 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
49161 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
49162 }
49163 }
49164
49165 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
49166 // Attempt to recursively combine a bitmask AND with shuffles.
49167 SDValue Op(N, 0);
49168 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49169 return Res;
49170
49171 // If either operand is a constant mask, then only the elements that aren't
49172 // zero are actually demanded by the other operand.
49173 auto GetDemandedMasks = [&](SDValue Op) {
49174 APInt UndefElts;
49175 SmallVector<APInt> EltBits;
49176 int NumElts = VT.getVectorNumElements();
49177 int EltSizeInBits = VT.getScalarSizeInBits();
49178 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
49179 APInt DemandedElts = APInt::getAllOnes(NumElts);
49180 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
49181 EltBits)) {
49182 DemandedBits.clearAllBits();
49183 DemandedElts.clearAllBits();
49184 for (int I = 0; I != NumElts; ++I) {
49185 if (UndefElts[I]) {
49186 // We can't assume an undef src element gives an undef dst - the
49187 // other src might be zero.
49188 DemandedBits.setAllBits();
49189 DemandedElts.setBit(I);
49190 } else if (!EltBits[I].isZero()) {
49191 DemandedBits |= EltBits[I];
49192 DemandedElts.setBit(I);
49193 }
49194 }
49195 }
49196 return std::make_pair(DemandedBits, DemandedElts);
49197 };
49198 APInt Bits0, Elts0;
49199 APInt Bits1, Elts1;
49200 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
49201 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
49202
49203 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
49204 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
49205 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
49206 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
49207 if (N->getOpcode() != ISD::DELETED_NODE)
49208 DCI.AddToWorklist(N);
49209 return SDValue(N, 0);
49210 }
49211
49212 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
49213 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
49214 if (NewN0 || NewN1)
49215 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
49216 NewN1 ? NewN1 : N1);
49217 }
49218
49219 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
49220 if ((VT.getScalarSizeInBits() % 8) == 0 &&
49221 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
49222 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
49223 SDValue BitMask = N1;
49224 SDValue SrcVec = N0.getOperand(0);
49225 EVT SrcVecVT = SrcVec.getValueType();
49226
49227 // Check that the constant bitmask masks whole bytes.
49228 APInt UndefElts;
49229 SmallVector<APInt, 64> EltBits;
49230 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
49231 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
49232 llvm::all_of(EltBits, [](const APInt &M) {
49233 return M.isZero() || M.isAllOnes();
49234 })) {
49235 unsigned NumElts = SrcVecVT.getVectorNumElements();
49236 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
49237 unsigned Idx = N0.getConstantOperandVal(1);
49238
49239 // Create a root shuffle mask from the byte mask and the extracted index.
49240 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
49241 for (unsigned i = 0; i != Scale; ++i) {
49242 if (UndefElts[i])
49243 continue;
49244 int VecIdx = Scale * Idx + i;
49245 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
49246 }
49247
49248 if (SDValue Shuffle = combineX86ShufflesRecursively(
49249 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
49250 X86::MaxShuffleCombineDepth,
49251 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
49252 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
49253 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
49254 N0.getOperand(1));
49255 }
49256 }
49257
49258 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
49259 return R;
49260
49261 return SDValue();
49262}
49263
49264// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
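// With complementary constant masks this is a bit-select, which can then be
// matched to VPCMOV (XOP) or a single VPTERNLOG (AVX512) below.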
49265 static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
49266 const X86Subtarget &Subtarget) {
49267 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
49268
49269 MVT VT = N->getSimpleValueType(0);
49270 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49271 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
49272 return SDValue();
49273
49274 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
49275 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
49276 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
49277 return SDValue();
49278
49279 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
49280 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
49281 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
49282 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
49283 return SDValue();
49284
49285 // Attempt to extract constant byte masks.
49286 APInt UndefElts0, UndefElts1;
49287 SmallVector<APInt, 32> EltBits0, EltBits1;
49288 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
49289 /*AllowWholeUndefs*/ false,
49290 /*AllowPartialUndefs*/ false))
49291 return SDValue();
49292 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
49293 /*AllowWholeUndefs*/ false,
49294 /*AllowPartialUndefs*/ false))
49295 return SDValue();
49296
49297 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
49298 // TODO - add UNDEF elts support.
49299 if (UndefElts0[i] || UndefElts1[i])
49300 return SDValue();
49301 if (EltBits0[i] != ~EltBits1[i])
49302 return SDValue();
49303 }
49304
49305 SDLoc DL(N);
49306
49307 if (useVPTERNLOG(Subtarget, VT)) {
49308 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
49309 // VPTERNLOG is only available as vXi32/64-bit types.
49310 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
49311 MVT OpVT =
49312 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
49313 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
49314 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
49315 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
49316 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
49317 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
49318 DAG, Subtarget);
49319 return DAG.getBitcast(VT, Res);
49320 }
49321
49322 SDValue X = N->getOperand(0);
49323 SDValue Y =
49324 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
49325 DAG.getBitcast(VT, N1.getOperand(0)));
49326 return DAG.getNode(ISD::OR, DL, VT, X, Y);
49327}
49328
49329// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
49330static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
49331 if (N->getOpcode() != ISD::OR)
49332 return false;
49333
49334 SDValue N0 = N->getOperand(0);
49335 SDValue N1 = N->getOperand(1);
49336
49337 // Canonicalize AND to LHS.
49338 if (N1.getOpcode() == ISD::AND)
49339 std::swap(N0, N1);
49340
49341 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
49342 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
49343 return false;
49344
49345 Mask = N1.getOperand(0);
49346 X = N1.getOperand(1);
49347
49348 // Check to see if the mask appeared in both the AND and ANDNP.
49349 if (N0.getOperand(0) == Mask)
49350 Y = N0.getOperand(1);
49351 else if (N0.getOperand(1) == Mask)
49352 Y = N0.getOperand(0);
49353 else
49354 return false;
49355
49356 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
49357 // ANDNP combine allows other combines to happen that prevent matching.
49358 return true;
49359}
49360
49361// Try to fold:
49362// (or (and (m, y), (pandn m, x)))
49363// into:
49364// (vselect m, x, y)
49365// As a special case, try to fold:
49366// (or (and (m, (sub 0, x)), (pandn m, x)))
49367// into:
49368// (sub (xor X, M), M)
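// (a conditional negate: an all-ones M yields -X, a zero M leaves X unchanged)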
49369 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
49370 const X86Subtarget &Subtarget) {
49371 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
49372
49373 EVT VT = N->getValueType(0);
49374 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49375 (VT.is256BitVector() && Subtarget.hasInt256())))
49376 return SDValue();
49377
49378 SDValue X, Y, Mask;
49379 if (!matchLogicBlend(N, X, Y, Mask))
49380 return SDValue();
49381
49382 // Validate that X, Y, and Mask are bitcasts, and see through them.
49383 Mask = peekThroughBitcasts(Mask);
49384 X = peekThroughBitcasts(X);
49385 Y = peekThroughBitcasts(Y);
49386
49387 EVT MaskVT = Mask.getValueType();
49388 unsigned EltBits = MaskVT.getScalarSizeInBits();
49389
49390 // TODO: Attempt to handle floating point cases as well?
49391 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
49392 return SDValue();
49393
49394 SDLoc DL(N);
49395
49396 // Attempt to combine to conditional negate: (sub (xor X, M), M)
49397 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
49398 DAG, Subtarget))
49399 return Res;
49400
49401 // PBLENDVB is only available on SSE 4.1.
49402 if (!Subtarget.hasSSE41())
49403 return SDValue();
49404
49405 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
49406 if (Subtarget.hasVLX())
49407 return SDValue();
49408
49409 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
49410
49411 X = DAG.getBitcast(BlendVT, X);
49412 Y = DAG.getBitcast(BlendVT, Y);
49413 Mask = DAG.getBitcast(BlendVT, Mask);
49414 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
49415 return DAG.getBitcast(VT, Mask);
49416}
49417
49418// Helper function for combineOrCmpEqZeroToCtlzSrl
49419// Transforms:
49420// seteq(cmp x, 0)
49421// into:
49422// srl(ctlz x), log2(bitsize(x))
49423// Input pattern is checked by caller.
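// This works because ctlz(x) == bitsize(x) iff x == 0, so shifting the ctlz
// result right by log2(bitsize(x)) leaves exactly the (x == 0) bit.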
49424 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
49425 SDValue Cmp = Op.getOperand(1);
49426 EVT VT = Cmp.getOperand(0).getValueType();
49427 unsigned Log2b = Log2_32(VT.getSizeInBits());
49428 SDLoc dl(Op);
49429 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
49430 // The result of the shift is true or false, and on X86, the 32-bit
49431 // encoding of shr and lzcnt is more desirable.
49432 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
49433 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
49434 DAG.getConstant(Log2b, dl, MVT::i8));
49435 return Scc;
49436}
49437
49438// Try to transform:
49439// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
49440// into:
49442 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
49442// Will also attempt to match more generic cases, eg:
49443// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
49444// Only applies if the target supports the FastLZCNT feature.
49445 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
49446 TargetLowering::DAGCombinerInfo &DCI,
49447 const X86Subtarget &Subtarget) {
49448 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
49449 return SDValue();
49450
49451 auto isORCandidate = [](SDValue N) {
49452 return (N->getOpcode() == ISD::OR && N->hasOneUse());
49453 };
49454
49455 // Check that the zero extend is extending to 32 bits or more. The code generated by
49456 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
49457 // instructions to clear the upper bits.
49458 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
49459 !isORCandidate(N->getOperand(0)))
49460 return SDValue();
49461
49462 // Check the node matches: setcc(eq, cmp 0)
49463 auto isSetCCCandidate = [](SDValue N) {
49464 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
49465 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
49466 N->getOperand(1).getOpcode() == X86ISD::CMP &&
49467 isNullConstant(N->getOperand(1).getOperand(1)) &&
49468 N->getOperand(1).getValueType().bitsGE(MVT::i32);
49469 };
49470
49471 SDNode *OR = N->getOperand(0).getNode();
49472 SDValue LHS = OR->getOperand(0);
49473 SDValue RHS = OR->getOperand(1);
49474
49475 // Save nodes matching or(or, setcc(eq, cmp 0)).
49476 SmallVector<SDNode *, 2> ORNodes;
49477 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
49478 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
49479 ORNodes.push_back(OR);
49480 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
49481 LHS = OR->getOperand(0);
49482 RHS = OR->getOperand(1);
49483 }
49484
49485 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
49486 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
49487 !isORCandidate(SDValue(OR, 0)))
49488 return SDValue();
49489
49490 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
49491 // to
49492 // or(srl(ctlz),srl(ctlz)).
49493 // The dag combiner can then fold it into:
49494 // srl(or(ctlz, ctlz)).
49495 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
49496 SDValue Ret, NewRHS;
49497 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
49498 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
49499
49500 if (!Ret)
49501 return SDValue();
49502
49503 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
49504 while (!ORNodes.empty()) {
49505 OR = ORNodes.pop_back_val();
49506 LHS = OR->getOperand(0);
49507 RHS = OR->getOperand(1);
49508 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
49509 if (RHS->getOpcode() == ISD::OR)
49510 std::swap(LHS, RHS);
49511 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
49512 if (!NewRHS)
49513 return SDValue();
49514 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
49515 }
49516
49517 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
49518}
49519
49520 static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
49521 SDValue And1_L, SDValue And1_R,
49522 const SDLoc &DL, SelectionDAG &DAG) {
49523 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
49524 return SDValue();
49525 SDValue NotOp = And0_L->getOperand(0);
49526 if (NotOp == And1_R)
49527 std::swap(And1_R, And1_L);
49528 if (NotOp != And1_L)
49529 return SDValue();
49530
49531 // (~(NotOp) & And0_R) | (NotOp & And1_R)
49532 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
49533 EVT VT = And1_L->getValueType(0);
49534 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
49535 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
49536 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
49537 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
49538 return Xor1;
49539}
49540
49541/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
49542 /// equivalent `((x ^ y) & m) ^ y` pattern.
49543/// This is typically a better representation for targets without a fused
49544/// "and-not" operation. This function is intended to be called from a
49545/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
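/// For example (illustrative values), with m = 0xF0, x = 0xAB, y = 0xCD:
/// (m & x) | (~m & y) == 0xA0 | 0x0D == 0xAD, and
/// ((x ^ y) & m) ^ y == (0x66 & 0xF0) ^ 0xCD == 0x60 ^ 0xCD == 0xAD.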
49546 static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
49547 // Note that masked-merge variants using XOR or ADD expressions are
49548 // normalized to OR by InstCombine so we only check for OR.
49549 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
49550 SDValue N0 = Node->getOperand(0);
49551 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
49552 return SDValue();
49553 SDValue N1 = Node->getOperand(1);
49554 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
49555 return SDValue();
49556
49557 SDLoc DL(Node);
49558 SDValue N00 = N0->getOperand(0);
49559 SDValue N01 = N0->getOperand(1);
49560 SDValue N10 = N1->getOperand(0);
49561 SDValue N11 = N1->getOperand(1);
49562 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
49563 return Result;
49564 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
49565 return Result;
49566 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
49567 return Result;
49568 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
49569 return Result;
49570 return SDValue();
49571}
49572
49573/// If this is an add or subtract where one operand is produced by a cmp+setcc,
49574/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49575/// with CMP+{ADC, SBB}.
49576/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
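/// For example (a sketch of the common case): "c + (a < b)" with an unsigned
/// compare becomes SETB of CMP(a, b); since the condition is COND_B this
/// lowers to "cmp a, b; adc c, 0" instead of materializing the setcc result.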
49577static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
49578 SDValue X, SDValue Y,
49579 SelectionDAG &DAG,
49580 bool ZeroSecondOpOnly = false) {
49581 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
49582 return SDValue();
49583
49584 // Look through a one-use zext.
49585 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
49586 Y = Y.getOperand(0);
49587
49588 X86::CondCode CC;
49589 SDValue EFLAGS;
49590 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
49591 CC = (X86::CondCode)Y.getConstantOperandVal(0);
49592 EFLAGS = Y.getOperand(1);
49593 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
49594 Y.hasOneUse()) {
49595 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
49596 }
49597
49598 if (!EFLAGS)
49599 return SDValue();
49600
49601 // If X is -1 or 0, then we have an opportunity to avoid constants required in
49602 // the general case below.
49603 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
49604 if (ConstantX && !ZeroSecondOpOnly) {
49605 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
49606 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
49607 // This is a complicated way to get -1 or 0 from the carry flag:
49608 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49609 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49610 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49611 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49612 EFLAGS);
49613 }
49614
49615 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
49616 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
49617 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
49618 EFLAGS.getValueType().isInteger() &&
49619 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49620 // Swap the operands of a SUB, and we have the same pattern as above.
49621 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
49622 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
49623 SDValue NewSub = DAG.getNode(
49624 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49625 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49626 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
49627 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49628 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49629 NewEFLAGS);
49630 }
49631 }
49632 }
49633
49634 if (CC == X86::COND_B) {
49635 // X + SETB Z --> adc X, 0
49636 // X - SETB Z --> sbb X, 0
49637 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49638 DAG.getVTList(VT, MVT::i32), X,
49639 DAG.getConstant(0, DL, VT), EFLAGS);
49640 }
49641
49642 if (ZeroSecondOpOnly)
49643 return SDValue();
49644
49645 if (CC == X86::COND_A) {
49646 // Try to convert COND_A into COND_B in an attempt to facilitate
49647 // materializing "setb reg".
49648 //
49649 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
49650 // cannot take an immediate as its first operand.
49651 //
49652 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49653 EFLAGS.getValueType().isInteger() &&
49654 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49655 SDValue NewSub =
49656 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49657 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49658 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49659 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49660 DAG.getVTList(VT, MVT::i32), X,
49661 DAG.getConstant(0, DL, VT), NewEFLAGS);
49662 }
49663 }
49664
49665 if (CC == X86::COND_AE) {
49666 // X + SETAE --> sbb X, -1
49667 // X - SETAE --> adc X, -1
49668 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49669 DAG.getVTList(VT, MVT::i32), X,
49670 DAG.getConstant(-1, DL, VT), EFLAGS);
49671 }
49672
49673 if (CC == X86::COND_BE) {
49674 // X + SETBE --> sbb X, -1
49675 // X - SETBE --> adc X, -1
49676 // Try to convert COND_BE into COND_AE in an attempt to facilitate
49677 // materializing "setae reg".
49678 //
49679 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
49680 // cannot take an immediate as its first operand.
49681 //
49682 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49683 EFLAGS.getValueType().isInteger() &&
49684 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49685 SDValue NewSub =
49686 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49687 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49688 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49689 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49690 DAG.getVTList(VT, MVT::i32), X,
49691 DAG.getConstant(-1, DL, VT), NewEFLAGS);
49692 }
49693 }
49694
49695 if (CC != X86::COND_E && CC != X86::COND_NE)
49696 return SDValue();
49697
49698 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
49699 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
49700 !EFLAGS.getOperand(0).getValueType().isInteger())
49701 return SDValue();
49702
49703 SDValue Z = EFLAGS.getOperand(0);
49704 EVT ZVT = Z.getValueType();
49705
49706 // If X is -1 or 0, then we have an opportunity to avoid constants required in
49707 // the general case below.
49708 if (ConstantX) {
49709 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
49710 // fake operands:
49711 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
49712 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
49713 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
49714 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
49715 SDValue Zero = DAG.getConstant(0, DL, ZVT);
49716 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49717 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
49718 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49719 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49720 SDValue(Neg.getNode(), 1));
49721 }
49722
49723 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
49724 // with fake operands:
49725 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
49726 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
49727 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
49728 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
49729 SDValue One = DAG.getConstant(1, DL, ZVT);
49730 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49731 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49732 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49733 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49734 Cmp1.getValue(1));
49735 }
49736 }
49737
49738 // (cmp Z, 1) sets the carry flag if Z is 0.
49739 SDValue One = DAG.getConstant(1, DL, ZVT);
49740 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49741 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49742
49743 // Add the flags type for ADC/SBB nodes.
49744 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49745
49746 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
49747 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
49748 if (CC == X86::COND_NE)
49749 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
49750 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
49751
49752 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
49753 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
49754 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
49755 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
49756}
49757
49758/// If this is an add or subtract where one operand is produced by a cmp+setcc,
49759/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49760/// with CMP+{ADC, SBB}.
49761 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
49762 bool IsSub = N->getOpcode() == ISD::SUB;
49763 SDValue X = N->getOperand(0);
49764 SDValue Y = N->getOperand(1);
49765 EVT VT = N->getValueType(0);
49766 SDLoc DL(N);
49767
49768 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
49769 return ADCOrSBB;
49770
49771 // Commute and try again (negate the result for subtracts).
49772 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
49773 if (IsSub)
49774 ADCOrSBB =
49775 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
49776 return ADCOrSBB;
49777 }
49778
49779 return SDValue();
49780}
49781
49782 static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
49783 SelectionDAG &DAG) {
49784 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
49785 "Unexpected opcode");
49786
49787 // Delegate to combineAddOrSubToADCOrSBB if we have:
49788 //
49789 // (xor/or (zero_extend (setcc)) imm)
49790 //
49791 // where imm is odd if and only if we have xor, in which case the XOR/OR are
49792 // equivalent to a SUB/ADD, respectively.
49793 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
49794 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
49795 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
49796 bool IsSub = N->getOpcode() == ISD::XOR;
49797 bool N1COdd = N1C->getZExtValue() & 1;
49798 if (IsSub ? N1COdd : !N1COdd) {
49799 SDLoc DL(N);
49800 EVT VT = N->getValueType(0);
49801 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
49802 return R;
49803 }
49804 }
49805 }
49806
49807 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
49808 if (N->getOpcode() == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
49809 N0.getOperand(0).getOpcode() == ISD::AND &&
49810 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
49811 N0.hasOneUse() && N0.getOperand(0).hasOneUse()) {
49812 MVT VT = N->getSimpleValueType(0);
49813 APInt UndefElts;
49814 SmallVector<APInt> EltBits;
49815 if (getTargetConstantBitsFromNode(N0.getOperand(0).getOperand(1),
49816 VT.getScalarSizeInBits(), UndefElts,
49817 EltBits)) {
49818 bool IsPow2OrUndef = true;
49819 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
49820 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
49821
49822 if (IsPow2OrUndef)
49823 return DAG.getNode(X86ISD::PCMPEQ, SDLoc(N), VT, N0.getOperand(0),
49824 N0.getOperand(0).getOperand(1));
49825 }
49826 }
49827
49828 return SDValue();
49829}
49830
49831 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
49832 TargetLowering::DAGCombinerInfo &DCI,
49833 const X86Subtarget &Subtarget) {
49834 SDValue N0 = N->getOperand(0);
49835 SDValue N1 = N->getOperand(1);
49836 EVT VT = N->getValueType(0);
49837 SDLoc dl(N);
49838 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49839
49840 // If this is SSE1 only convert to FOR to avoid scalarization.
49841 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49842 return DAG.getBitcast(MVT::v4i32,
49843 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
49844 DAG.getBitcast(MVT::v4f32, N0),
49845 DAG.getBitcast(MVT::v4f32, N1)));
49846 }
49847
49848 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
49849 // TODO: Support multiple SrcOps.
49850 if (VT == MVT::i1) {
49851 SmallVector<SDValue, 2> SrcOps;
49852 SmallVector<APInt, 2> SrcPartials;
49853 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
49854 SrcOps.size() == 1) {
49855 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49856 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49857 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49858 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49859 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49860 if (Mask) {
49861 assert(SrcPartials[0].getBitWidth() == NumElts &&
49862 "Unexpected partial reduction mask");
49863 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
49864 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49865 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49866 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
49867 }
49868 }
49869 }
49870
49871 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49872 return R;
49873
49874 if (SDValue R = combineBitOpWithShift(N, DAG))
49875 return R;
49876
49877 if (SDValue R = combineBitOpWithPACK(N, DAG))
49878 return R;
49879
49880 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49881 return FPLogic;
49882
49883 if (DCI.isBeforeLegalizeOps())
49884 return SDValue();
49885
49886 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49887 return R;
49888
49889 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
49890 return R;
49891
49892 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
49893 return R;
49894
49895 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
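  // For example (illustrative values), with C == 3 and setcc value s:
  // (0 - s) | 3 is -1 when s == 1 and 3 when s == 0, which matches
  // zext(!s) * 4 - 1 (0*4-1 == -1, 1*4-1 == 3) and can then be formed with
  // an LEA plus the inverted setcc.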
49896 if ((VT == MVT::i32 || VT == MVT::i64) &&
49897 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
49898 isNullConstant(N0.getOperand(0))) {
49899 SDValue Cond = N0.getOperand(1);
49900 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
49901 Cond = Cond.getOperand(0);
49902
49903 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
49904 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
49905 uint64_t Val = CN->getZExtValue();
49906 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
49907 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
49908 CCode = X86::GetOppositeBranchCondition(CCode);
49909 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
49910
49911 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
49912 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
49913 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
49914 return R;
49915 }
49916 }
49917 }
49918 }
49919
49920 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
49921 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
49922 // iff the upper elements of the non-shifted arg are zero.
49923 // KUNPCK require 16+ bool vector elements.
49924 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
49925 unsigned NumElts = VT.getVectorNumElements();
49926 unsigned HalfElts = NumElts / 2;
49927 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
49928 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
49929 N1.getConstantOperandAPInt(1) == HalfElts &&
49930 DAG.MaskedVectorIsZero(N0, UpperElts)) {
49931 return DAG.getNode(
49932 ISD::CONCAT_VECTORS, dl, VT,
49933 extractSubVector(N0, 0, DAG, dl, HalfElts),
49934 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
49935 }
49936 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
49937 N0.getConstantOperandAPInt(1) == HalfElts &&
49938 DAG.MaskedVectorIsZero(N1, UpperElts)) {
49939 return DAG.getNode(
49940 ISD::CONCAT_VECTORS, dl, VT,
49941 extractSubVector(N1, 0, DAG, dl, HalfElts),
49942 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
49943 }
49944 }
49945
49946 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
49947 // Attempt to recursively combine an OR of shuffles.
49948 SDValue Op(N, 0);
49949 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49950 return Res;
49951
49952 // If either operand is a constant mask, then only the elements that aren't
49953 // allones are actually demanded by the other operand.
49954 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
49955 APInt UndefElts;
49956 SmallVector<APInt> EltBits;
49957 int NumElts = VT.getVectorNumElements();
49958 int EltSizeInBits = VT.getScalarSizeInBits();
49959 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
49960 return false;
49961
49962 APInt DemandedElts = APInt::getZero(NumElts);
49963 for (int I = 0; I != NumElts; ++I)
49964 if (!EltBits[I].isAllOnes())
49965 DemandedElts.setBit(I);
49966
49967 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
49968 };
49969 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
49970 if (N->getOpcode() != ISD::DELETED_NODE)
49971 DCI.AddToWorklist(N);
49972 return SDValue(N, 0);
49973 }
49974 }
49975
49976 // We should fold "masked merge" patterns when `andn` is not available.
49977 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
49978 if (SDValue R = foldMaskedMerge(N, DAG))
49979 return R;
49980
49981 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
49982 return R;
49983
49984 return SDValue();
49985}
49986
49987/// Try to turn tests against the signbit in the form of:
49988/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
49989/// into:
49990/// SETGT(X, -1)
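/// For example, for an i32 value x, "(trunc (x lshr 31)) xor 1" is 1 exactly
/// when the sign bit of x is clear, i.e. the same as "x > -1" (x == 5 gives
/// 0 ^ 1 == 1 and 5 > -1 holds; x == -5 gives 1 ^ 1 == 0 and -5 > -1 is false).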
49991 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
49992 // This is only worth doing if the output type is i8 or i1.
49993 EVT ResultType = N->getValueType(0);
49994 if (ResultType != MVT::i8 && ResultType != MVT::i1)
49995 return SDValue();
49996
49997 SDValue N0 = N->getOperand(0);
49998 SDValue N1 = N->getOperand(1);
49999
50000 // We should be performing an xor against a truncated shift.
50001 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
50002 return SDValue();
50003
50004 // Make sure we are performing an xor against one.
50005 if (!isOneConstant(N1))
50006 return SDValue();
50007
50008 // SetCC on x86 zero extends so only act on this if it's a logical shift.
50009 SDValue Shift = N0.getOperand(0);
50010 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
50011 return SDValue();
50012
50013 // Make sure we are truncating from one of i16, i32 or i64.
50014 EVT ShiftTy = Shift.getValueType();
50015 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
50016 return SDValue();
50017
50018 // Make sure the shift amount extracts the sign bit.
50019 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
50020 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
50021 return SDValue();
50022
50023 // Create a greater-than comparison against -1.
50024 // N.B. Using SETGE against 0 works, but we want a canonical-looking
50025 // comparison; using SETGT matches up with what TranslateX86CC expects.
50026 SDLoc DL(N);
50027 SDValue ShiftOp = Shift.getOperand(0);
50028 EVT ShiftOpTy = ShiftOp.getValueType();
50029 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50030 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
50031 *DAG.getContext(), ResultType);
50032 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
50033 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
50034 if (SetCCResultType != ResultType)
50035 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
50036 return Cond;
50037}
50038
50039/// Turn vector tests of the signbit in the form of:
50040/// xor (sra X, elt_size(X)-1), -1
50041/// into:
50042/// pcmpgt X, -1
50043///
50044/// This should be called before type legalization because the pattern may not
50045/// persist after that.
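/// For example, for a v4i32 element e: "xor (sra e, 31), -1" yields all-ones
/// exactly when e is non-negative, which is what "pcmpgt e, -1" computes
/// (e == 7: sra gives 0, xor -1 gives all-ones; e == -7: sra gives all-ones,
/// xor -1 gives 0).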
50046 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
50047 const X86Subtarget &Subtarget) {
50048 EVT VT = N->getValueType(0);
50049 if (!VT.isSimple())
50050 return SDValue();
50051
50052 switch (VT.getSimpleVT().SimpleTy) {
50053 // clang-format off
50054 default: return SDValue();
50055 case MVT::v16i8:
50056 case MVT::v8i16:
50057 case MVT::v4i32:
50058 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
50059 case MVT::v32i8:
50060 case MVT::v16i16:
50061 case MVT::v8i32:
50062 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
50063 // clang-format on
50064 }
50065
50066 // There must be a shift right algebraic before the xor, and the xor must be a
50067 // 'not' operation.
50068 SDValue Shift = N->getOperand(0);
50069 SDValue Ones = N->getOperand(1);
50070 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
50071 !ISD::isBuildVectorAllOnes(Ones.getNode()))
50072 return SDValue();
50073
50074 // The shift should be smearing the sign bit across each vector element.
50075 auto *ShiftAmt =
50076 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
50077 if (!ShiftAmt ||
50078 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
50079 return SDValue();
50080
50081 // Create a greater-than comparison against -1. We don't use the more obvious
50082 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
50083 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
50084}
50085
50086/// Detect patterns of truncation with unsigned saturation:
50087///
50088/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
50089/// Return the source value x to be truncated or SDValue() if the pattern was
50090/// not matched.
50091///
50092/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
50093/// where C1 >= 0 and C2 is unsigned max of destination type.
50094///
50095/// (truncate (smax (smin (x, C2), C1)) to dest_type)
50096/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
50097///
50098/// These two patterns are equivalent to:
50099/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
50100/// So return the smax(x, C1) value to be truncated or SDValue() if the
50101/// pattern was not matched.
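/// For example (illustrative values), when truncating i32 to i8,
/// "umin(x, 255)" saturates x == 300 to 255 and leaves x == 42 unchanged;
/// the smin/smax forms such as "smin(smax(x, 0), 255)" additionally clamp
/// negative inputs to 0 before the truncate.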
50102 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50103 const SDLoc &DL) {
50104 EVT InVT = In.getValueType();
50105
50106 // Saturation with truncation. We truncate from InVT to VT.
50108 "Unexpected types for truncate operation");
50109
50110 // Match min/max and return limit value as a parameter.
50111 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
50112 if (V.getOpcode() == Opcode &&
50113 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
50114 return V.getOperand(0);
50115 return SDValue();
50116 };
50117
50118 APInt C1, C2;
50119 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
50120 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
50121 // the element size of the destination type.
50122 if (C2.isMask(VT.getScalarSizeInBits()))
50123 return UMin;
50124
50125 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
50126 if (MatchMinMax(SMin, ISD::SMAX, C1))
50127 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
50128 return SMin;
50129
50130 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
50131 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
50132 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
50133 C2.uge(C1)) {
50134 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
50135 }
50136
50137 return SDValue();
50138}
50139
50140/// Detect patterns of truncation with signed saturation:
50141/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
50142/// signed_max_of_dest_type)) to dest_type)
50143/// or:
50144/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
50145/// signed_min_of_dest_type)) to dest_type).
50146/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
50147/// Return the source value to be truncated or SDValue() if the pattern was not
50148/// matched.
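/// For example (illustrative values), when truncating i16 to i8 the signed
/// form is "smin(smax(x, -128), 127)", which maps x == 200 to 127 and
/// x == -500 to -128, while MatchPackUS uses the range [0, 255] instead.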
50149static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
50150 unsigned NumDstBits = VT.getScalarSizeInBits();
50151 unsigned NumSrcBits = In.getScalarValueSizeInBits();
50152 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
50153
50154 auto MatchMinMax = [](SDValue V, unsigned Opcode,
50155 const APInt &Limit) -> SDValue {
50156 APInt C;
50157 if (V.getOpcode() == Opcode &&
50158 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
50159 return V.getOperand(0);
50160 return SDValue();
50161 };
50162
50163 APInt SignedMax, SignedMin;
50164 if (MatchPackUS) {
50165 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
50166 SignedMin = APInt(NumSrcBits, 0);
50167 } else {
50168 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
50169 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
50170 }
50171
50172 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
50173 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
50174 return SMax;
50175
50176 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
50177 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
50178 return SMin;
50179
50180 return SDValue();
50181}
50182
50183 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
50184 SelectionDAG &DAG,
50185 const X86Subtarget &Subtarget) {
50186 if (!Subtarget.hasSSE2() || !VT.isVector())
50187 return SDValue();
50188
50189 EVT SVT = VT.getVectorElementType();
50190 EVT InVT = In.getValueType();
50191 EVT InSVT = InVT.getVectorElementType();
50192
50193 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
50194 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
50195 // and concatenate at the same time. Then we can use a final vpmovuswb to
50196 // clip to 0-255.
50197 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
50198 InVT == MVT::v16i32 && VT == MVT::v16i8) {
50199 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
50200 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
50201 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
50202 DL, DAG, Subtarget);
50203 assert(Mid && "Failed to pack!");
50204 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
50205 }
50206 }
50207
50208 // vXi32 truncate instructions are available with AVX512F.
50209 // vXi16 truncate instructions are only available with AVX512BW.
50210 // For 256-bit or smaller vectors, we require VLX.
50211 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
50212 // If the result type is 256 bits or larger and we have disabled 512-bit
50213 // registers, we should go ahead and use the pack instructions if possible.
50214 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
50215 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
50216 (InVT.getSizeInBits() > 128) &&
50217 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
50218 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
50219
50220 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
50221 isPowerOf2_32(VT.getVectorNumElements()) &&
50222 (SVT == MVT::i8 || SVT == MVT::i16) &&
50223 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
50224 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
50225 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
50226 if (SVT == MVT::i8 && InSVT == MVT::i32) {
50227 EVT MidVT = VT.changeVectorElementType(MVT::i16);
50228 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
50229 DAG, Subtarget);
50230 assert(Mid && "Failed to pack!");
50231 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
50232 Subtarget);
50233 assert(V && "Failed to pack!");
50234 return V;
50235 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
50236 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
50237 Subtarget);
50238 }
50239 if (SDValue SSatVal = detectSSatPattern(In, VT))
50240 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
50241 Subtarget);
50242 }
50243
50244 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50245 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
50246 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
50247 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
50248 unsigned TruncOpc = 0;
50249 SDValue SatVal;
50250 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
50251 SatVal = SSatVal;
50252 TruncOpc = X86ISD::VTRUNCS;
50253 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
50254 SatVal = USatVal;
50255 TruncOpc = X86ISD::VTRUNCUS;
50256 }
50257 if (SatVal) {
50258 unsigned ResElts = VT.getVectorNumElements();
50259 // If the input type is less than 512 bits and we don't have VLX, we need
50260 // to widen to 512 bits.
50261 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
50262 unsigned NumConcats = 512 / InVT.getSizeInBits();
50263 ResElts *= NumConcats;
50264 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
50265 ConcatOps[0] = SatVal;
50266 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
50267 NumConcats * InVT.getVectorNumElements());
50268 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
50269 }
50270 // Widen the result if it's narrower than 128 bits.
50271 if (ResElts * SVT.getSizeInBits() < 128)
50272 ResElts = 128 / SVT.getSizeInBits();
50273 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
50274 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
50275 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
50276 DAG.getIntPtrConstant(0, DL));
50277 }
50278 }
50279
50280 return SDValue();
50281}
50282
50283/// This function detects the AVG pattern between vectors of unsigned i8/i16,
50284/// which is c = (a + b + 1) / 2, and replace this operation with the efficient
50285/// ISD::AVGCEILU (AVG) instruction.
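/// For example (illustrative values), for unsigned i8 elements a == 30 and
/// b == 41 the pattern computes (30 + 41 + 1) >> 1 == 36, i.e. the rounded-up
/// average that a single AVGCEILU (PAVGB) produces without widening.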
50286 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50287 const X86Subtarget &Subtarget,
50288 const SDLoc &DL) {
50289 if (!VT.isVector())
50290 return SDValue();
50291 EVT InVT = In.getValueType();
50292 unsigned NumElems = VT.getVectorNumElements();
50293
50294 EVT ScalarVT = VT.getVectorElementType();
50295 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
50296 return SDValue();
50297
50298 // InScalarVT is the intermediate type in the AVG pattern and it should be wider
50299 // than the original input type (i8/i16).
50300 EVT InScalarVT = InVT.getVectorElementType();
50301 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
50302 return SDValue();
50303
50304 if (!Subtarget.hasSSE2())
50305 return SDValue();
50306
50307 // Detect the following pattern:
50308 //
50309 // %1 = zext <N x i8> %a to <N x i32>
50310 // %2 = zext <N x i8> %b to <N x i32>
50311 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
50312 // %4 = add nuw nsw <N x i32> %3, %2
50313 // %5 = lshr <N x i32> %4, <i32 1 x N>
50314 // %6 = trunc <N x i32> %5 to <N x i8>
50315 //
50316 // In AVX512, the last instruction can also be a trunc store.
50317 if (In.getOpcode() != ISD::SRL)
50318 return SDValue();
50319
50320 // A lambda checking the given SDValue is a constant vector and each element
50321 // is in the range [Min, Max].
50322 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
50323 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
50324 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
50325 });
50326 };
50327
50328 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
50329 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
50330 return MaxActiveBits <= ScalarVT.getSizeInBits();
50331 };
50332
50333 // Check if each element of the vector is right-shifted by one.
50334 SDValue LHS = In.getOperand(0);
50335 SDValue RHS = In.getOperand(1);
50336 if (!IsConstVectorInRange(RHS, 1, 1))
50337 return SDValue();
50338 if (LHS.getOpcode() != ISD::ADD)
50339 return SDValue();
50340
50341 // Detect a pattern of a + b + 1 where the order doesn't matter.
50342 SDValue Operands[3];
50343 Operands[0] = LHS.getOperand(0);
50344 Operands[1] = LHS.getOperand(1);
50345
50346 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
50347 ArrayRef<SDValue> Ops) {
50348 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
50349 };
50350
50351 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
50352 for (SDValue &Op : Ops)
50353 if (Op.getValueType() != VT)
50354 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
50355 // Pad to a power-of-2 vector, split+apply and extract the original vector.
50356 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
50357 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
50358 if (NumElemsPow2 != NumElems) {
50359 for (SDValue &Op : Ops) {
50360 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
50361 for (unsigned i = 0; i != NumElems; ++i) {
50362 SDValue Idx = DAG.getIntPtrConstant(i, DL);
50363 EltsOfOp[i] =
50364 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
50365 }
50366 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
50367 }
50368 }
50369 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
50370 if (NumElemsPow2 == NumElems)
50371 return Res;
50372 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
50373 DAG.getIntPtrConstant(0, DL));
50374 };
50375
50376 // Take care of the case when one of the operands is a constant vector whose
50377 // element is in the range [1, 256].
50378 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
50379 IsZExtLike(Operands[0])) {
50380 // The pattern is detected. Subtract one from the constant vector, then
50381 // demote it and emit the ISD::AVGCEILU node.
50382 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
50383 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
50384 return AVGSplitter({Operands[0], Operands[1]});
50385 }
50386
50387 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
50388 // Match the or case only if its 'add-like' - can be replaced by an add.
50389 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
50390 if (ISD::ADD == V.getOpcode()) {
50391 Op0 = V.getOperand(0);
50392 Op1 = V.getOperand(1);
50393 return true;
50394 }
50395 if (ISD::ZERO_EXTEND != V.getOpcode())
50396 return false;
50397 V = V.getOperand(0);
50398 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
50399 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
50400 return false;
50401 Op0 = V.getOperand(0);
50402 Op1 = V.getOperand(1);
50403 return true;
50404 };
50405
50406 SDValue Op0, Op1;
50407 if (FindAddLike(Operands[0], Op0, Op1))
50408 std::swap(Operands[0], Operands[1]);
50409 else if (!FindAddLike(Operands[1], Op0, Op1))
50410 return SDValue();
50411 Operands[2] = Op0;
50412 Operands[1] = Op1;
50413
50414 // Now we have three operands of two additions. Check that one of them is a
50415 // constant vector with ones, and the other two can be promoted from i8/i16.
50416 for (SDValue &Op : Operands) {
50417 if (!IsConstVectorInRange(Op, 1, 1))
50418 continue;
50419 std::swap(Op, Operands[2]);
50420
50421 // Check if Operands[0] and Operands[1] are results of type promotion.
50422 for (int j = 0; j < 2; ++j)
50423 if (Operands[j].getValueType() != VT)
50424 if (!IsZExtLike(Operands[j]))
50425 return SDValue();
50426
50427 // The pattern is detected; emit ISD::AVGCEILU node(s).
50428 return AVGSplitter({Operands[0], Operands[1]});
50429 }
50430
50431 return SDValue();
50432}
50433
50434 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
50435 TargetLowering::DAGCombinerInfo &DCI,
50436 const X86Subtarget &Subtarget) {
50437 LoadSDNode *Ld = cast<LoadSDNode>(N);
50438 EVT RegVT = Ld->getValueType(0);
50439 EVT MemVT = Ld->getMemoryVT();
50440 SDLoc dl(Ld);
50441 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50442
50443 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
50444 // into two 16-byte operations. Also split non-temporal aligned loads on
50445 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
50446 ISD::LoadExtType Ext = Ld->getExtensionType();
50447 unsigned Fast;
50448 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
50449 Ext == ISD::NON_EXTLOAD &&
50450 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
50451 Ld->getAlign() >= Align(16)) ||
50452 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
50453 *Ld->getMemOperand(), &Fast) &&
50454 !Fast))) {
50455 unsigned NumElems = RegVT.getVectorNumElements();
50456 if (NumElems < 2)
50457 return SDValue();
50458
50459 unsigned HalfOffset = 16;
50460 SDValue Ptr1 = Ld->getBasePtr();
50461 SDValue Ptr2 =
50462 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
50463 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
50464 NumElems / 2);
50465 SDValue Load1 =
50466 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
50467 Ld->getOriginalAlign(),
50468 Ld->getMemOperand()->getFlags());
50469 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
50470 Ld->getPointerInfo().getWithOffset(HalfOffset),
50471 Ld->getOriginalAlign(),
50472 Ld->getMemOperand()->getFlags());
50473 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
50474 Load1.getValue(1), Load2.getValue(1));
50475
50476 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
50477 return DCI.CombineTo(N, NewVec, TF, true);
50478 }
50479
50480 // Bool vector load - attempt to cast to an integer, as we have good
50481 // (vXiY *ext(vXi1 bitcast(iX))) handling.
50482 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
50483 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
50484 unsigned NumElts = RegVT.getVectorNumElements();
50485 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50486 if (TLI.isTypeLegal(IntVT)) {
50487 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
50488 Ld->getPointerInfo(),
50489 Ld->getOriginalAlign(),
50490 Ld->getMemOperand()->getFlags());
50491 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
50492 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
50493 }
50494 }
50495
50496 // If we also load/broadcast this to a wider type, then just extract the
50497 // lowest subvector.
50498 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
50499 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
50500 SDValue Ptr = Ld->getBasePtr();
50501 SDValue Chain = Ld->getChain();
50502 for (SDNode *User : Chain->uses()) {
50503 auto *UserLd = dyn_cast<MemSDNode>(User);
50504 if (User != N && UserLd &&
50505 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
50506 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
50507 ISD::isNormalLoad(User)) &&
50508 UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&
50509 User->getValueSizeInBits(0).getFixedValue() >
50510 RegVT.getFixedSizeInBits()) {
50511 if (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
50512 UserLd->getBasePtr() == Ptr &&
50513 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits()) {
50514 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
50515 RegVT.getSizeInBits());
50516 Extract = DAG.getBitcast(RegVT, Extract);
50517 return DCI.CombineTo(N, Extract, SDValue(User, 1));
50518 }
50519 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
50520 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
50521 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
50522 if (Undefs[I])
50523 continue;
50524 if (UserUndefs[I] || Bits[I] != UserBits[I])
50525 return false;
50526 }
50527 return true;
50528 };
50529 // See if we are loading a constant that matches in the lower
50530 // bits of a longer constant (but from a different constant pool ptr).
50531 EVT UserVT = User->getValueType(0);
50532 SDValue UserPtr = UserLd->getBasePtr();
50533 const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
50534 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
50535 if (LdC && UserC && UserPtr != Ptr) {
50536 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
50537 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
50538 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
50539 APInt Undefs, UserUndefs;
50540 SmallVector<APInt> Bits, UserBits;
50541 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
50542 UserVT.getScalarSizeInBits());
50543 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
50544 Bits) &&
50545 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
50546 UserUndefs, UserBits)) {
50547 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
50548 SDValue Extract = extractSubVector(
50549 SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits());
50550 Extract = DAG.getBitcast(RegVT, Extract);
50551 return DCI.CombineTo(N, Extract, SDValue(User, 1));
50552 }
50553 }
50554 }
50555 }
50556 }
50557 }
50558 }
50559
50560 // Cast ptr32 and ptr64 pointers to the default address space before a load.
50561 unsigned AddrSpace = Ld->getAddressSpace();
50562 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
50563 AddrSpace == X86AS::PTR32_UPTR) {
50564 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
50565 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
50566 SDValue Cast =
50567 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
50568 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
50569 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
50570 Ld->getMemOperand()->getFlags());
50571 }
50572 }
50573
50574 return SDValue();
50575}
50576
50577/// If V is a build vector of boolean constants and exactly one of those
50578/// constants is true, return the operand index of that true element.
50579/// Otherwise, return -1.
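/// For example, the mask <i1 0, i1 0, i1 1, i1 0> returns index 2, while
/// <i1 0, i1 1, i1 1, i1 0> (two true lanes) and non-constant masks return -1.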
50580static int getOneTrueElt(SDValue V) {
50581 // This needs to be a build vector of booleans.
50582 // TODO: Checking for the i1 type matches the IR definition for the mask,
50583 // but the mask check could be loosened to i8 or other types. That might
50584 // also require checking more than 'allOnesValue'; e.g., the x86 HW
50585 // instructions only require that the MSB is set for each mask element.
50586 // The ISD::MSTORE comments/definition do not specify how the mask operand
50587 // is formatted.
50588 auto *BV = dyn_cast<BuildVectorSDNode>(V);
50589 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
50590 return -1;
50591
50592 int TrueIndex = -1;
50593 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
50594 for (unsigned i = 0; i < NumElts; ++i) {
50595 const SDValue &Op = BV->getOperand(i);
50596 if (Op.isUndef())
50597 continue;
50598 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
50599 if (!ConstNode)
50600 return -1;
50601 if (ConstNode->getAPIntValue().countr_one() >= 1) {
50602 // If we already found a one, this is too many.
50603 if (TrueIndex >= 0)
50604 return -1;
50605 TrueIndex = i;
50606 }
50607 }
50608 return TrueIndex;
50609}
50610
50611/// Given a masked memory load/store operation, return true if it has one mask
50612/// bit set. If it has one mask bit set, then also return the memory address of
50613/// the scalar element to load/store, the vector index to insert/extract that
50614/// scalar element, and the alignment for the scalar memory access.
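/// For example (illustrative values), a masked operation on <4 x float> whose
/// mask has only lane 2 set yields Addr == BasePtr + 8, Index == 2, and an
/// alignment of commonAlignment(original alignment, 4).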
50615 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
50616 SelectionDAG &DAG, SDValue &Addr,
50617 SDValue &Index, Align &Alignment,
50618 unsigned &Offset) {
50619 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
50620 if (TrueMaskElt < 0)
50621 return false;
50622
50623 // Get the address of the one scalar element that is specified by the mask
50624 // using the appropriate offset from the base pointer.
50625 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
50626 Offset = 0;
50627 Addr = MaskedOp->getBasePtr();
50628 if (TrueMaskElt != 0) {
50629 Offset = TrueMaskElt * EltVT.getStoreSize();
50630 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
50631 SDLoc(MaskedOp));
50632 }
50633
50634 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
50635 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
50636 EltVT.getStoreSize());
50637 return true;
50638}
50639
50640/// If exactly one element of the mask is set for a non-extending masked load,
50641/// it is a scalar load and vector insert.
50642/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
50643/// mask have already been optimized in IR, so we don't bother with those here.
50644static SDValue
50645 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
50646 TargetLowering::DAGCombinerInfo &DCI,
50647 const X86Subtarget &Subtarget) {
50648 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
50649 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
50650 // However, some target hooks may need to be added to know when the transform
50651 // is profitable. Endianness would also have to be considered.
50652
50653 SDValue Addr, VecIndex;
50654 Align Alignment;
50655 unsigned Offset;
50656 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
50657 return SDValue();
50658
50659 // Load the one scalar element that is specified by the mask using the
50660 // appropriate offset from the base pointer.
50661 SDLoc DL(ML);
50662 EVT VT = ML->getValueType(0);
50663 EVT EltVT = VT.getVectorElementType();
50664
50665 EVT CastVT = VT;
50666 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
50667 EltVT = MVT::f64;
50668 CastVT = VT.changeVectorElementType(EltVT);
50669 }
50670
50671 SDValue Load =
50672 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
50673 ML->getPointerInfo().getWithOffset(Offset),
50674 Alignment, ML->getMemOperand()->getFlags());
50675
50676 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
50677
50678 // Insert the loaded element into the appropriate place in the vector.
50679 SDValue Insert =
50680 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
50681 Insert = DAG.getBitcast(VT, Insert);
50682 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
50683}
50684
50685static SDValue
50686 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
50687 TargetLowering::DAGCombinerInfo &DCI) {
50688 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
50689 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
50690 return SDValue();
50691
50692 SDLoc DL(ML);
50693 EVT VT = ML->getValueType(0);
50694
50695 // If we are loading the first and last elements of a vector, it is safe and
50696 // always faster to load the whole vector. Replace the masked load with a
50697 // vector load and select.
50698 unsigned NumElts = VT.getVectorNumElements();
50699 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
50700 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
50701 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
50702 if (LoadFirstElt && LoadLastElt) {
50703 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
50704 ML->getMemOperand());
50705 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
50706 ML->getPassThru());
50707 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
50708 }
50709
50710 // Convert a masked load with a constant mask into a masked load and a select.
50711 // This allows the select operation to use a faster kind of select instruction
50712 // (for example, vblendvps -> vblendps).
50713
50714 // Don't try this if the pass-through operand is already undefined. That would
50715 // cause an infinite loop because that's what we're about to create.
50716 if (ML->getPassThru().isUndef())
50717 return SDValue();
50718
50719 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
50720 return SDValue();
50721
50722 // The new masked load has an undef pass-through operand. The select uses the
50723 // original pass-through operand.
50724 SDValue NewML = DAG.getMaskedLoad(
50725 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
50726 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
50727 ML->getAddressingMode(), ML->getExtensionType());
50728 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
50729 ML->getPassThru());
50730
50731 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
50732}
50733
50734 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
50735 TargetLowering::DAGCombinerInfo &DCI,
50736 const X86Subtarget &Subtarget) {
50737 auto *Mld = cast<MaskedLoadSDNode>(N);
50738
50739 // TODO: Expanding load with constant mask may be optimized as well.
50740 if (Mld->isExpandingLoad())
50741 return SDValue();
50742
50743 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
50744 if (SDValue ScalarLoad =
50745 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
50746 return ScalarLoad;
50747
50748 // TODO: Do some AVX512 subsets benefit from this transform?
50749 if (!Subtarget.hasAVX512())
50750 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
50751 return Blend;
50752 }
50753
50754 // If the mask value has been legalized to a non-boolean vector, try to
50755 // simplify ops leading up to it. We only demand the MSB of each lane.
50756 SDValue Mask = Mld->getMask();
50757 if (Mask.getScalarValueSizeInBits() != 1) {
50758 EVT VT = Mld->getValueType(0);
50759 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50760 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
50761 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
50762 if (N->getOpcode() != ISD::DELETED_NODE)
50763 DCI.AddToWorklist(N);
50764 return SDValue(N, 0);
50765 }
50766 if (SDValue NewMask =
50767 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
50768 return DAG.getMaskedLoad(
50769 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
50770 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
50771 Mld->getAddressingMode(), Mld->getExtensionType());
50772 }
50773
50774 return SDValue();
50775}
50776
50777/// If exactly one element of the mask is set for a non-truncating masked store,
50778/// it is a vector extract and scalar store.
50779/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
50780/// mask have already been optimized in IR, so we don't bother with those here.
50781 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
50782 SelectionDAG &DAG,
50783 const X86Subtarget &Subtarget) {
50784 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
50785 // However, some target hooks may need to be added to know when the transform
50786 // is profitable. Endianness would also have to be considered.
50787
50788 SDValue Addr, VecIndex;
50789 Align Alignment;
50790 unsigned Offset;
50791 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
50792 return SDValue();
50793
50794 // Extract the one scalar element that is actually being stored.
50795 SDLoc DL(MS);
50796 SDValue Value = MS->getValue();
50797 EVT VT = Value.getValueType();
50798 EVT EltVT = VT.getVectorElementType();
50799 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
50800 EltVT = MVT::f64;
50801 EVT CastVT = VT.changeVectorElementType(EltVT);
50802 Value = DAG.getBitcast(CastVT, Value);
50803 }
50804 SDValue Extract =
50805 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
50806
50807 // Store that element at the appropriate offset from the base pointer.
50808 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
50809 MS->getPointerInfo().getWithOffset(Offset),
50810 Alignment, MS->getMemOperand()->getFlags());
50811}
50812
50813 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
50814 TargetLowering::DAGCombinerInfo &DCI,
50815 const X86Subtarget &Subtarget) {
50816 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
50817 if (Mst->isCompressingStore())
50818 return SDValue();
50819
50820 EVT VT = Mst->getValue().getValueType();
50821 SDLoc dl(Mst);
50822 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50823
50824 if (Mst->isTruncatingStore())
50825 return SDValue();
50826
50827 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
50828 return ScalarStore;
50829
50830 // If the mask value has been legalized to a non-boolean vector, try to
50831 // simplify ops leading up to it. We only demand the MSB of each lane.
50832 SDValue Mask = Mst->getMask();
50833 if (Mask.getScalarValueSizeInBits() != 1) {
50834 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
50835 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
50836 if (N->getOpcode() != ISD::DELETED_NODE)
50837 DCI.AddToWorklist(N);
50838 return SDValue(N, 0);
50839 }
50840 if (SDValue NewMask =
50841 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
50842 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
50843 Mst->getBasePtr(), Mst->getOffset(), NewMask,
50844 Mst->getMemoryVT(), Mst->getMemOperand(),
50845 Mst->getAddressingMode());
50846 }
50847
50848 SDValue Value = Mst->getValue();
50849 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
50850 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
50851 Mst->getMemoryVT())) {
50852 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
50853 Mst->getBasePtr(), Mst->getOffset(), Mask,
50854 Mst->getMemoryVT(), Mst->getMemOperand(),
50855 Mst->getAddressingMode(), true);
50856 }
50857
50858 return SDValue();
50859}
50860
50861 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
50862 TargetLowering::DAGCombinerInfo &DCI,
50863 const X86Subtarget &Subtarget) {
50864 StoreSDNode *St = cast<StoreSDNode>(N);
50865 EVT StVT = St->getMemoryVT();
50866 SDLoc dl(St);
50867 SDValue StoredVal = St->getValue();
50868 EVT VT = StoredVal.getValueType();
50869 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50870
50871 // Convert a store of vXi1 into a store of iX and a bitcast.
50872 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
50873 VT.getVectorElementType() == MVT::i1) {
50874
50875 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
50876 StoredVal = DAG.getBitcast(NewVT, StoredVal);
50877
50878 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50879 St->getPointerInfo(), St->getOriginalAlign(),
50880 St->getMemOperand()->getFlags());
50881 }
50882
50883 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
50884 // This will avoid a copy to k-register.
50885 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
50886 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
50887 StoredVal.getOperand(0).getValueType() == MVT::i8) {
50888 SDValue Val = StoredVal.getOperand(0);
50889 // We must store zeros to the unused bits.
50890 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
50891 return DAG.getStore(St->getChain(), dl, Val,
50892 St->getBasePtr(), St->getPointerInfo(),
50893 St->getOriginalAlign(),
50894 St->getMemOperand()->getFlags());
50895 }
50896
50897 // Widen v2i1/v4i1 stores to v8i1.
50898 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
50899 Subtarget.hasAVX512()) {
50900 unsigned NumConcats = 8 / VT.getVectorNumElements();
50901 // We must store zeros to the unused bits.
50902 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
50903 Ops[0] = StoredVal;
50904 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
50905 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50906 St->getPointerInfo(), St->getOriginalAlign(),
50907 St->getMemOperand()->getFlags());
50908 }
50909
50910 // Turn vXi1 stores of constants into a scalar store.
50911 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
50912 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
50913 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
50914 // If it's a v64i1 store without 64-bit support, we need two stores.
50915 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
50916 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
50917 StoredVal->ops().slice(0, 32));
50918 Lo = combinevXi1ConstantToInteger(Lo, DAG);
50919 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
50920 StoredVal->ops().slice(32, 32));
50921 Hi = combinevXi1ConstantToInteger(Hi, DAG);
50922
50923 SDValue Ptr0 = St->getBasePtr();
50924 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
50925
50926 SDValue Ch0 =
50927 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
50928 St->getOriginalAlign(),
50929 St->getMemOperand()->getFlags());
50930 SDValue Ch1 =
50931 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
50932 St->getPointerInfo().getWithOffset(4),
50933 St->getOriginalAlign(),
50934 St->getMemOperand()->getFlags());
50935 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
50936 }
50937
50938 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
50939 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50940 St->getPointerInfo(), St->getOriginalAlign(),
50941 St->getMemOperand()->getFlags());
50942 }
50943
50944 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
50945 // Sandy Bridge, perform two 16-byte stores.
50946 unsigned Fast;
50947 if (VT.is256BitVector() && StVT == VT &&
50948 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
50949 *St->getMemOperand(), &Fast) &&
50950 !Fast) {
50951 unsigned NumElems = VT.getVectorNumElements();
50952 if (NumElems < 2)
50953 return SDValue();
50954
50955 return splitVectorStore(St, DAG);
50956 }
50957
50958 // Split under-aligned vector non-temporal stores.
50959 if (St->isNonTemporal() && StVT == VT &&
50960 St->getAlign().value() < VT.getStoreSize()) {
50961 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
50962 // vectors or the legalizer can scalarize it to use MOVNTI.
50963 if (VT.is256BitVector() || VT.is512BitVector()) {
50964 unsigned NumElems = VT.getVectorNumElements();
50965 if (NumElems < 2)
50966 return SDValue();
50967 return splitVectorStore(St, DAG);
50968 }
50969
50970 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
50971 // to use MOVNTI.
50972 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
50973 MVT NTVT = Subtarget.hasSSE4A()
50974 ? MVT::v2f64
50975 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
50976 return scalarizeVectorStore(St, NTVT, DAG);
50977 }
50978 }
50979
50980 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
50981 // supported but AVX512F is, by extending to v16i32 and truncating.
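// Illustrative sketch (annotation, not part of the original source):
//   store (trunc v16i16 %x to v16i8), ptr %p
// is rewritten below as an any_extend of the original v16i16 value to v16i32
// followed by a legal v16i32->v16i8 truncating store, which AVX512F can
// lower directly (e.g. VPMOVDB).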
50982 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
50983 St->getValue().getOpcode() == ISD::TRUNCATE &&
50984 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
50985 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
50986 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
50987 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
50988 St->getValue().getOperand(0));
50989 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
50990 MVT::v16i8, St->getMemOperand());
50991 }
50992
50993 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
50994 if (!St->isTruncatingStore() &&
50995 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
50996 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
50997 StoredVal.hasOneUse() &&
50998 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
50999 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
51000 return EmitTruncSStore(IsSigned, St->getChain(),
51001 dl, StoredVal.getOperand(0), St->getBasePtr(),
51002 VT, St->getMemOperand(), DAG);
51003 }
51004
51005 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
51006 if (!St->isTruncatingStore()) {
51007 auto IsExtractedElement = [](SDValue V) {
51008 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
51009 V = V.getOperand(0);
51010 unsigned Opc = V.getOpcode();
51011 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
51012 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
51013 V.getOperand(0).hasOneUse())
51014 return V.getOperand(0);
51015 return SDValue();
51016 };
51017 if (SDValue Extract = IsExtractedElement(StoredVal)) {
51018 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
51019 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
51020 SDValue Src = Trunc.getOperand(0);
51021 MVT DstVT = Trunc.getSimpleValueType();
51022 MVT SrcVT = Src.getSimpleValueType();
51023 unsigned NumSrcElts = SrcVT.getVectorNumElements();
51024 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
51025 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
51026 if (NumTruncBits == VT.getSizeInBits() &&
51027 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
51028 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
51029 TruncVT, St->getMemOperand());
51030 }
51031 }
51032 }
51033 }
51034
51035 // Optimize trunc store (of multiple scalars) to shuffle and store.
51036 // First, pack all of the elements in one place. Next, store to memory
51037 // in fewer chunks.
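// Illustrative sketch (annotation, not part of the original source): when
// the stored value is already clamped to the destination range, e.g.
//   store (trunc (smin (smax %x, -128), 127)) to vXi8
// the code below emits a signed-saturating truncating store instead,
// provided the truncating store is legal for these types.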
51038 if (St->isTruncatingStore() && VT.isVector()) {
51039 // Check if we can detect an AVG pattern from the truncation. If yes,
51040 // replace the trunc store by a normal store with the result of X86ISD::AVG
51041 // instruction.
51042 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
51043 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
51044 Subtarget, dl))
51045 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
51046 St->getPointerInfo(), St->getOriginalAlign(),
51047 St->getMemOperand()->getFlags());
51048
51049 if (TLI.isTruncStoreLegal(VT, StVT)) {
51050 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
51051 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
51052 dl, Val, St->getBasePtr(),
51053 St->getMemoryVT(), St->getMemOperand(), DAG);
51054 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
51055 DAG, dl))
51056 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
51057 dl, Val, St->getBasePtr(),
51058 St->getMemoryVT(), St->getMemOperand(), DAG);
51059 }
51060
51061 return SDValue();
51062 }
51063
51064 // Cast ptr32 and ptr64 pointers to the default address space before a store.
51065 unsigned AddrSpace = St->getAddressSpace();
51066 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51067 AddrSpace == X86AS::PTR32_UPTR) {
51068 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51069 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
51070 SDValue Cast =
51071 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
51072 return DAG.getTruncStore(
51073 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
51074 St->getOriginalAlign(), St->getMemOperand()->getFlags(),
51075 St->getAAInfo());
51076 }
51077 }
51078
51079 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
51080 // the FP state in cases where an emms may be missing.
51081 // A preferable solution to the general problem is to figure out the right
51082 // places to insert EMMS. This qualifies as a quick hack.
51083
51084 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
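// Illustrative sketch (annotation, not part of the original source): on a
// 32-bit target with SSE2, the pair
//   %v = load i64, ptr %p
//   store i64 %v, ptr %q
// is lowered below as a single f64 (MOVQ) load/store pair instead of two
// 32-bit GPR load/store pairs.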
51085 if (VT.getSizeInBits() != 64)
51086 return SDValue();
51087
51088 const Function &F = DAG.getMachineFunction().getFunction();
51089 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
51090 bool F64IsLegal =
51091 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
51092
51093 if (!F64IsLegal || Subtarget.is64Bit())
51094 return SDValue();
51095
51096 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
51097 cast<LoadSDNode>(St->getValue())->isSimple() &&
51098 St->getChain().hasOneUse() && St->isSimple()) {
51099 auto *Ld = cast<LoadSDNode>(St->getValue());
51100
51101 if (!ISD::isNormalLoad(Ld))
51102 return SDValue();
51103
51104 // Avoid the transformation if there are multiple uses of the loaded value.
51105 if (!Ld->hasNUsesOfValue(1, 0))
51106 return SDValue();
51107
51108 SDLoc LdDL(Ld);
51109 SDLoc StDL(N);
51110 // Lower to a single movq load/store pair.
51111 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
51112 Ld->getBasePtr(), Ld->getMemOperand());
51113
51114 // Make sure new load is placed in same chain order.
51115 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
51116 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
51117 St->getMemOperand());
51118 }
51119
51120 // This is similar to the above case, but here we handle a scalar 64-bit
51121 // integer store that is extracted from a vector on a 32-bit target.
51122 // If we have SSE2, then we can treat it like a floating-point double
51123 // to get past legalization. The execution dependencies fixup pass will
51124 // choose the optimal machine instruction for the store if this really is
51125 // an integer or v2f32 rather than an f64.
51126 if (VT == MVT::i64 &&
51127 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
51128 SDValue OldExtract = St->getOperand(1);
51129 SDValue ExtOp0 = OldExtract.getOperand(0);
51130 unsigned VecSize = ExtOp0.getValueSizeInBits();
51131 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
51132 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
51133 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
51134 BitCast, OldExtract.getOperand(1));
51135 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
51136 St->getPointerInfo(), St->getOriginalAlign(),
51137 St->getMemOperand()->getFlags());
51138 }
51139
51140 return SDValue();
51141}
51142
51143 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
51144 TargetLowering::DAGCombinerInfo &DCI,
51145 const X86Subtarget &Subtarget) {
51146 auto *St = cast<MemIntrinsicSDNode>(N);
51147
51148 SDValue StoredVal = N->getOperand(1);
51149 MVT VT = StoredVal.getSimpleValueType();
51150 EVT MemVT = St->getMemoryVT();
51151
51152 // Figure out which elements we demand.
51153 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
51154 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
51155
51156 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51157 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
51158 if (N->getOpcode() != ISD::DELETED_NODE)
51159 DCI.AddToWorklist(N);
51160 return SDValue(N, 0);
51161 }
51162
51163 return SDValue();
51164}
51165
51166/// Return 'true' if this vector operation is "horizontal"
51167/// and return the operands for the horizontal operation in LHS and RHS. A
51168/// horizontal operation performs the binary operation on successive elements
51169/// of its first operand, then on successive elements of its second operand,
51170/// returning the resulting values in a vector. For example, if
51171/// A = < float a0, float a1, float a2, float a3 >
51172/// and
51173/// B = < float b0, float b1, float b2, float b3 >
51174/// then the result of doing a horizontal operation on A and B is
51175/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
51176/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
51177/// A horizontal-op B, for some already available A and B, and if so then LHS is
51178/// set to A, RHS to B, and the routine returns 'true'.
51179static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
51180 SelectionDAG &DAG, const X86Subtarget &Subtarget,
51181 bool IsCommutative,
51182 SmallVectorImpl<int> &PostShuffleMask) {
51183 // If either operand is undef, bail out. The binop should be simplified.
51184 if (LHS.isUndef() || RHS.isUndef())
51185 return false;
51186
51187 // Look for the following pattern:
51188 // A = < float a0, float a1, float a2, float a3 >
51189 // B = < float b0, float b1, float b2, float b3 >
51190 // and
51191 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
51192 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
51193 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
51194 // which is A horizontal-op B.
51195
51196 MVT VT = LHS.getSimpleValueType();
51197 assert((VT.is128BitVector() || VT.is256BitVector()) &&
51198 "Unsupported vector type for horizontal add/sub");
51199 unsigned NumElts = VT.getVectorNumElements();
51200
51201 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
51202 SmallVectorImpl<int> &ShuffleMask) {
51203 bool UseSubVector = false;
51204 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
51205 Op.getOperand(0).getValueType().is256BitVector() &&
51206 llvm::isNullConstant(Op.getOperand(1))) {
51207 Op = Op.getOperand(0);
51208 UseSubVector = true;
51209 }
51210 SmallVector<SDValue, 2> SrcOps;
51211 SmallVector<int, 16> SrcMask, ScaledMask;
51212 SDValue BC = peekThroughBitcasts(Op);
51213 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
51214 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
51215 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
51216 })) {
51217 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
51218 if (!UseSubVector && SrcOps.size() <= 2 &&
51219 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
51220 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
51221 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
51222 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
51223 }
51224 if (UseSubVector && SrcOps.size() == 1 &&
51225 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
51226 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
51227 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
51228 ShuffleMask.assign(Mask.begin(), Mask.end());
51229 }
51230 }
51231 };
51232
51233 // View LHS in the form
51234 // LHS = VECTOR_SHUFFLE A, B, LMask
51235 // If LHS is not a shuffle, then pretend it is the identity shuffle:
51236 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
51237 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
51238 SDValue A, B;
51239 SmallVector<int, 16> LMask;
51240 GetShuffle(LHS, A, B, LMask);
51241
51242 // Likewise, view RHS in the form
51243 // RHS = VECTOR_SHUFFLE C, D, RMask
51244 SDValue C, D;
51245 SmallVector<int, 16> RMask;
51246 GetShuffle(RHS, C, D, RMask);
51247
51248 // At least one of the operands should be a vector shuffle.
51249 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
51250 if (NumShuffles == 0)
51251 return false;
51252
51253 if (LMask.empty()) {
51254 A = LHS;
51255 for (unsigned i = 0; i != NumElts; ++i)
51256 LMask.push_back(i);
51257 }
51258
51259 if (RMask.empty()) {
51260 C = RHS;
51261 for (unsigned i = 0; i != NumElts; ++i)
51262 RMask.push_back(i);
51263 }
51264
51265 // If we have a unary mask, ensure the other op is set to null.
51266 if (isUndefOrInRange(LMask, 0, NumElts))
51267 B = SDValue();
51268 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
51269 A = SDValue();
51270
51271 if (isUndefOrInRange(RMask, 0, NumElts))
51272 D = SDValue();
51273 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
51274 C = SDValue();
51275
51276 // If A and B occur in reverse order in RHS, then canonicalize by commuting
51277 // RHS operands and shuffle mask.
51278 if (A != C) {
51279 std::swap(C, D);
51280 ShuffleVectorSDNode::commuteShuffleMask(RMask, NumElts);
51281 }
51282 // Check that the shuffles are both shuffling the same vectors.
51283 if (!(A == C && B == D))
51284 return false;
51285
51286 PostShuffleMask.clear();
51287 PostShuffleMask.append(NumElts, SM_SentinelUndef);
51288
51289 // LHS and RHS are now:
51290 // LHS = shuffle A, B, LMask
51291 // RHS = shuffle A, B, RMask
51292 // Check that the masks correspond to performing a horizontal operation.
51293 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
51294 // so we just repeat the inner loop if this is a 256-bit op.
51295 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
51296 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
51297 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
51298 assert((NumEltsPer128BitChunk % 2 == 0) &&
51299 "Vector type should have an even number of elements in each lane");
51300 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
51301 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
51302 // Ignore undefined components.
51303 int LIdx = LMask[i + j], RIdx = RMask[i + j];
51304 if (LIdx < 0 || RIdx < 0 ||
51305 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
51306 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
51307 continue;
51308
51309 // Check that successive odd/even elements are being operated on. If not,
51310 // this is not a horizontal operation.
51311 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
51312 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
51313 return false;
51314
51315 // Compute the post-shuffle mask index based on where the element
51316 // is stored in the HOP result, and where it needs to be moved to.
51317 int Base = LIdx & ~1u;
51318 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
51319 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
51320
51321 // The low half of the 128-bit result must choose from A.
51322 // The high half of the 128-bit result must choose from B,
51323 // unless B is undef. In that case, we are always choosing from A.
51324 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
51325 Index += NumEltsPer64BitChunk;
51326 PostShuffleMask[i + j] = Index;
51327 }
51328 }
51329
51330 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
51331 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
51332
51333 bool IsIdentityPostShuffle =
51334 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
51335 if (IsIdentityPostShuffle)
51336 PostShuffleMask.clear();
51337
51338 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
51339 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
51340 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
51341 return false;
51342
51343 // If the source nodes are already used in HorizOps then always accept this.
51344 // Shuffle folding should merge these back together.
51345 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
51346 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51347 });
51348 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
51349 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51350 });
51351 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
51352
51353 // Assume a SingleSource HOP if we only shuffle one input and don't need to
51354 // shuffle the result.
51355 if (!ForceHorizOp &&
51356 !shouldUseHorizontalOp(NewLHS == NewRHS &&
51357 (NumShuffles < 2 || !IsIdentityPostShuffle),
51358 DAG, Subtarget))
51359 return false;
51360
51361 LHS = DAG.getBitcast(VT, NewLHS);
51362 RHS = DAG.getBitcast(VT, NewRHS);
51363 return true;
51364}
51365
51366// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
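// Illustrative sketch (annotation, not part of the original source): for
// v4f32 on an SSE3 target,
//   fadd (shuffle %a, %b, <0,2,4,6>), (shuffle %a, %b, <1,3,5,7>)
// becomes X86ISD::FHADD %a, %b (i.e. HADDPS), with an optional post-shuffle
// if the element order still needs fixing up.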
51367 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
51368 const X86Subtarget &Subtarget) {
51369 EVT VT = N->getValueType(0);
51370 unsigned Opcode = N->getOpcode();
51371 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
51372 SmallVector<int, 8> PostShuffleMask;
51373
51374 switch (Opcode) {
51375 case ISD::FADD:
51376 case ISD::FSUB:
51377 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
51378 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
51379 SDValue LHS = N->getOperand(0);
51380 SDValue RHS = N->getOperand(1);
51381 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
51382 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51383 PostShuffleMask)) {
51384 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
51385 if (!PostShuffleMask.empty())
51386 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51387 DAG.getUNDEF(VT), PostShuffleMask);
51388 return HorizBinOp;
51389 }
51390 }
51391 break;
51392 case ISD::ADD:
51393 case ISD::SUB:
51394 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
51395 VT == MVT::v16i16 || VT == MVT::v8i32)) {
51396 SDValue LHS = N->getOperand(0);
51397 SDValue RHS = N->getOperand(1);
51398 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
51399 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51400 PostShuffleMask)) {
51401 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
51402 ArrayRef<SDValue> Ops) {
51403 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
51404 };
51405 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
51406 {LHS, RHS}, HOpBuilder);
51407 if (!PostShuffleMask.empty())
51408 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51409 DAG.getUNDEF(VT), PostShuffleMask);
51410 return HorizBinOp;
51411 }
51412 }
51413 break;
51414 }
51415
51416 return SDValue();
51417}
51418
51419// Try to combine the following nodes
51420// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
51421// <i32 -2147483648[float -0.000000e+00]> 0
51422// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
51423// <(load 4 from constant-pool)> t0, t29
51424// [t30: v16i32 = bitcast t27]
51425// t6: v16i32 = xor t7, t27[t30]
51426// t11: v16f32 = bitcast t6
51427// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
51428// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
51429// t22: v16f32 = bitcast t7
51430// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
51431// t24: v32f16 = bitcast t23
51432 static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
51433 const X86Subtarget &Subtarget) {
51434 EVT VT = N->getValueType(0);
51435 SDValue LHS = N->getOperand(0);
51436 SDValue RHS = N->getOperand(1);
51437 int CombineOpcode =
51438 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
51439 auto combineConjugation = [&](SDValue &r) {
51440 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
51441 SDValue XOR = LHS.getOperand(0);
51442 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
51443 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
51444 if (XORRHS.isConstant()) {
51445 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
51446 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
51447 if ((XORRHS.getBitWidth() == 32 &&
51448 XORRHS.getConstant() == ConjugationInt32) ||
51449 (XORRHS.getBitWidth() == 64 &&
51450 XORRHS.getConstant() == ConjugationInt64)) {
51451 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
51452 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
51453 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
51454 r = DAG.getBitcast(VT, FCMulC);
51455 return true;
51456 }
51457 }
51458 }
51459 }
51460 return false;
51461 };
51462 SDValue Res;
51463 if (combineConjugation(Res))
51464 return Res;
51465 std::swap(LHS, RHS);
51466 if (combineConjugation(Res))
51467 return Res;
51468 return Res;
51469}
51470
51471// Try to combine the following nodes:
51472// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
51473 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
51474 const X86Subtarget &Subtarget) {
51475 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
51476 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
51477 Flags.hasAllowContract();
51478 };
51479
51480 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
51481 return DAG.getTarget().Options.NoSignedZerosFPMath ||
51482 Flags.hasNoSignedZeros();
51483 };
51484 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
51485 APInt AI = APInt(32, 0x80008000, true);
51486 KnownBits Bits = DAG.computeKnownBits(Op);
51487 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
51488 Bits.getConstant() == AI;
51489 };
51490
51491 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
51492 !AllowContract(N->getFlags()))
51493 return SDValue();
51494
51495 EVT VT = N->getValueType(0);
51496 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
51497 return SDValue();
51498
51499 SDValue LHS = N->getOperand(0);
51500 SDValue RHS = N->getOperand(1);
51501 bool IsConj;
51502 SDValue FAddOp1, MulOp0, MulOp1;
51503 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
51504 &IsVectorAllNegativeZero,
51505 &HasNoSignedZero](SDValue N) -> bool {
51506 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
51507 return false;
51508 SDValue Op0 = N.getOperand(0);
51509 unsigned Opcode = Op0.getOpcode();
51510 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
51511 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
51512 MulOp0 = Op0.getOperand(0);
51513 MulOp1 = Op0.getOperand(1);
51514 IsConj = Opcode == X86ISD::VFCMULC;
51515 return true;
51516 }
51517 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
51518 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
51519 HasNoSignedZero(Op0->getFlags())) ||
51520 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
51521 MulOp0 = Op0.getOperand(0);
51522 MulOp1 = Op0.getOperand(1);
51523 IsConj = Opcode == X86ISD::VFCMADDC;
51524 return true;
51525 }
51526 }
51527 return false;
51528 };
51529
51530 if (GetCFmulFrom(LHS))
51531 FAddOp1 = RHS;
51532 else if (GetCFmulFrom(RHS))
51533 FAddOp1 = LHS;
51534 else
51535 return SDValue();
51536
51537 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
51538 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
51539 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
51540 // FIXME: How do we handle when fast math flags of FADD are different from
51541 // CFMUL's?
51542 SDValue CFmul =
51543 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
51544 return DAG.getBitcast(VT, CFmul);
51545}
51546
51547/// Do target-specific dag combines on floating-point adds/subs.
51548 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
51549 const X86Subtarget &Subtarget) {
51550 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
51551 return HOp;
51552
51553 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
51554 return COp;
51555
51556 return SDValue();
51557}
51558
51559/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
51560/// the codegen.
51561/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
51562/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
51563/// anything that is guaranteed to be transformed by DAGCombiner.
51564 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
51565 const X86Subtarget &Subtarget,
51566 const SDLoc &DL) {
51567 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
51568 SDValue Src = N->getOperand(0);
51569 unsigned SrcOpcode = Src.getOpcode();
51570 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51571
51572 EVT VT = N->getValueType(0);
51573 EVT SrcVT = Src.getValueType();
51574
51575 auto IsFreeTruncation = [VT](SDValue Op) {
51576 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
51577
51578 // See if this has been extended from a smaller/equal size to
51579 // the truncation size, allowing a truncation to combine with the extend.
51580 unsigned Opcode = Op.getOpcode();
51581 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
51582 Opcode == ISD::ZERO_EXTEND) &&
51583 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
51584 return true;
51585
51586 // See if this is a single use constant which can be constant folded.
51587 // NOTE: We don't peek through bitcasts here because there is currently
51588 // no support for constant folding truncate+bitcast+vector_of_constants. So
51589 // we'll just end up with a truncate on both operands which will
51590 // get turned back into (truncate (binop)) causing an infinite loop.
51591 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
51592 };
51593
51594 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
51595 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
51596 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
51597 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
51598 };
51599
51600 // Don't combine if the operation has other uses.
51601 if (!Src.hasOneUse())
51602 return SDValue();
51603
51604 // Only support vector truncation for now.
51605 // TODO: i64 scalar math would benefit as well.
51606 if (!VT.isVector())
51607 return SDValue();
51608
51609 // In most cases it's only worth pre-truncating if we're only facing the cost
51610 // of one truncation.
51611 // i.e. if one of the inputs will constant fold or the input is repeated.
51612 switch (SrcOpcode) {
51613 case ISD::MUL:
51614 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
51615 // better to truncate if we have the chance.
51616 if (SrcVT.getScalarType() == MVT::i64 &&
51617 TLI.isOperationLegal(SrcOpcode, VT) &&
51618 !TLI.isOperationLegal(SrcOpcode, SrcVT))
51619 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
51620 [[fallthrough]];
51621 case ISD::AND:
51622 case ISD::XOR:
51623 case ISD::OR:
51624 case ISD::ADD:
51625 case ISD::SUB: {
51626 SDValue Op0 = Src.getOperand(0);
51627 SDValue Op1 = Src.getOperand(1);
51628 if (TLI.isOperationLegal(SrcOpcode, VT) &&
51629 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
51630 return TruncateArithmetic(Op0, Op1);
51631 break;
51632 }
51633 }
51634
51635 return SDValue();
51636}
51637
51638// Try to form a MULHU or MULHS node by looking for
51639// (trunc (srl (mul ext, ext), 16))
51640// TODO: This is X86 specific because we want to be able to handle wide types
51641// before type legalization. But we can only do it if the vector will be
51642// legalized via widening/splitting. Type legalization can't handle promotion
51643// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
51644// combiner.
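// Illustrative sketch (annotation, not part of the original source):
//   %a = zext <8 x i16> %x to <8 x i32>
//   %b = zext <8 x i16> %y to <8 x i32>
//   %s = lshr (mul %a, %b), 16
//   trunc <8 x i32> %s to <8 x i16>
// is matched below and emitted as a single vXi16 ISD::MULHU (PMULHUW); the
// sign-extended form maps to ISD::MULHS (PMULHW) instead.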
51645static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
51646 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
51647 // First instruction should be a right shift of a multiply.
51648 if (Src.getOpcode() != ISD::SRL ||
51649 Src.getOperand(0).getOpcode() != ISD::MUL)
51650 return SDValue();
51651
51652 if (!Subtarget.hasSSE2())
51653 return SDValue();
51654
51655 // Only handle vXi16 types that are at least 128-bits unless they will be
51656 // widened.
51657 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
51658 return SDValue();
51659
51660 // Input type should be at least vXi32.
51661 EVT InVT = Src.getValueType();
51662 if (InVT.getVectorElementType().getSizeInBits() < 32)
51663 return SDValue();
51664
51665 // Need a shift by 16.
51666 APInt ShiftAmt;
51667 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
51668 ShiftAmt != 16)
51669 return SDValue();
51670
51671 SDValue LHS = Src.getOperand(0).getOperand(0);
51672 SDValue RHS = Src.getOperand(0).getOperand(1);
51673
51674 // Count leading sign/zero bits on both inputs - if there are enough then
51675 // truncation back to vXi16 will be cheap - either as a pack/shuffle
51676 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
51677 // truncations may actually be free by peeking through to the ext source.
51678 auto IsSext = [&DAG](SDValue V) {
51679 return DAG.ComputeMaxSignificantBits(V) <= 16;
51680 };
51681 auto IsZext = [&DAG](SDValue V) {
51682 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
51683 };
51684
51685 bool IsSigned = IsSext(LHS) && IsSext(RHS);
51686 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
51687 if (!IsSigned && !IsUnsigned)
51688 return SDValue();
51689
51690 // Check if both inputs are extensions, which will be removed by truncation.
51691 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
51692 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
51693 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
51694 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
51695 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
51696 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
51697
51698 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
51699 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
51700 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
51701 // will have to split anyway.
51702 unsigned InSizeInBits = InVT.getSizeInBits();
51703 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
51704 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
51705 (InSizeInBits % 16) == 0) {
51706 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51707 InVT.getSizeInBits() / 16);
51708 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
51709 DAG.getBitcast(BCVT, RHS));
51710 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
51711 }
51712
51713 // Truncate back to source type.
51714 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
51715 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
51716
51717 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
51718 return DAG.getNode(Opc, DL, VT, LHS, RHS);
51719}
51720
51721// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
51722// from one vector with signed bytes from another vector, adds together
51723// adjacent pairs of 16-bit products, and saturates the result before
51724// truncating to 16-bits.
51725//
51726// Which looks something like this:
51727// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
51728// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
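// Illustrative note (annotation, not part of the original source): when the
// pattern above is matched, the whole even/odd multiply, add and saturate
// chain collapses to X86ISD::VPMADDUBSW on the two vXi8 inputs, split into
// target-sized pieces by SplitOpsAndApply if the vector is too wide.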
51729 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
51730 const X86Subtarget &Subtarget,
51731 const SDLoc &DL) {
51732 if (!VT.isVector() || !Subtarget.hasSSSE3())
51733 return SDValue();
51734
51735 unsigned NumElems = VT.getVectorNumElements();
51736 EVT ScalarVT = VT.getVectorElementType();
51737 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
51738 return SDValue();
51739
51740 SDValue SSatVal = detectSSatPattern(In, VT);
51741 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
51742 return SDValue();
51743
51744 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
51745 // of multiplies from even/odd elements.
51746 SDValue N0 = SSatVal.getOperand(0);
51747 SDValue N1 = SSatVal.getOperand(1);
51748
51749 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
51750 return SDValue();
51751
51752 SDValue N00 = N0.getOperand(0);
51753 SDValue N01 = N0.getOperand(1);
51754 SDValue N10 = N1.getOperand(0);
51755 SDValue N11 = N1.getOperand(1);
51756
51757 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
51758 // Canonicalize zero_extend to LHS.
51759 if (N01.getOpcode() == ISD::ZERO_EXTEND)
51760 std::swap(N00, N01);
51761 if (N11.getOpcode() == ISD::ZERO_EXTEND)
51762 std::swap(N10, N11);
51763
51764 // Ensure we have a zero_extend and a sign_extend.
51765 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
51766 N01.getOpcode() != ISD::SIGN_EXTEND ||
51767 N10.getOpcode() != ISD::ZERO_EXTEND ||
51768 N11.getOpcode() != ISD::SIGN_EXTEND)
51769 return SDValue();
51770
51771 // Peek through the extends.
51772 N00 = N00.getOperand(0);
51773 N01 = N01.getOperand(0);
51774 N10 = N10.getOperand(0);
51775 N11 = N11.getOperand(0);
51776
51777 // Ensure the extend is from vXi8.
51778 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
51779 N01.getValueType().getVectorElementType() != MVT::i8 ||
51780 N10.getValueType().getVectorElementType() != MVT::i8 ||
51781 N11.getValueType().getVectorElementType() != MVT::i8)
51782 return SDValue();
51783
51784 // All inputs should be build_vectors.
51785 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
51786 N01.getOpcode() != ISD::BUILD_VECTOR ||
51787 N10.getOpcode() != ISD::BUILD_VECTOR ||
51788 N11.getOpcode() != ISD::BUILD_VECTOR)
51789 return SDValue();
51790
51791 // N00/N10 are zero extended. N01/N11 are sign extended.
51792
51793 // For each element, we need to ensure we have an odd element from one vector
51794 // multiplied by the odd element of another vector and the even element from
51795 // one of the same vectors being multiplied by the even element from the
51796 // other vector. So we need to make sure for each element i, this operator
51797 // is being performed:
51798 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
51799 SDValue ZExtIn, SExtIn;
51800 for (unsigned i = 0; i != NumElems; ++i) {
51801 SDValue N00Elt = N00.getOperand(i);
51802 SDValue N01Elt = N01.getOperand(i);
51803 SDValue N10Elt = N10.getOperand(i);
51804 SDValue N11Elt = N11.getOperand(i);
51805 // TODO: Be more tolerant to undefs.
51806 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51807 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51808 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51809 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
51810 return SDValue();
51811 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
51812 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
51813 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
51814 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
51815 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
51816 return SDValue();
51817 unsigned IdxN00 = ConstN00Elt->getZExtValue();
51818 unsigned IdxN01 = ConstN01Elt->getZExtValue();
51819 unsigned IdxN10 = ConstN10Elt->getZExtValue();
51820 unsigned IdxN11 = ConstN11Elt->getZExtValue();
51821 // Add is commutative so indices can be reordered.
51822 if (IdxN00 > IdxN10) {
51823 std::swap(IdxN00, IdxN10);
51824 std::swap(IdxN01, IdxN11);
51825 }
51826 // N0 indices must be the even element. N1 indices must be the next odd element.
51827 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
51828 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
51829 return SDValue();
51830 SDValue N00In = N00Elt.getOperand(0);
51831 SDValue N01In = N01Elt.getOperand(0);
51832 SDValue N10In = N10Elt.getOperand(0);
51833 SDValue N11In = N11Elt.getOperand(0);
51834 // The first time we find an input, capture it.
51835 if (!ZExtIn) {
51836 ZExtIn = N00In;
51837 SExtIn = N01In;
51838 }
51839 if (ZExtIn != N00In || SExtIn != N01In ||
51840 ZExtIn != N10In || SExtIn != N11In)
51841 return SDValue();
51842 }
51843
51844 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
51845 EVT ExtVT = Ext.getValueType();
51846 if (ExtVT.getVectorNumElements() != NumElems * 2) {
51847 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
51848 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
51849 DAG.getIntPtrConstant(0, DL));
51850 }
51851 };
51852 ExtractVec(ZExtIn);
51853 ExtractVec(SExtIn);
51854
51855 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
51856 ArrayRef<SDValue> Ops) {
51857 // Shrink by adding truncate nodes and let DAGCombine fold with the
51858 // sources.
51859 EVT InVT = Ops[0].getValueType();
51860 assert(InVT.getScalarType() == MVT::i8 &&
51861 "Unexpected scalar element type");
51862 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
51863 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51864 InVT.getVectorNumElements() / 2);
51865 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
51866 };
51867 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
51868 PMADDBuilder);
51869}
51870
51871 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
51872 const X86Subtarget &Subtarget) {
51873 EVT VT = N->getValueType(0);
51874 SDValue Src = N->getOperand(0);
51875 SDLoc DL(N);
51876
51877 // Attempt to pre-truncate inputs to arithmetic ops instead.
51878 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
51879 return V;
51880
51881 // Try to detect AVG pattern first.
51882 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
51883 return Avg;
51884
51885 // Try to detect PMADD
51886 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
51887 return PMAdd;
51888
51889 // Try to combine truncation with signed/unsigned saturation.
51890 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
51891 return Val;
51892
51893 // Try to combine PMULHUW/PMULHW for vXi16.
51894 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
51895 return V;
51896
51897 // The bitcast source is a direct mmx result.
51898 // Detect bitcasts from x86mmx to i32.
51899 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
51900 SDValue BCSrc = Src.getOperand(0);
51901 if (BCSrc.getValueType() == MVT::x86mmx)
51902 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
51903 }
51904
51905 return SDValue();
51906}
51907
51908 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
51909 TargetLowering::DAGCombinerInfo &DCI) {
51910 EVT VT = N->getValueType(0);
51911 SDValue In = N->getOperand(0);
51912 SDLoc DL(N);
51913
51914 if (SDValue SSatVal = detectSSatPattern(In, VT))
51915 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
51916 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
51917 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
51918
51919 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51920 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
51921 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
51922 return SDValue(N, 0);
51923
51924 return SDValue();
51925}
51926
51927/// Returns the negated value if the node \p N flips sign of FP value.
51928///
51929/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
51930/// or FSUB(0, x)
51931/// AVX512F does not have FXOR, so FNEG is lowered as
51932/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
51933 /// In this case we go through all bitcasts.
51934/// This also recognizes splat of a negated value and returns the splat of that
51935/// value.
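// For reference (annotation, not part of the original source), all of the
// recognized forms compute -x; e.g. for f32:
//   (fneg x), (fsub -0.0, x), (bitcast (xor (bitcast x), 0x80000000))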
51936static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
51937 if (N->getOpcode() == ISD::FNEG)
51938 return N->getOperand(0);
51939
51940 // Don't recurse exponentially.
51941 if (Depth > SelectionDAG::MaxRecursionDepth)
51942 return SDValue();
51943
51944 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
51945
51946 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
51947 EVT VT = Op->getValueType(0);
51948
51949 // Make sure the element size doesn't change.
51950 if (VT.getScalarSizeInBits() != ScalarSize)
51951 return SDValue();
51952
51953 unsigned Opc = Op.getOpcode();
51954 switch (Opc) {
51955 case ISD::VECTOR_SHUFFLE: {
51956 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
51957 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
51958 if (!Op.getOperand(1).isUndef())
51959 return SDValue();
51960 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
51961 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
51962 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
51963 cast<ShuffleVectorSDNode>(Op)->getMask());
51964 break;
51965 }
51966 case ISD::INSERT_VECTOR_ELT: {
51967 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
51968 // -V, INDEX).
51969 SDValue InsVector = Op.getOperand(0);
51970 SDValue InsVal = Op.getOperand(1);
51971 if (!InsVector.isUndef())
51972 return SDValue();
51973 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
51974 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
51975 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
51976 NegInsVal, Op.getOperand(2));
51977 break;
51978 }
51979 case ISD::FSUB:
51980 case ISD::XOR:
51981 case X86ISD::FXOR: {
51982 SDValue Op1 = Op.getOperand(1);
51983 SDValue Op0 = Op.getOperand(0);
51984
51985 // For XOR and FXOR, we want to check if constant
51986 // bits of Op1 are sign bit masks. For FSUB, we
51987 // have to check if constant bits of Op0 are sign
51988 // bit masks and hence we swap the operands.
51989 if (Opc == ISD::FSUB)
51990 std::swap(Op0, Op1);
51991
51992 APInt UndefElts;
51993 SmallVector<APInt, 16> EltBits;
51994 // Extract constant bits and see if they are all
51995 // sign bit masks. Ignore the undef elements.
51996 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
51997 /* AllowWholeUndefs */ true,
51998 /* AllowPartialUndefs */ false)) {
51999 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
52000 if (!UndefElts[I] && !EltBits[I].isSignMask())
52001 return SDValue();
52002
52003 // Only allow bitcast from correctly-sized constant.
52004 Op0 = peekThroughBitcasts(Op0);
52005 if (Op0.getScalarValueSizeInBits() == ScalarSize)
52006 return Op0;
52007 }
52008 break;
52009 } // case
52010 } // switch
52011
52012 return SDValue();
52013}
52014
52015static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
52016 bool NegRes) {
52017 if (NegMul) {
52018 switch (Opcode) {
52019 // clang-format off
52020 default: llvm_unreachable("Unexpected opcode");
52021 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
52022 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
52023 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
52024 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
52025 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
52026 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
52027 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
52028 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
52029 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
52030 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
52031 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
52032 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
52033 // clang-format on
52034 }
52035 }
52036
52037 if (NegAcc) {
52038 switch (Opcode) {
52039 // clang-format off
52040 default: llvm_unreachable("Unexpected opcode");
52041 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
52042 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
52043 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
52044 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
52045 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
52046 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
52047 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
52048 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
52049 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
52050 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
52051 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
52052 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
52053 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
52054 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
52055 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
52056 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
52057 // clang-format on
52058 }
52059 }
52060
52061 if (NegRes) {
52062 switch (Opcode) {
52063 // For accuracy reasons, we never combine fneg and fma under strict FP.
52064 // clang-format off
52065 default: llvm_unreachable("Unexpected opcode");
52066 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
52067 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
52068 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
52069 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
52070 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
52071 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
52072 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
52073 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
52074 // clang-format on
52075 }
52076 }
52077
52078 return Opcode;
52079}
52080
52081/// Do target-specific dag combines on floating point negations.
52082 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
52083 TargetLowering::DAGCombinerInfo &DCI,
52084 const X86Subtarget &Subtarget) {
52085 EVT OrigVT = N->getValueType(0);
52086 SDValue Arg = isFNEG(DAG, N);
52087 if (!Arg)
52088 return SDValue();
52089
52090 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52091 EVT VT = Arg.getValueType();
52092 EVT SVT = VT.getScalarType();
52093 SDLoc DL(N);
52094
52095 // Let legalize expand this if it isn't a legal type yet.
52096 if (!TLI.isTypeLegal(VT))
52097 return SDValue();
52098
52099 // If we're negating a FMUL node on a target with FMA, then we can avoid the
52100 // use of a constant by performing (-0 - A*B) instead.
52101 // FIXME: Check rounding control flags as well once it becomes available.
52102 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
52103 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
52104 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
52105 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
52106 Arg.getOperand(1), Zero);
52107 return DAG.getBitcast(OrigVT, NewNode);
52108 }
52109
52110 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
52111 bool LegalOperations = !DCI.isBeforeLegalizeOps();
52112 if (SDValue NegArg =
52113 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
52114 return DAG.getBitcast(OrigVT, NegArg);
52115
52116 return SDValue();
52117}
52118
52119 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
52120 bool LegalOperations,
52121 bool ForCodeSize,
52122 NegatibleCost &Cost,
52123 unsigned Depth) const {
52124 // fneg patterns are removable even if they have multiple uses.
52125 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
52126 Cost = NegatibleCost::Cheaper;
52127 return DAG.getBitcast(Op.getValueType(), Arg);
52128 }
52129
52130 EVT VT = Op.getValueType();
52131 EVT SVT = VT.getScalarType();
52132 unsigned Opc = Op.getOpcode();
52133 SDNodeFlags Flags = Op.getNode()->getFlags();
52134 switch (Opc) {
52135 case ISD::FMA:
52136 case X86ISD::FMSUB:
52137 case X86ISD::FNMADD:
52138 case X86ISD::FNMSUB:
52139 case X86ISD::FMADD_RND:
52140 case X86ISD::FMSUB_RND:
52141 case X86ISD::FNMADD_RND:
52142 case X86ISD::FNMSUB_RND: {
52143 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
52144 !(SVT == MVT::f32 || SVT == MVT::f64) ||
52145 !isOperationLegal(ISD::FMA, VT))
52146 break;
52147
52148 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
52149 // if it may have signed zeros.
52150 if (!Flags.hasNoSignedZeros())
52151 break;
52152
52153 // This is always negatible for free but we might be able to remove some
52154 // extra operand negations as well.
52155 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
52156 for (int i = 0; i != 3; ++i)
52157 NewOps[i] = getCheaperNegatedExpression(
52158 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
52159
52160 bool NegA = !!NewOps[0];
52161 bool NegB = !!NewOps[1];
52162 bool NegC = !!NewOps[2];
52163 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
52164
52165 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
52166 : NegatibleCost::Neutral;
52167
52168 // Fill in the non-negated ops with the original values.
52169 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
52170 if (!NewOps[i])
52171 NewOps[i] = Op.getOperand(i);
52172 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
52173 }
52174 case X86ISD::FRCP:
52175 if (SDValue NegOp0 =
52176 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
52177 ForCodeSize, Cost, Depth + 1))
52178 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
52179 break;
52180 }
52181
52182 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
52183 ForCodeSize, Cost, Depth);
52184}
52185
52186 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
52187 const X86Subtarget &Subtarget) {
52188 MVT VT = N->getSimpleValueType(0);
52189 // If we have integer vector types available, use the integer opcodes.
52190 if (!VT.isVector() || !Subtarget.hasSSE2())
52191 return SDValue();
52192
52193 SDLoc dl(N);
52194
52195 unsigned IntBits = VT.getScalarSizeInBits();
52196 MVT IntSVT = MVT::getIntegerVT(IntBits);
52197 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
52198
52199 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
52200 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
52201 unsigned IntOpcode;
52202 switch (N->getOpcode()) {
52203 // clang-format off
52204 default: llvm_unreachable("Unexpected FP logic op");
52205 case X86ISD::FOR: IntOpcode = ISD::OR; break;
52206 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
52207 case X86ISD::FAND: IntOpcode = ISD::AND; break;
52208 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
52209 // clang-format on
52210 }
52211 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
52212 return DAG.getBitcast(VT, IntOp);
52213}
52214
52215
52216/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
52217 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
52218 if (N->getOpcode() != ISD::XOR)
52219 return SDValue();
52220
52221 SDValue LHS = N->getOperand(0);
52222 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
52223 return SDValue();
52224
52225 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
52226 X86::CondCode(LHS->getConstantOperandVal(0)));
52227 SDLoc DL(N);
52228 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
52229}
52230
52231 static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
52232 const X86Subtarget &Subtarget) {
52233 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
52234 "Invalid opcode for combing with CTLZ");
52235 if (Subtarget.hasFastLZCNT())
52236 return SDValue();
52237
52238 EVT VT = N->getValueType(0);
52239 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
52240 (VT != MVT::i64 || !Subtarget.is64Bit()))
52241 return SDValue();
52242
52243 SDValue N0 = N->getOperand(0);
52244 SDValue N1 = N->getOperand(1);
52245
52246 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
52247 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
52248 return SDValue();
52249
52250 SDValue OpCTLZ;
52251 SDValue OpSizeTM1;
52252
52253 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
52254 OpCTLZ = N1;
52255 OpSizeTM1 = N0;
52256 } else if (N->getOpcode() == ISD::SUB) {
52257 return SDValue();
52258 } else {
52259 OpCTLZ = N0;
52260 OpSizeTM1 = N1;
52261 }
52262
52263 if (!OpCTLZ.hasOneUse())
52264 return SDValue();
52265 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
52266 if (!C)
52267 return SDValue();
52268
52269 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
52270 return SDValue();
52271 SDLoc DL(N);
52272 EVT OpVT = VT;
52273 SDValue Op = OpCTLZ.getOperand(0);
52274 if (VT == MVT::i8) {
52275 // Zero extend to i32 since there is no i8 bsr.
52276 OpVT = MVT::i32;
52277 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
52278 }
52279
52280 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
52281 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
52282 if (VT == MVT::i8)
52283 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
52284
52285 return Op;
52286}
52287
52288 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
52289 TargetLowering::DAGCombinerInfo &DCI,
52290 const X86Subtarget &Subtarget) {
52291 SDValue N0 = N->getOperand(0);
52292 SDValue N1 = N->getOperand(1);
52293 EVT VT = N->getValueType(0);
52294
52295 // If this is SSE1 only, convert to FXOR to avoid scalarization.
52296 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52297 return DAG.getBitcast(MVT::v4i32,
52298 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
52299 DAG.getBitcast(MVT::v4f32, N0),
52300 DAG.getBitcast(MVT::v4f32, N1)));
52301 }
52302
52303 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
52304 return Cmp;
52305
52306 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
52307 return R;
52308
52309 if (SDValue R = combineBitOpWithShift(N, DAG))
52310 return R;
52311
52312 if (SDValue R = combineBitOpWithPACK(N, DAG))
52313 return R;
52314
52315 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
52316 return FPLogic;
52317
52318 if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
52319 return R;
52320
52321 if (DCI.isBeforeLegalizeOps())
52322 return SDValue();
52323
52324 if (SDValue SetCC = foldXor1SetCC(N, DAG))
52325 return SetCC;
52326
52327 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
52328 return R;
52329
52330 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
52331 return RV;
52332
52333 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
52334 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52335 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
52336 N0.getOperand(0).getValueType().isVector() &&
52337 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
52338 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
52339 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
52340 N0.getOperand(0).getValueType()));
52341 }
52342
52343 // Handle AVX512 mask widening.
52344 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
52345 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
52346 VT.getVectorElementType() == MVT::i1 &&
52347 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
52348 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
52349 return DAG.getNode(
52350 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
52351 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
52352 N0.getOperand(2));
52353 }
52354
52355 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
52356 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
52357 // TODO: Under what circumstances could this be performed in DAGCombine?
52358 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
52359 N0.getOperand(0).getOpcode() == N->getOpcode()) {
52360 SDValue TruncExtSrc = N0.getOperand(0);
52361 auto *N1C = dyn_cast<ConstantSDNode>(N1);
52362 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
52363 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
52364 SDLoc DL(N);
52365 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
52366 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
52367 return DAG.getNode(ISD::XOR, DL, VT, LHS,
52368 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
52369 }
52370 }
52371
52372 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52373 return R;
52374
52375 return combineFneg(N, DAG, DCI, Subtarget);
52376}
52377
52378 static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
52379 TargetLowering::DAGCombinerInfo &DCI,
52380 const X86Subtarget &Subtarget) {
52381 SDValue N0 = N->getOperand(0);
52382 EVT VT = N->getValueType(0);
52383
52384 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
52385 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
52386 SDValue Src = N0.getOperand(0);
52387 EVT SrcVT = Src.getValueType();
52388 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
52389 (DCI.isBeforeLegalize() ||
52390 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
52391 Subtarget.hasSSSE3()) {
52392 unsigned NumElts = SrcVT.getVectorNumElements();
52393 SmallVector<int, 32> ReverseMask(NumElts);
52394 for (unsigned I = 0; I != NumElts; ++I)
52395 ReverseMask[I] = (NumElts - 1) - I;
52396 SDValue Rev =
52397 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
52398 return DAG.getBitcast(VT, Rev);
52399 }
52400 }
52401
52402 return SDValue();
52403}
52404
52405 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
52406 TargetLowering::DAGCombinerInfo &DCI,
52407 const X86Subtarget &Subtarget) {
52408 EVT VT = N->getValueType(0);
52409 unsigned NumBits = VT.getSizeInBits();
52410
52411 // TODO - Constant Folding.
52412
52413 // Simplify the inputs.
52414 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52415 APInt DemandedMask(APInt::getAllOnes(NumBits));
52416 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52417 return SDValue(N, 0);
52418
52419 return SDValue();
52420}
52421
52422static bool isNullFPScalarOrVectorConst(SDValue V) {
52423 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
52424}
52425
52426/// If a value is a scalar FP zero or a vector FP zero (potentially including
52427/// undefined elements), return a zero constant that may be used to fold away
52428/// that value. In the case of a vector, the returned constant will not contain
52429/// undefined elements even if the input parameter does. This makes it suitable
52430/// to be used as a replacement operand with operations (e.g., bitwise-and) where
52431/// an undef should not propagate.
52432static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
52433 const X86Subtarget &Subtarget) {
52434   if (!isNullFPScalarOrVectorConst(V))
52435 return SDValue();
52436
52437 if (V.getValueType().isVector())
52438 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
52439
52440 return V;
52441}
52442
52443static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
52444 const X86Subtarget &Subtarget) {
52445 SDValue N0 = N->getOperand(0);
52446 SDValue N1 = N->getOperand(1);
52447 EVT VT = N->getValueType(0);
52448 SDLoc DL(N);
52449
52450 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
52451 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
52452 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
52453 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
52454 return SDValue();
52455
52456 auto isAllOnesConstantFP = [](SDValue V) {
52457 if (V.getSimpleValueType().isVector())
52458 return ISD::isBuildVectorAllOnes(V.getNode());
52459 auto *C = dyn_cast<ConstantFPSDNode>(V);
52460 return C && C->getConstantFPValue()->isAllOnesValue();
52461 };
52462
52463 // fand (fxor X, -1), Y --> fandn X, Y
52464 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
52465 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
52466
52467 // fand X, (fxor Y, -1) --> fandn Y, X
52468 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
52469 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
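  // e.g. (fand (fxor %x, <all-ones bits>), %y) --> (fandn %x, %y), which on
  // SSE typically selects to a single ANDNPS/ANDNPD rather than XORPS+ANDPS.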
52470
52471 return SDValue();
52472}
52473
52474/// Do target-specific dag combines on X86ISD::FAND nodes.
52475static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
52476 const X86Subtarget &Subtarget) {
52477 // FAND(0.0, x) -> 0.0
52478 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
52479 return V;
52480
52481 // FAND(x, 0.0) -> 0.0
52482 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
52483 return V;
52484
52485 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
52486 return V;
52487
52488 return lowerX86FPLogicOp(N, DAG, Subtarget);
52489}
52490
52491/// Do target-specific dag combines on X86ISD::FANDN nodes.
52492static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
52493 const X86Subtarget &Subtarget) {
52494 // FANDN(0.0, x) -> x
52495 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
52496 return N->getOperand(1);
52497
52498 // FANDN(x, 0.0) -> 0.0
52499 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
52500 return V;
52501
52502 return lowerX86FPLogicOp(N, DAG, Subtarget);
52503}
52504
52505/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
52506static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
52507                          TargetLowering::DAGCombinerInfo &DCI,
52508 const X86Subtarget &Subtarget) {
52509 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
52510
52511 // F[X]OR(0.0, x) -> x
52512 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
52513 return N->getOperand(1);
52514
52515 // F[X]OR(x, 0.0) -> x
52516 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
52517 return N->getOperand(0);
52518
52519 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
52520 return NewVal;
52521
52522 return lowerX86FPLogicOp(N, DAG, Subtarget);
52523}
52524
52525/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
52526static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
52527 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
52528
52529 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
52530 if (!DAG.getTarget().Options.NoNaNsFPMath ||
52531       !DAG.getTarget().Options.NoSignedZerosFPMath)
52532 return SDValue();
52533
52534 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
52535   // into FMINC and FMAXC, which are commutative operations.
52536 unsigned NewOp = 0;
52537 switch (N->getOpcode()) {
52538 default: llvm_unreachable("unknown opcode");
52539 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
52540 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
52541 }
52542
52543 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
52544 N->getOperand(0), N->getOperand(1));
52545}
52546
52547static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
52548 const X86Subtarget &Subtarget) {
52549 EVT VT = N->getValueType(0);
52550 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
52551 return SDValue();
52552
52553 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52554
52555 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
52556 (Subtarget.hasSSE2() && VT == MVT::f64) ||
52557 (Subtarget.hasFP16() && VT == MVT::f16) ||
52558 (VT.isVector() && TLI.isTypeLegal(VT))))
52559 return SDValue();
52560
52561 SDValue Op0 = N->getOperand(0);
52562 SDValue Op1 = N->getOperand(1);
52563 SDLoc DL(N);
52564 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
52565
52566 // If we don't have to respect NaN inputs, this is a direct translation to x86
52567 // min/max instructions.
52568 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
52569 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
52570
52571 // If one of the operands is known non-NaN use the native min/max instructions
52572 // with the non-NaN input as second operand.
52573 if (DAG.isKnownNeverNaN(Op1))
52574 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
52575 if (DAG.isKnownNeverNaN(Op0))
52576 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
52577
52578 // If we have to respect NaN inputs, this takes at least 3 instructions.
52579 // Favor a library call when operating on a scalar and minimizing code size.
52580 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
52581 return SDValue();
52582
52583 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
52584 VT);
52585
52586 // There are 4 possibilities involving NaN inputs, and these are the required
52587 // outputs:
52588 // Op1
52589 // Num NaN
52590 // ----------------
52591 // Num | Max | Op0 |
52592 // Op0 ----------------
52593 // NaN | Op1 | NaN |
52594 // ----------------
52595 //
52596 // The SSE FP max/min instructions were not designed for this case, but rather
52597 // to implement:
52598 // Min = Op1 < Op0 ? Op1 : Op0
52599 // Max = Op1 > Op0 ? Op1 : Op0
52600 //
52601 // So they always return Op0 if either input is a NaN. However, we can still
52602 // use those instructions for fmaxnum by selecting away a NaN input.
52603
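  // e.g. for fmaxnum: if Op0 is NaN and Op1 is 1.0, IsOp0Nan selects Op1 (1.0);
  // if Op0 is 1.0 and Op1 is NaN, FMAX(Op1, Op0) passes Op0 (1.0) through.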
52604 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
52605 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
52606 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
52607
52608 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
52609 // are NaN, the NaN value of Op1 is the result.
52610 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
52611}
52612
52613static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
52614                                   TargetLowering::DAGCombinerInfo &DCI) {
52615 EVT VT = N->getValueType(0);
52616 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52617
52618 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
52619 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
52620 return SDValue(N, 0);
52621
52622 // Convert a full vector load into vzload when not all bits are needed.
52623 SDValue In = N->getOperand(0);
52624 MVT InVT = In.getSimpleValueType();
52625 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
52626 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
52627 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
52628 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
52629 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
52630 MVT MemVT = MVT::getIntegerVT(NumBits);
52631 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
52632 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
52633 SDLoc dl(N);
52634 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
52635 DAG.getBitcast(InVT, VZLoad));
52636 DCI.CombineTo(N, Convert);
52637 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52638       DCI.recursivelyDeleteUnusedNodes(LN);
52639 return SDValue(N, 0);
52640 }
52641 }
52642
52643 return SDValue();
52644}
52645
52646static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
52647                                     TargetLowering::DAGCombinerInfo &DCI) {
52648 bool IsStrict = N->isTargetStrictFPOpcode();
52649 EVT VT = N->getValueType(0);
52650
52651 // Convert a full vector load into vzload when not all bits are needed.
52652 SDValue In = N->getOperand(IsStrict ? 1 : 0);
52653 MVT InVT = In.getSimpleValueType();
52654 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
52655 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
52656 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
52657 LoadSDNode *LN = cast<LoadSDNode>(In);
52658 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
52659 MVT MemVT = MVT::getFloatingPointVT(NumBits);
52660 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
52661 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
52662 SDLoc dl(N);
52663 if (IsStrict) {
52664 SDValue Convert =
52665 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
52666 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
52667 DCI.CombineTo(N, Convert, Convert.getValue(1));
52668 } else {
52669 SDValue Convert =
52670 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
52671 DCI.CombineTo(N, Convert);
52672 }
52673 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52674       DCI.recursivelyDeleteUnusedNodes(LN);
52675 return SDValue(N, 0);
52676 }
52677 }
52678
52679 return SDValue();
52680}
52681
52682/// Do target-specific dag combines on X86ISD::ANDNP nodes.
52683static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
52684                            TargetLowering::DAGCombinerInfo &DCI,
52685 const X86Subtarget &Subtarget) {
52686 SDValue N0 = N->getOperand(0);
52687 SDValue N1 = N->getOperand(1);
52688 MVT VT = N->getSimpleValueType(0);
52689 int NumElts = VT.getVectorNumElements();
52690 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52691 SDLoc DL(N);
52692
52693 // ANDNP(undef, x) -> 0
52694 // ANDNP(x, undef) -> 0
52695 if (N0.isUndef() || N1.isUndef())
52696 return DAG.getConstant(0, DL, VT);
52697
52698 // ANDNP(0, x) -> x
52699   if (ISD::isBuildVectorAllZeros(N0.getNode()))
52700 return N1;
52701
52702 // ANDNP(x, 0) -> 0
52703   if (ISD::isBuildVectorAllZeros(N1.getNode()))
52704 return DAG.getConstant(0, DL, VT);
52705
52706 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
52707   if (ISD::isBuildVectorAllOnes(N1.getNode()))
52708 return DAG.getNOT(DL, N0, VT);
52709
52710 // Turn ANDNP back to AND if input is inverted.
52711 if (SDValue Not = IsNOT(N0, DAG))
52712 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
52713
52714 // Fold for better commutativity:
52715 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
52716 if (N1->hasOneUse())
52717 if (SDValue Not = IsNOT(N1, DAG))
52718 return DAG.getNOT(
52719 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
52720
52721 // Constant Folding
52722 APInt Undefs0, Undefs1;
52723 SmallVector<APInt> EltBits0, EltBits1;
52724 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
52725 /*AllowWholeUndefs*/ true,
52726 /*AllowPartialUndefs*/ true)) {
52727 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
52728 /*AllowWholeUndefs*/ true,
52729 /*AllowPartialUndefs*/ true)) {
52730 SmallVector<APInt> ResultBits;
52731 for (int I = 0; I != NumElts; ++I)
52732 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
52733 return getConstVector(ResultBits, VT, DAG, DL);
52734 }
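    // e.g. per element: ~0b1100 & 0b1010 constant folds to 0b0010.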
52735
52736 // Constant fold NOT(N0) to allow us to use AND.
52737 // Ensure this is only performed if we can confirm that the bitcasted source
52738     // has one use to prevent an infinite loop with canonicalizeBitSelect.
52739 if (N0->hasOneUse()) {
52740       SDValue BC0 = peekThroughOneUseBitcasts(N0);
52741 if (BC0.getOpcode() != ISD::BITCAST) {
52742 for (APInt &Elt : EltBits0)
52743 Elt = ~Elt;
52744 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
52745 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
52746 }
52747 }
52748 }
52749
52750 // Attempt to recursively combine a bitmask ANDNP with shuffles.
52751 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52752 SDValue Op(N, 0);
52753 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52754 return Res;
52755
52756 // If either operand is a constant mask, then only the elements that aren't
52757 // zero are actually demanded by the other operand.
52758 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
52759 APInt UndefElts;
52760 SmallVector<APInt> EltBits;
52761 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
52762 APInt DemandedElts = APInt::getAllOnes(NumElts);
52763 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
52764 EltBits)) {
52765 DemandedBits.clearAllBits();
52766 DemandedElts.clearAllBits();
52767 for (int I = 0; I != NumElts; ++I) {
52768 if (UndefElts[I]) {
52769 // We can't assume an undef src element gives an undef dst - the
52770 // other src might be zero.
52771 DemandedBits.setAllBits();
52772 DemandedElts.setBit(I);
52773 } else if ((Invert && !EltBits[I].isAllOnes()) ||
52774 (!Invert && !EltBits[I].isZero())) {
52775 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
52776 DemandedElts.setBit(I);
52777 }
52778 }
52779 }
52780 return std::make_pair(DemandedBits, DemandedElts);
52781 };
52782 APInt Bits0, Elts0;
52783 APInt Bits1, Elts1;
52784 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
52785 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
52786
52787 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52788 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
52789 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
52790 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
52791 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52792 if (N->getOpcode() != ISD::DELETED_NODE)
52793 DCI.AddToWorklist(N);
52794 return SDValue(N, 0);
52795 }
52796 }
52797
52798 return SDValue();
52799}
52800
52801static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
52802                         TargetLowering::DAGCombinerInfo &DCI) {
52803 SDValue N1 = N->getOperand(1);
52804
52805 // BT ignores high bits in the bit index operand.
52806 unsigned BitWidth = N1.getValueSizeInBits();
52807   APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
52808 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
52809 if (N->getOpcode() != ISD::DELETED_NODE)
52810 DCI.AddToWorklist(N);
52811 return SDValue(N, 0);
52812 }
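  // e.g. for a 64-bit BT only the low 6 bits of the bit index are demanded, so
  // an explicit (and %idx, 63) feeding the index operand can be simplified away.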
52813
52814 return SDValue();
52815}
52816
52817static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
52818                               TargetLowering::DAGCombinerInfo &DCI) {
52819 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
52820 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
52821
52822 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
52823 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52824 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
52825 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
52826 if (N->getOpcode() != ISD::DELETED_NODE)
52827 DCI.AddToWorklist(N);
52828 return SDValue(N, 0);
52829 }
52830
52831 // Convert a full vector load into vzload when not all bits are needed.
52832 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
52833 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
52834 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
52835 SDLoc dl(N);
52836 if (IsStrict) {
52837 SDValue Convert = DAG.getNode(
52838 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
52839 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
52840 DCI.CombineTo(N, Convert, Convert.getValue(1));
52841 } else {
52842 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
52843 DAG.getBitcast(MVT::v8i16, VZLoad));
52844 DCI.CombineTo(N, Convert);
52845 }
52846
52847 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52848         DCI.recursivelyDeleteUnusedNodes(LN);
52849 return SDValue(N, 0);
52850 }
52851 }
52852 }
52853
52854 return SDValue();
52855}
52856
52857// Try to combine sext_in_reg of a cmov of constants by extending the constants.
52858static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
52859 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
52860
52861 EVT DstVT = N->getValueType(0);
52862
52863 SDValue N0 = N->getOperand(0);
52864 SDValue N1 = N->getOperand(1);
52865 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
52866
52867 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
52868 return SDValue();
52869
52870 // Look through single use any_extends / truncs.
52871 SDValue IntermediateBitwidthOp;
52872 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
52873 N0.hasOneUse()) {
52874 IntermediateBitwidthOp = N0;
52875 N0 = N0.getOperand(0);
52876 }
52877
52878 // See if we have a single use cmov.
52879 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
52880 return SDValue();
52881
52882 SDValue CMovOp0 = N0.getOperand(0);
52883 SDValue CMovOp1 = N0.getOperand(1);
52884
52885 // Make sure both operands are constants.
52886 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
52887 !isa<ConstantSDNode>(CMovOp1.getNode()))
52888 return SDValue();
52889
52890 SDLoc DL(N);
52891
52892 // If we looked through an any_extend/trunc above, add one to the constants.
52893 if (IntermediateBitwidthOp) {
52894 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
52895 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
52896 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
52897 }
52898
52899 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
52900 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
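  // e.g. for an i8 sext_in_reg of (cmov 0x7f, 0x80), the constants
  // constant-fold to 0x0000007f and 0xffffff80 and the explicit extension
  // disappears.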
52901
52902 EVT CMovVT = DstVT;
52903 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
52904 if (DstVT == MVT::i16) {
52905 CMovVT = MVT::i32;
52906 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
52907 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
52908 }
52909
52910 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
52911 N0.getOperand(2), N0.getOperand(3));
52912
52913 if (CMovVT != DstVT)
52914 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
52915
52916 return CMov;
52917}
52918
52919static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
52920 const X86Subtarget &Subtarget) {
52921 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
52922
52923 if (SDValue V = combineSextInRegCmov(N, DAG))
52924 return V;
52925
52926 EVT VT = N->getValueType(0);
52927 SDValue N0 = N->getOperand(0);
52928 SDValue N1 = N->getOperand(1);
52929 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
52930 SDLoc dl(N);
52931
52932   // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
52933   // SSE and AVX2 since there is no sign-extended shift right
52934   // operation on a vector with 64-bit elements.
52935 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
52936 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
52937 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
52938 N0.getOpcode() == ISD::SIGN_EXTEND)) {
52939 SDValue N00 = N0.getOperand(0);
52940
52941 // EXTLOAD has a better solution on AVX2,
52942 // it may be replaced with X86ISD::VSEXT node.
52943 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
52944 if (!ISD::isNormalLoad(N00.getNode()))
52945 return SDValue();
52946
52947 // Attempt to promote any comparison mask ops before moving the
52948 // SIGN_EXTEND_INREG in the way.
52949 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
52950 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
52951
52952 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
52953 SDValue Tmp =
52954 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
52955 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
52956 }
52957 }
52958 return SDValue();
52959}
52960
52961/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
52962/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
52963/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
52964/// opportunities to combine math ops, use an LEA, or use a complex addressing
52965/// mode. This can eliminate extend, add, and shift instructions.
52966static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
52967 const X86Subtarget &Subtarget) {
52968 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
52969 Ext->getOpcode() != ISD::ZERO_EXTEND)
52970 return SDValue();
52971
52972 // TODO: This should be valid for other integer types.
52973 EVT VT = Ext->getValueType(0);
52974 if (VT != MVT::i64)
52975 return SDValue();
52976
52977 SDValue Add = Ext->getOperand(0);
52978 if (Add.getOpcode() != ISD::ADD)
52979 return SDValue();
52980
52981 SDValue AddOp0 = Add.getOperand(0);
52982 SDValue AddOp1 = Add.getOperand(1);
52983 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
52984 bool NSW = Add->getFlags().hasNoSignedWrap();
52985 bool NUW = Add->getFlags().hasNoUnsignedWrap();
52986 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
52987 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
52988
52989 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
52990 // into the 'zext'
52991 if ((Sext && !NSW) || (!Sext && !NUW))
52992 return SDValue();
52993
52994 // Having a constant operand to the 'add' ensures that we are not increasing
52995 // the instruction count because the constant is extended for free below.
52996 // A constant operand can also become the displacement field of an LEA.
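  // e.g. (zext (add nuw i32 %x, 20)) used by a shift or add can be rewritten
  // as (add nuw (zext %x), 20), where the 20 can become an LEA displacement.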
52997 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
52998 if (!AddOp1C)
52999 return SDValue();
53000
53001 // Don't make the 'add' bigger if there's no hope of combining it with some
53002 // other 'add' or 'shl' instruction.
53003 // TODO: It may be profitable to generate simpler LEA instructions in place
53004 // of single 'add' instructions, but the cost model for selecting an LEA
53005 // currently has a high threshold.
53006 bool HasLEAPotential = false;
53007 for (auto *User : Ext->uses()) {
53008 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
53009 HasLEAPotential = true;
53010 break;
53011 }
53012 }
53013 if (!HasLEAPotential)
53014 return SDValue();
53015
53016 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
53017 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
53018 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
53019 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
53020
53021 // The wider add is guaranteed to not wrap because both operands are
53022 // sign-extended.
53023 SDNodeFlags Flags;
53024 Flags.setNoSignedWrap(NSW);
53025 Flags.setNoUnsignedWrap(NUW);
53026 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
53027}
53028
53029// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
53030// operands and the result of CMOV is not used anywhere else - promote CMOV
53031// itself instead of promoting its result. This could be beneficial, because:
53032// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
53033// (or more) pseudo-CMOVs only when they go one-after-another and
53034// getting rid of result extension code after CMOV will help that.
53035// 2) Promotion of constant CMOV arguments is free, hence the
53036// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
53037// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
53038//    promotion is also good in terms of code-size.
53039//    (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
53040// promotion).
53041static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
53042 SDValue CMovN = Extend->getOperand(0);
53043 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
53044 return SDValue();
53045
53046 EVT TargetVT = Extend->getValueType(0);
53047 unsigned ExtendOpcode = Extend->getOpcode();
53048 SDLoc DL(Extend);
53049
53050 EVT VT = CMovN.getValueType();
53051 SDValue CMovOp0 = CMovN.getOperand(0);
53052 SDValue CMovOp1 = CMovN.getOperand(1);
53053
53054 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
53055 !isa<ConstantSDNode>(CMovOp1.getNode()))
53056 return SDValue();
53057
53058 // Only extend to i32 or i64.
53059 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
53060 return SDValue();
53061
53062   // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
53063 // are free.
53064 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
53065 return SDValue();
53066
53067   // If this is a zero extend to i64, we should only extend to i32 and use a free
53068 // zero extend to finish.
53069 EVT ExtendVT = TargetVT;
53070 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
53071 ExtendVT = MVT::i32;
53072
53073 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
53074 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
53075
53076 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
53077 CMovN.getOperand(2), CMovN.getOperand(3));
53078
53079 // Finish extending if needed.
53080 if (ExtendVT != TargetVT)
53081 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
53082
53083 return Res;
53084}
53085
53086// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
53087// result type.
53088static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
53089 const X86Subtarget &Subtarget) {
53090 SDValue N0 = N->getOperand(0);
53091 EVT VT = N->getValueType(0);
53092 SDLoc dl(N);
53093
53094 // Only do this combine with AVX512 for vector extends.
53095 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
53096 return SDValue();
53097
53098 // Only combine legal element types.
53099 EVT SVT = VT.getVectorElementType();
53100 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
53101 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
53102 return SDValue();
53103
53104   // We don't have a CMPP instruction for vXf16.
53105 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
53106 return SDValue();
53107   // We can only do this if the vector size is 256 bits or less.
53108 unsigned Size = VT.getSizeInBits();
53109 if (Size > 256 && Subtarget.useAVX512Regs())
53110 return SDValue();
53111
53112 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
53113   // those are the only integer compares we have.
53114 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
53115   if (ISD::isUnsignedIntSetCC(CC))
53116 return SDValue();
53117
53118 // Only do this combine if the extension will be fully consumed by the setcc.
53119 EVT N00VT = N0.getOperand(0).getValueType();
53120 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
53121 if (Size != MatchingVecType.getSizeInBits())
53122 return SDValue();
53123
53124 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
53125
53126 if (N->getOpcode() == ISD::ZERO_EXTEND)
53127 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
53128
53129 return Res;
53130}
53131
53132static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
53133                           TargetLowering::DAGCombinerInfo &DCI,
53134 const X86Subtarget &Subtarget) {
53135 SDValue N0 = N->getOperand(0);
53136 EVT VT = N->getValueType(0);
53137 SDLoc DL(N);
53138
53139 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53140 if (!DCI.isBeforeLegalizeOps() &&
53141       N0.getOpcode() == X86ISD::SETCC_CARRY) {
53142 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
53143 N0->getOperand(1));
53144 bool ReplaceOtherUses = !N0.hasOneUse();
53145 DCI.CombineTo(N, Setcc);
53146 // Replace other uses with a truncate of the widened setcc_carry.
53147 if (ReplaceOtherUses) {
53148 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
53149 N0.getValueType(), Setcc);
53150 DCI.CombineTo(N0.getNode(), Trunc);
53151 }
53152
53153 return SDValue(N, 0);
53154 }
53155
53156 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
53157 return NewCMov;
53158
53159 if (!DCI.isBeforeLegalizeOps())
53160 return SDValue();
53161
53162 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
53163 return V;
53164
53165 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
53166 DAG, DCI, Subtarget))
53167 return V;
53168
53169 if (VT.isVector()) {
53170 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
53171 return R;
53172
53174 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
53175 }
53176
53177 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
53178 return NewAdd;
53179
53180 return SDValue();
53181}
53182
53183// Inverting a constant vector is profitable if it can be eliminated and the
53184// inverted vector is already present in DAG. Otherwise, it will be loaded
53185// anyway.
53186//
53187// We determine which of the values can be completely eliminated and invert it.
53188// If both are eliminable, select a vector with the first negative element.
53189static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
53190   assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
53191 "ConstantFP build vector expected");
53192   // Check if we can eliminate V. We assume that if a value is only used in
53193   // FMAs, we can eliminate it, since this function is invoked for each FMA
53194   // with this vector.
53195 auto IsNotFMA = [](SDNode *Use) {
53196 return Use->getOpcode() != ISD::FMA && Use->getOpcode() != ISD::STRICT_FMA;
53197 };
53198 if (llvm::any_of(V->uses(), IsNotFMA))
53199 return SDValue();
53200
53201   SmallVector<SDValue, 8> Ops;
53202 EVT VT = V.getValueType();
53203 EVT EltVT = VT.getVectorElementType();
53204 for (const SDValue &Op : V->op_values()) {
53205 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
53206 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
53207 } else {
53208 assert(Op.isUndef());
53209 Ops.push_back(DAG.getUNDEF(EltVT));
53210 }
53211 }
53212
53213 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
53214 if (!NV)
53215 return SDValue();
53216
53217 // If an inverted version cannot be eliminated, choose it instead of the
53218 // original version.
53219 if (llvm::any_of(NV->uses(), IsNotFMA))
53220 return SDValue(NV, 0);
53221
53222 // If the inverted version also can be eliminated, we have to consistently
53223 // prefer one of the values. We prefer a constant with a negative value on
53224 // the first place.
53225 // N.B. We need to skip undefs that may precede a value.
53226 for (const SDValue &Op : V->op_values()) {
53227 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
53228 if (Cst->isNegative())
53229 return SDValue();
53230 break;
53231 }
53232 }
53233 return SDValue(NV, 0);
53234}
53235
53236static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
53237                          TargetLowering::DAGCombinerInfo &DCI,
53238 const X86Subtarget &Subtarget) {
53239 SDLoc dl(N);
53240 EVT VT = N->getValueType(0);
53241 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
53242
53243 // Let legalize expand this if it isn't a legal type yet.
53244 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53245 if (!TLI.isTypeLegal(VT))
53246 return SDValue();
53247
53248 SDValue A = N->getOperand(IsStrict ? 1 : 0);
53249 SDValue B = N->getOperand(IsStrict ? 2 : 1);
53250 SDValue C = N->getOperand(IsStrict ? 3 : 2);
53251
53252 // If the operation allows fast-math and the target does not support FMA,
53253 // split this into mul+add to avoid libcall(s).
53254 SDNodeFlags Flags = N->getFlags();
53255 if (!IsStrict && Flags.hasAllowReassociation() &&
53256 TLI.isOperationExpand(ISD::FMA, VT)) {
53257 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
53258 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
53259 }
53260
53261 EVT ScalarVT = VT.getScalarType();
53262 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
53263 !Subtarget.hasAnyFMA()) &&
53264 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
53265 return SDValue();
53266
53267 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
53268 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53269 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53270 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
53271 CodeSize)) {
53272 V = NegV;
53273 return true;
53274 }
53275 // Look through extract_vector_elts. If it comes from an FNEG, create a
53276 // new extract from the FNEG input.
53277 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
53278 isNullConstant(V.getOperand(1))) {
53279 SDValue Vec = V.getOperand(0);
53280 if (SDValue NegV = TLI.getCheaperNegatedExpression(
53281 Vec, DAG, LegalOperations, CodeSize)) {
53282 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
53283 NegV, V.getOperand(1));
53284 return true;
53285 }
53286 }
53287 // Lookup if there is an inverted version of constant vector V in DAG.
53288 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
53289 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
53290 V = NegV;
53291 return true;
53292 }
53293 }
53294 return false;
53295 };
53296
53297 // Do not convert the passthru input of scalar intrinsics.
53298 // FIXME: We could allow negations of the lower element only.
53299 bool NegA = invertIfNegative(A);
53300 bool NegB = invertIfNegative(B);
53301 bool NegC = invertIfNegative(C);
53302
53303 if (!NegA && !NegB && !NegC)
53304 return SDValue();
53305
53306 unsigned NewOpcode =
53307 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
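  // e.g. if only A was negated, (fma (fneg %a), %b, %c) becomes an
  // X86ISD::FNMADD of (%a, %b, %c), i.e. -(a*b)+c without a separate negate.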
53308
53309 // Propagate fast-math-flags to new FMA node.
53310 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
53311 if (IsStrict) {
53312 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
53313 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
53314 {N->getOperand(0), A, B, C});
53315 } else {
53316 if (N->getNumOperands() == 4)
53317 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
53318 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
53319 }
53320}
53321
53322// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
53323// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
53324static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
53325                               TargetLowering::DAGCombinerInfo &DCI) {
53326 SDLoc dl(N);
53327 EVT VT = N->getValueType(0);
53328 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53329 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53330 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53331
53332 SDValue N2 = N->getOperand(2);
53333
53334 SDValue NegN2 =
53335 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
53336 if (!NegN2)
53337 return SDValue();
53338 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
53339
53340 if (N->getNumOperands() == 4)
53341 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53342 NegN2, N->getOperand(3));
53343 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53344 NegN2);
53345}
53346
53347static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
53348                           TargetLowering::DAGCombinerInfo &DCI,
53349 const X86Subtarget &Subtarget) {
53350 SDLoc dl(N);
53351 SDValue N0 = N->getOperand(0);
53352 EVT VT = N->getValueType(0);
53353
53354 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53355 // FIXME: Is this needed? We don't seem to have any tests for it.
53356 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
53357       N0.getOpcode() == X86ISD::SETCC_CARRY) {
53358 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
53359 N0->getOperand(1));
53360 bool ReplaceOtherUses = !N0.hasOneUse();
53361 DCI.CombineTo(N, Setcc);
53362 // Replace other uses with a truncate of the widened setcc_carry.
53363 if (ReplaceOtherUses) {
53364 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
53365 N0.getValueType(), Setcc);
53366 DCI.CombineTo(N0.getNode(), Trunc);
53367 }
53368
53369 return SDValue(N, 0);
53370 }
53371
53372 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
53373 return NewCMov;
53374
53375 if (DCI.isBeforeLegalizeOps())
53376 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
53377 return V;
53378
53379 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
53380 DAG, DCI, Subtarget))
53381 return V;
53382
53383 if (VT.isVector())
53384 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
53385 return R;
53386
53387 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
53388 return NewAdd;
53389
53390 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
53391 return R;
53392
53393 // TODO: Combine with any target/faux shuffle.
53394 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
53395       VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
53396 SDValue N00 = N0.getOperand(0);
53397 SDValue N01 = N0.getOperand(1);
53398 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
53399 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
53400 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
53401 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
53402 return concatSubVectors(N00, N01, DAG, dl);
53403 }
53404 }
53405
53406 return SDValue();
53407}
53408
53409/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
53410/// pre-promote its result type since vXi1 vectors don't get promoted
53411/// during type legalization.
53412static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
53413                                        SDValue RHS, ISD::CondCode CC,
53414 const SDLoc &DL, SelectionDAG &DAG,
53415 const X86Subtarget &Subtarget) {
53416 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
53417 VT.getVectorElementType() == MVT::i1 &&
53418 (OpVT.getVectorElementType() == MVT::i8 ||
53419 OpVT.getVectorElementType() == MVT::i16)) {
53420 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
53421 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
53422 }
53423 return SDValue();
53424}
53425
53426static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
53427                            TargetLowering::DAGCombinerInfo &DCI,
53428 const X86Subtarget &Subtarget) {
53429 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
53430 const SDValue LHS = N->getOperand(0);
53431 const SDValue RHS = N->getOperand(1);
53432 EVT VT = N->getValueType(0);
53433 EVT OpVT = LHS.getValueType();
53434 SDLoc DL(N);
53435
53436 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
53437 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
53438 Subtarget))
53439 return V;
53440
53441 if (VT == MVT::i1) {
53442 X86::CondCode X86CC;
53443 if (SDValue V =
53444 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
53445 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
53446 }
53447
53448 if (OpVT.isScalarInteger()) {
53449 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
53450 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
53451 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
53452 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
53453 if (N0.getOperand(0) == N1)
53454 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
53455 N0.getOperand(1));
53456 if (N0.getOperand(1) == N1)
53457 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
53458 N0.getOperand(0));
53459 }
53460 return SDValue();
53461 };
53462 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
53463 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53464 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
53465 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53466
53467 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
53468 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
53469 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
53470 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
53471 if (N0.getOperand(0) == N1)
53472 return DAG.getNode(ISD::AND, DL, OpVT, N1,
53473 DAG.getNOT(DL, N0.getOperand(1), OpVT));
53474 if (N0.getOperand(1) == N1)
53475 return DAG.getNode(ISD::AND, DL, OpVT, N1,
53476 DAG.getNOT(DL, N0.getOperand(0), OpVT));
53477 }
53478 return SDValue();
53479 };
53480 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
53481 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53482 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
53483 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53484
53485 // cmpeq(trunc(x),C) --> cmpeq(x,C)
53486 // cmpne(trunc(x),C) --> cmpne(x,C)
53487 // iff x upper bits are zero.
53488 if (LHS.getOpcode() == ISD::TRUNCATE &&
53489 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
53490 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
53491 EVT SrcVT = LHS.getOperand(0).getValueType();
53492         APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
53493 OpVT.getScalarSizeInBits());
53494 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53495 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
53496 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
53497 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
53498 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
53499 }
53500
53501 // With C as a power of 2 and C != 0 and C != INT_MIN:
53502 // icmp eq Abs(X) C ->
53503 // (icmp eq A, C) | (icmp eq A, -C)
53504 // icmp ne Abs(X) C ->
53505 // (icmp ne A, C) & (icmp ne A, -C)
53506 // Both of these patterns can be better optimized in
53507 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
53508 // integers which is checked above.
53509 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
53510 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
53511 const APInt &CInt = C->getAPIntValue();
53512 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
53513 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
53514 SDValue BaseOp = LHS.getOperand(0);
53515 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
53516 SDValue SETCC1 = DAG.getSetCC(
53517 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
53518 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
53519 SETCC0, SETCC1);
53520 }
53521 }
53522 }
53523 }
53524 }
53525
53526 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
53527       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
53528 // Using temporaries to avoid messing up operand ordering for later
53529 // transformations if this doesn't work.
53530 SDValue Op0 = LHS;
53531 SDValue Op1 = RHS;
53532 ISD::CondCode TmpCC = CC;
53533 // Put build_vector on the right.
53534 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
53535 std::swap(Op0, Op1);
53536 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
53537 }
53538
53539 bool IsSEXT0 =
53540 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
53541 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
53542 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
53543
53544 if (IsSEXT0 && IsVZero1) {
53545 assert(VT == Op0.getOperand(0).getValueType() &&
53546 "Unexpected operand type");
53547 if (TmpCC == ISD::SETGT)
53548 return DAG.getConstant(0, DL, VT);
53549 if (TmpCC == ISD::SETLE)
53550 return DAG.getConstant(1, DL, VT);
53551 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
53552 return DAG.getNOT(DL, Op0.getOperand(0), VT);
53553
53554 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
53555 "Unexpected condition code!");
53556 return Op0.getOperand(0);
53557 }
53558 }
53559
53560   // Try and make unsigned vector comparison signed. On pre-AVX512 targets there
53561   // are only signed comparisons (`PCMPGT`), and on AVX512 it's often better to
53562   // use `PCMPGT` if the result is meant to stay in a vector (and if it's going
53563   // to a mask, there are signed AVX512 comparisons).
53564 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
53565 bool CanMakeSigned = false;
53566     if (ISD::isUnsignedIntSetCC(CC)) {
53567 KnownBits CmpKnown =
53568           DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
53569 // If we know LHS/RHS share the same sign bit at each element we can
53570 // make this signed.
53571 // NOTE: `computeKnownBits` on a vector type aggregates common bits
53572 // across all lanes. So a pattern where the sign varies from lane to
53573 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
53574 // missed. We could get around this by demanding each lane
53575 // independently, but this isn't the most important optimization and
53576 // that may eat into compile time.
53577 CanMakeSigned =
53578 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
53579 }
53580 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
53581 SDValue LHSOut = LHS;
53582 SDValue RHSOut = RHS;
53583 ISD::CondCode NewCC = CC;
53584 switch (CC) {
53585 case ISD::SETGE:
53586 case ISD::SETUGE:
53587 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
53588 /*NSW*/ true))
53589 LHSOut = NewLHS;
53590 else if (SDValue NewRHS = incDecVectorConstant(
53591 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
53592 RHSOut = NewRHS;
53593 else
53594 break;
53595
53596 [[fallthrough]];
53597 case ISD::SETUGT:
53598 NewCC = ISD::SETGT;
53599 break;
53600
53601 case ISD::SETLE:
53602 case ISD::SETULE:
53603 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
53604 /*NSW*/ true))
53605 LHSOut = NewLHS;
53606 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
53607 /*NSW*/ true))
53608 RHSOut = NewRHS;
53609 else
53610 break;
53611
53612 [[fallthrough]];
53613 case ISD::SETULT:
53614 // Will be swapped to SETGT in LowerVSETCC*.
53615 NewCC = ISD::SETLT;
53616 break;
53617 default:
53618 break;
53619 }
53620 if (NewCC != CC) {
53621 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
53622 NewCC, DL, DAG, Subtarget))
53623 return R;
53624 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
53625 }
53626 }
53627 }
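  // e.g. if both sides are known non-negative, (setuge X, splat(4)) becomes
  // (setgt X, splat(3)), which can be lowered directly with PCMPGT.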
53628
53629 if (SDValue R =
53630 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
53631 return R;
53632
53633 // In the middle end transforms:
53634 // `(or (icmp eq X, C), (icmp eq X, C+1))`
53635 // -> `(icmp ult (add x, -C), 2)`
53636 // Likewise inverted cases with `ugt`.
53637 //
53638 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
53639 // in worse codegen. So, undo the middle-end transform and go back to `(or
53640 // (icmp eq), (icmp eq))` form.
53641 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
53642 // the xmm approach.
53643 //
53644   // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
53645   // ne))` as it doesn't end up instruction-positive.
53646 // TODO: We might want to do this for avx512 as well if we `sext` the result.
53647 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
53648 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
53649 !Subtarget.hasAVX512() &&
53650 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
53651 Subtarget.hasAVX2()) &&
53652 LHS.hasOneUse()) {
53653
53654 APInt CmpC;
53655 SDValue AddC = LHS.getOperand(1);
53656 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
53658 // See which form we have depending on the constant/condition.
53659 SDValue C0 = SDValue();
53660 SDValue C1 = SDValue();
53661
53662 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
53663       // we will end up generating an additional constant. Keeping it in the
53664       // current form has a slight latency cost, but it is probably worth saving a
53665 // constant.
53668 // Pass
53669 }
53670 // Normal Cases
53671 else if ((CC == ISD::SETULT && CmpC == 2) ||
53672 (CC == ISD::SETULE && CmpC == 1)) {
53673 // These will constant fold.
53674 C0 = DAG.getNegative(AddC, DL, OpVT);
53675 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
53676 DAG.getAllOnesConstant(DL, OpVT));
53677 }
53678 // Inverted Cases
53679 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
53680 (CC == ISD::SETUGE && (-CmpC) == 2)) {
53681 // These will constant fold.
53682 C0 = DAG.getNOT(DL, AddC, OpVT);
53683 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
53684 DAG.getAllOnesConstant(DL, OpVT));
53685 }
53686 if (C0 && C1) {
53687 SDValue NewLHS =
53688 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
53689 SDValue NewRHS =
53690 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
53691 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
53692 }
53693 }
53694 }
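  // e.g. (setult (add X, splat(-5)), splat(2)) is rebuilt as
  // (or (seteq X, splat(5)), (seteq X, splat(6))), which can lower to two
  // PCMPEQs and a POR.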
53695
53696 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
53697 // to avoid scalarization via legalization because v4i32 is not a legal type.
53698 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
53699 LHS.getValueType() == MVT::v4f32)
53700 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
53701
53702 // X pred 0.0 --> X pred -X
53703 // If the negation of X already exists, use it in the comparison. This removes
53704 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
53705 // instructions in patterns with a 'select' node.
53707 SDVTList FNegVT = DAG.getVTList(OpVT);
53708 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
53709 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
53710 }
53711
53712 return SDValue();
53713}
53714
53717 const X86Subtarget &Subtarget) {
53718 SDValue Src = N->getOperand(0);
53719 MVT SrcVT = Src.getSimpleValueType();
53720 MVT VT = N->getSimpleValueType(0);
53721 unsigned NumBits = VT.getScalarSizeInBits();
53722 unsigned NumElts = SrcVT.getVectorNumElements();
53723 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
53724 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
53725
53726 // Perform constant folding.
53727 APInt UndefElts;
53728 SmallVector<APInt, 32> EltBits;
53729 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
53730 /*AllowWholeUndefs*/ true,
53731 /*AllowPartialUndefs*/ true)) {
53732 APInt Imm(32, 0);
53733 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
53734 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
53735 Imm.setBit(Idx);
53736
53737 return DAG.getConstant(Imm, SDLoc(N), VT);
53738 }
53739
53740 // Look through int->fp bitcasts that don't change the element width.
53741 unsigned EltWidth = SrcVT.getScalarSizeInBits();
53742 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
53743 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
53744 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
53745
53746 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
53747 // with scalar comparisons.
53748 if (SDValue NotSrc = IsNOT(Src, DAG)) {
53749 SDLoc DL(N);
53750 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
53751 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
53752 return DAG.getNode(ISD::XOR, DL, VT,
53753 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
53754 DAG.getConstant(NotMask, DL, VT));
53755 }
53756
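  // e.g. for v4i32: movmsk(not(X)) --> xor(movmsk(X), 0xf), flipping only the
  // four live mask bits.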
53757 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
53758 // results with scalar comparisons.
53759 if (Src.getOpcode() == X86ISD::PCMPGT &&
53760 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
53761 SDLoc DL(N);
53762 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
53763 return DAG.getNode(ISD::XOR, DL, VT,
53764 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
53765 DAG.getConstant(NotMask, DL, VT));
53766 }
53767
53768 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
53769 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
53770 // iff pow2splat(c1).
53771 // Use KnownBits to determine if only a single bit is non-zero
53772 // in each element (pow2 or zero), and shift that bit to the msb.
53773 if (Src.getOpcode() == X86ISD::PCMPEQ) {
53774 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
53775 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
53776 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
53777 if (KnownLHS.countMaxPopulation() == 1 &&
53778 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
53779 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
53780 SDLoc DL(N);
53781 MVT ShiftVT = SrcVT;
53782 SDValue ShiftLHS = Src.getOperand(0);
53783 SDValue ShiftRHS = Src.getOperand(1);
53784 if (ShiftVT.getScalarType() == MVT::i8) {
53785 // vXi8 shifts - we only care about the signbit so can use PSLLW.
53786 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
53787 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
53788 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
53789 }
53790 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
53791 ShiftLHS, ShiftAmt, DAG);
53792 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
53793 ShiftRHS, ShiftAmt, DAG);
53794 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
53795 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
53796 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
53797 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
53798 }
53799 }
53800
53801 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
53802 if (N->isOnlyUserOf(Src.getNode())) {
53803     SDValue SrcBC = peekThroughOneUseBitcasts(Src);
53804 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
53805 APInt UndefElts;
53806 SmallVector<APInt, 32> EltBits;
53807 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
53808 UndefElts, EltBits)) {
53809 APInt Mask = APInt::getZero(NumBits);
53810 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
53811 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
53812 Mask.setBit(Idx);
53813 }
53814 SDLoc DL(N);
53815 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
53816 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
53817 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
53818 DAG.getConstant(Mask, DL, VT));
53819 }
53820 }
53821 }
53822
53823 // Simplify the inputs.
53824 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53825 APInt DemandedMask(APInt::getAllOnes(NumBits));
53826 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53827 return SDValue(N, 0);
53828
53829 return SDValue();
53830}
53831
53834 const X86Subtarget &Subtarget) {
53835 MVT VT = N->getSimpleValueType(0);
53836 unsigned NumBits = VT.getScalarSizeInBits();
53837
53838 // Simplify the inputs.
53839 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53840 APInt DemandedMask(APInt::getAllOnes(NumBits));
53841 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53842 return SDValue(N, 0);
53843
53844 return SDValue();
53845}
53846
53849 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
53850 SDValue Mask = MemOp->getMask();
53851
53852 // With vector masks we only demand the upper bit of the mask.
53853 if (Mask.getScalarValueSizeInBits() != 1) {
53854 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53855 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
53856 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
53857 if (N->getOpcode() != ISD::DELETED_NODE)
53858 DCI.AddToWorklist(N);
53859 return SDValue(N, 0);
53860 }
53861 }
53862
53863 return SDValue();
53864}
53865
53866static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
53867                                    SDValue Index, SDValue Base, SDValue Scale,
53868 SelectionDAG &DAG) {
53869 SDLoc DL(GorS);
53870
53871 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
53872 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
53873 Gather->getMask(), Base, Index, Scale } ;
53874 return DAG.getMaskedGather(Gather->getVTList(),
53875 Gather->getMemoryVT(), DL, Ops,
53876 Gather->getMemOperand(),
53877 Gather->getIndexType(),
53878 Gather->getExtensionType());
53879 }
53880 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
53881 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
53882 Scatter->getMask(), Base, Index, Scale };
53883 return DAG.getMaskedScatter(Scatter->getVTList(),
53884 Scatter->getMemoryVT(), DL,
53885 Ops, Scatter->getMemOperand(),
53886 Scatter->getIndexType(),
53887 Scatter->isTruncatingStore());
53888}
53889
53890static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
53891                                    TargetLowering::DAGCombinerInfo &DCI) {
53892 SDLoc DL(N);
53893 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
53894 SDValue Index = GorS->getIndex();
53895 SDValue Base = GorS->getBasePtr();
53896 SDValue Scale = GorS->getScale();
53897 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53898
53899 if (DCI.isBeforeLegalize()) {
53900 unsigned IndexWidth = Index.getScalarValueSizeInBits();
53901
53902 // Shrink constant indices if they are larger than 32-bits.
53903 // Only do this before legalize types since v2i64 could become v2i32.
53904 // FIXME: We could check that the type is legal if we're after legalize
53905 // types, but then we would need to construct test cases where that happens.
53906 // FIXME: We could support more than just constant vectors, but we need to
53907     // be careful with costing. A truncate that can be optimized out would be fine.
53908 // Otherwise we might only want to create a truncate if it avoids a split.
53909 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
53910 if (BV->isConstant() && IndexWidth > 32 &&
53911 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
53912 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
53913 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
53914 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53915 }
53916 }
53917
53918 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
53919 // there are sufficient sign bits. Only do this before legalize types to
53920 // avoid creating illegal types in truncate.
53921 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
53922 Index.getOpcode() == ISD::ZERO_EXTEND) &&
53923 IndexWidth > 32 &&
53924 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
53925 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
53926 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
53927 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
53928 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53929 }
53930 }
53931
53932 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53933 // Try to move splat constant adders from the index operand to the base
53934 // pointer operand. Taking care to multiply by the scale. We can only do
53935 // this when index element type is the same as the pointer type.
53936 // Otherwise we need to be sure the math doesn't wrap before the scale.
53937 if (Index.getOpcode() == ISD::ADD &&
53938 Index.getValueType().getVectorElementType() == PtrVT &&
53939 isa<ConstantSDNode>(Scale)) {
53940 uint64_t ScaleAmt = Scale->getAsZExtVal();
53941 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
53942 BitVector UndefElts;
53943 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
53944 // FIXME: Allow non-constant?
53945 if (UndefElts.none()) {
53946 // Apply the scale.
53947 APInt Adder = C->getAPIntValue() * ScaleAmt;
53948 // Add it to the existing base.
53949 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
53950 DAG.getConstant(Adder, DL, PtrVT));
53951 Index = Index.getOperand(0);
53952 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53953 }
53954 }
53955
53956 // It's also possible base is just a constant. In that case, just
53957 // replace it with 0 and move the displacement into the index.
53958 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
53959 isOneConstant(Scale)) {
53960 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
53961 // Combine the constant build_vector and the constant base.
53962 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
53963 Index.getOperand(1), Splat);
53964 // Add to the LHS of the original Index add.
53965 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
53966 Index.getOperand(0), Splat);
53967 Base = DAG.getConstant(0, DL, Base.getValueType());
53968 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53969 }
53970 }
53971 }
53972
53973 if (DCI.isBeforeLegalizeOps()) {
53974 unsigned IndexWidth = Index.getScalarValueSizeInBits();
53975
53976 // Make sure the index is either i32 or i64
53977 if (IndexWidth != 32 && IndexWidth != 64) {
53978 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
53979 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
53980 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
53981 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53982 }
53983 }
53984
53985 // With vector masks we only demand the upper bit of the mask.
53986 SDValue Mask = GorS->getMask();
53987 if (Mask.getScalarValueSizeInBits() != 1) {
53988 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
53989 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
53990 if (N->getOpcode() != ISD::DELETED_NODE)
53991 DCI.AddToWorklist(N);
53992 return SDValue(N, 0);
53993 }
53994 }
53995
53996 return SDValue();
53997}
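// Editorial illustration (not part of the original source): with a constant
// splat adder, the combine above rewrites
//   gather base=B, index=add(X, splat(16)), scale=8
// into
//   gather base=B+128, index=X, scale=8
// since each lane address B + 8*(X[i]+16) equals (B+128) + 8*X[i].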
53998
53999// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
54000static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
54001 const X86Subtarget &Subtarget) {
54002 SDLoc DL(N);
54003 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
54004 SDValue EFLAGS = N->getOperand(1);
54005
54006 // Try to simplify the EFLAGS and condition code operands.
54007 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
54008 return getSETCC(CC, Flags, DL, DAG);
54009
54010 return SDValue();
54011}
54012
54013/// Optimize branch condition evaluation.
54014static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
54015 const X86Subtarget &Subtarget) {
54016 SDLoc DL(N);
54017 SDValue EFLAGS = N->getOperand(3);
54018 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
54019
54020 // Try to simplify the EFLAGS and condition code operands.
54021 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
54022 // RAUW them under us.
54023 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
54024 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
54025 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
54026 N->getOperand(1), Cond, Flags);
54027 }
54028
54029 return SDValue();
54030}
54031
54032// TODO: Could we move this to DAGCombine?
54033static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
54034 SelectionDAG &DAG) {
54035 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
54036 // to optimize away the operation when it's from a constant.
54037 //
54038 // The general transformation is:
54039 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
54040 // AND(VECTOR_CMP(x,y), constant2)
54041 // constant2 = UNARYOP(constant)
54042
54043 // Early exit if this isn't a vector operation, the operand of the
54044 // unary operation isn't a bitwise AND, or if the sizes of the operations
54045 // aren't the same.
54046 EVT VT = N->getValueType(0);
54047 bool IsStrict = N->isStrictFPOpcode();
54048 unsigned NumEltBits = VT.getScalarSizeInBits();
54049 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54050 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
54051 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
54052 VT.getSizeInBits() != Op0.getValueSizeInBits())
54053 return SDValue();
54054
54055 // Now check that the other operand of the AND is a constant. We could
54056 // make the transformation for non-constant splats as well, but it's unclear
54057 // that would be a benefit as it would not eliminate any operations, just
54058 // perform one more step in scalar code before moving to the vector unit.
54059 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
54060 // Bail out if the vector isn't a constant.
54061 if (!BV->isConstant())
54062 return SDValue();
54063
54064 // Everything checks out. Build up the new and improved node.
54065 SDLoc DL(N);
54066 EVT IntVT = BV->getValueType(0);
54067 // Create a new constant of the appropriate type for the transformed
54068 // DAG.
54069 SDValue SourceConst;
54070 if (IsStrict)
54071 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
54072 {N->getOperand(0), SDValue(BV, 0)});
54073 else
54074 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
54075 // The AND node needs bitcasts to/from an integer vector type around it.
54076 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
54077 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
54078 MaskConst);
54079 SDValue Res = DAG.getBitcast(VT, NewAnd);
54080 if (IsStrict)
54081 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
54082 return Res;
54083 }
54084
54085 return SDValue();
54086}
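// Editorial illustration (not part of the original source), for v4i32/v4f32:
//   sitofp (and (vector_cmp x, y), <1, 2, 3, 4>)
//     --> and (vector_cmp x, y), <1.0, 2.0, 3.0, 4.0>   (with bitcasts around
//         the integer AND)
// Each compare lane is 0 or -1 and sitofp(0) == 0.0, so folding the unary op
// into the constant preserves every lane.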
54087
54088/// If we are converting a value to floating-point, try to replace scalar
54089/// truncate of an extracted vector element with a bitcast. This tries to keep
54090/// the sequence on XMM registers rather than moving between vector and GPRs.
54091static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
54092 // TODO: This is currently only used by combineSIntToFP, but it is generalized
54093 // to allow being called by any similar cast opcode.
54094 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
54095 SDValue Trunc = N->getOperand(0);
54096 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
54097 return SDValue();
54098
54099 SDValue ExtElt = Trunc.getOperand(0);
54100 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54101 !isNullConstant(ExtElt.getOperand(1)))
54102 return SDValue();
54103
54104 EVT TruncVT = Trunc.getValueType();
54105 EVT SrcVT = ExtElt.getValueType();
54106 unsigned DestWidth = TruncVT.getSizeInBits();
54107 unsigned SrcWidth = SrcVT.getSizeInBits();
54108 if (SrcWidth % DestWidth != 0)
54109 return SDValue();
54110
54111 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
54112 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
54113 unsigned VecWidth = SrcVecVT.getSizeInBits();
54114 unsigned NumElts = VecWidth / DestWidth;
54115 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
54116 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
54117 SDLoc DL(N);
54118 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
54119 BitcastVec, ExtElt.getOperand(1));
54120 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
54121}
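// Editorial illustration (not part of the original source): for X : v2i64,
//   sitofp (i32 (trunc (i64 (extractelt X, 0))))
//     --> sitofp (i32 (extractelt (v4i32 (bitcast X)), 0))
// On little-endian x86 element 0 of the v4i32 bitcast is the low 32 bits of
// element 0 of X, so the value never has to leave the XMM register file.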
54122
54123static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
54124 const X86Subtarget &Subtarget) {
54125 bool IsStrict = N->isStrictFPOpcode();
54126 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54127 EVT VT = N->getValueType(0);
54128 EVT InVT = Op0.getValueType();
54129
54130 // Using i16 as an intermediate type is a bad idea, unless we have HW support
54131 // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
54132 // if hasFP16 support:
54133 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
54134 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
54135 // else
54136 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
54137 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
54138 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
54139 unsigned ScalarSize = InVT.getScalarSizeInBits();
54140 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
54141 ScalarSize >= 64)
54142 return SDValue();
54143 SDLoc dl(N);
54144 EVT DstVT =
54145 EVT::getVectorVT(*DAG.getContext(),
54146 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
54147 : ScalarSize < 32 ? MVT::i32
54148 : MVT::i64,
54149 InVT.getVectorNumElements());
54150 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
54151 if (IsStrict)
54152 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54153 {N->getOperand(0), P});
54154 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54155 }
54156
54157 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
54158 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
54159 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
54160 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
54161 VT.getScalarType() != MVT::f16) {
54162 SDLoc dl(N);
54163 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
54164 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
54165
54166 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
54167 if (IsStrict)
54168 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54169 {N->getOperand(0), P});
54170 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54171 }
54172
54173 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
54174 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
54175 // the optimization here.
54176 SDNodeFlags Flags = N->getFlags();
54177 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
54178 if (IsStrict)
54179 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
54180 {N->getOperand(0), Op0});
54181 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
54182 }
54183
54184 return SDValue();
54185}
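// Editorial illustration (not part of the original source):
//   uint_to_fp (v8i16 x) --> sint_to_fp (zero_extend x to v8i32)
// The zero-extended lanes are at most 65535, so they are non-negative as i32
// and the signed conversion produces the same result.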
54186
54187static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
54188 TargetLowering::DAGCombinerInfo &DCI,
54189 const X86Subtarget &Subtarget) {
54190 // First try to optimize away the conversion entirely when it's
54191 // conditionally from a constant. Vectors only.
54192 bool IsStrict = N->isStrictFPOpcode();
54193 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
54194 return Res;
54195
54196 // Now move on to more general possibilities.
54197 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54198 EVT VT = N->getValueType(0);
54199 EVT InVT = Op0.getValueType();
54200
54201 // Using i16 as an intermediate type is a bad idea, unless we have HW support
54202 // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
54203 // if hasFP16 support:
54204 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
54205 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
54206 // else
54207 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(SEXT(vXi1~31 to vXi32))
54208 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
54209 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
54210 unsigned ScalarSize = InVT.getScalarSizeInBits();
54211 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
54212 ScalarSize >= 64)
54213 return SDValue();
54214 SDLoc dl(N);
54215 EVT DstVT =
54216 EVT::getVectorVT(*DAG.getContext(),
54217 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
54218 : ScalarSize < 32 ? MVT::i32
54219 : MVT::i64,
54220 InVT.getVectorNumElements());
54221 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
54222 if (IsStrict)
54223 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54224 {N->getOperand(0), P});
54225 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54226 }
54227
54228 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
54229 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
54230 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
54231 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
54232 VT.getScalarType() != MVT::f16) {
54233 SDLoc dl(N);
54234 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
54235 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
54236 if (IsStrict)
54237 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54238 {N->getOperand(0), P});
54239 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54240 }
54241
54242 // Without AVX512DQ we only support i64 to float scalar conversion. For both
54243 // vectors and scalars, see if we know that the upper bits are all the sign
54244 // bit, in which case we can truncate the input to i32 and convert from that.
54245 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
54246 unsigned BitWidth = InVT.getScalarSizeInBits();
54247 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
54248 if (NumSignBits >= (BitWidth - 31)) {
54249 EVT TruncVT = MVT::i32;
54250 if (InVT.isVector())
54251 TruncVT = InVT.changeVectorElementType(TruncVT);
54252 SDLoc dl(N);
54253 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
54254 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
54255 if (IsStrict)
54256 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54257 {N->getOperand(0), Trunc});
54258 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
54259 }
54260 // If we're after legalize and the type is v2i32 we need to shuffle and
54261 // use CVTSI2P.
54262 assert(InVT == MVT::v2i64 && "Unexpected VT!");
54263 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
54264 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
54265 { 0, 2, -1, -1 });
54266 if (IsStrict)
54267 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
54268 {N->getOperand(0), Shuf});
54269 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
54270 }
54271 }
54272
54273 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
54274 // a 32-bit target where SSE doesn't support i64->FP operations.
54275 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
54276 Op0.getOpcode() == ISD::LOAD) {
54277 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
54278
54279 // This transformation is not supported if the result type is f16 or f128.
54280 if (VT == MVT::f16 || VT == MVT::f128)
54281 return SDValue();
54282
54283 // If we have AVX512DQ we can use packed conversion instructions unless
54284 // the VT is f80.
54285 if (Subtarget.hasDQI() && VT != MVT::f80)
54286 return SDValue();
54287
54288 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
54289 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
54290 std::pair<SDValue, SDValue> Tmp =
54291 Subtarget.getTargetLowering()->BuildFILD(
54292 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
54293 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
54294 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
54295 return Tmp.first;
54296 }
54297 }
54298
54299 if (IsStrict)
54300 return SDValue();
54301
54302 if (SDValue V = combineToFPTruncExtElt(N, DAG))
54303 return V;
54304
54305 return SDValue();
54306}
54307
54308static bool needCarryOrOverflowFlag(SDValue Flags) {
54309 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54310
54311 for (const SDNode *User : Flags->uses()) {
54312 X86::CondCode CC;
54313 switch (User->getOpcode()) {
54314 default:
54315 // Be conservative.
54316 return true;
54317 case X86ISD::SETCC:
54318 case X86ISD::SETCC_CARRY:
54319 CC = (X86::CondCode)User->getConstantOperandVal(0);
54320 break;
54321 case X86ISD::BRCOND:
54322 case X86ISD::CMOV:
54323 CC = (X86::CondCode)User->getConstantOperandVal(2);
54324 break;
54325 }
54326
54327 switch (CC) {
54328 // clang-format off
54329 default: break;
54330 case X86::COND_A: case X86::COND_AE:
54331 case X86::COND_B: case X86::COND_BE:
54332 case X86::COND_O: case X86::COND_NO:
54333 case X86::COND_G: case X86::COND_GE:
54334 case X86::COND_L: case X86::COND_LE:
54335 return true;
54336 // clang-format on
54337 }
54338 }
54339
54340 return false;
54341}
54342
54343static bool onlyZeroFlagUsed(SDValue Flags) {
54344 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54345
54346 for (const SDNode *User : Flags->uses()) {
54347 unsigned CCOpNo;
54348 switch (User->getOpcode()) {
54349 default:
54350 // Be conservative.
54351 return false;
54352 case X86ISD::SETCC:
54353 case X86ISD::SETCC_CARRY:
54354 CCOpNo = 0;
54355 break;
54356 case X86ISD::BRCOND:
54357 case X86ISD::CMOV:
54358 CCOpNo = 2;
54359 break;
54360 }
54361
54362 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
54363 if (CC != X86::COND_E && CC != X86::COND_NE)
54364 return false;
54365 }
54366
54367 return true;
54368}
54369
54370static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
54371 const X86Subtarget &Subtarget) {
54372 // Only handle test patterns.
54373 if (!isNullConstant(N->getOperand(1)))
54374 return SDValue();
54375
54376 // If we have a CMP of a truncated binop, see if we can make a smaller binop
54377 // and use its flags directly.
54378 // TODO: Maybe we should try promoting compares that only use the zero flag
54379 // first if we can prove the upper bits with computeKnownBits?
54380 SDLoc dl(N);
54381 SDValue Op = N->getOperand(0);
54382 EVT VT = Op.getValueType();
54383 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54384
54385 // If we have a constant logical shift that's only used in a comparison
54386 // against zero turn it into an equivalent AND. This allows turning it into
54387 // a TEST instruction later.
54388 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
54389 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
54390 onlyZeroFlagUsed(SDValue(N, 0))) {
54391 unsigned BitWidth = VT.getSizeInBits();
54392 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
54393 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
54394 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
54395 APInt Mask = Op.getOpcode() == ISD::SRL
54396 ? APInt::getHighBitsSet(BitWidth, MaskBits)
54397 : APInt::getLowBitsSet(BitWidth, MaskBits);
54398 if (Mask.isSignedIntN(32)) {
54399 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
54400 DAG.getConstant(Mask, dl, VT));
54401 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54402 DAG.getConstant(0, dl, VT));
54403 }
54404 }
54405 }
54406
54407 // If we're extracting from an avx512 bool vector and comparing against zero,
54408 // then try to just bitcast the vector to an integer to use TEST/BT directly.
54409 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
54410 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
54411 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
54412 SDValue Src = Op.getOperand(0);
54413 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
54414 isNullConstant(Src.getOperand(1)) &&
54415 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
54416 SDValue BoolVec = Src.getOperand(0);
54417 unsigned ShAmt = 0;
54418 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
54419 ShAmt = BoolVec.getConstantOperandVal(1);
54420 BoolVec = BoolVec.getOperand(0);
54421 }
54422 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
54423 EVT VecVT = BoolVec.getValueType();
54424 unsigned BitWidth = VecVT.getVectorNumElements();
54425 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
54426 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
54427 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
54428 Op = DAG.getBitcast(BCVT, BoolVec);
54429 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
54430 DAG.getConstant(Mask, dl, BCVT));
54431 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54432 DAG.getConstant(0, dl, BCVT));
54433 }
54434 }
54435 }
54436
54437 // Peek through any zero-extend if we're only testing for a zero result.
54438 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
54439 SDValue Src = Op.getOperand(0);
54440 EVT SrcVT = Src.getValueType();
54441 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
54442 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
54443 DAG.getConstant(0, dl, SrcVT));
54444 }
54445
54446 // Look for a truncate.
54447 if (Op.getOpcode() != ISD::TRUNCATE)
54448 return SDValue();
54449
54450 SDValue Trunc = Op;
54451 Op = Op.getOperand(0);
54452
54453 // See if we can compare with zero against the truncation source,
54454 // which should help using the Z flag from many ops. Only do this for
54455 // i32 truncated op to prevent partial-reg compares of promoted ops.
54456 EVT OpVT = Op.getValueType();
54457 APInt UpperBits =
54458 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
54459 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
54460 onlyZeroFlagUsed(SDValue(N, 0))) {
54461 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54462 DAG.getConstant(0, dl, OpVT));
54463 }
54464
54465 // After this the truncate and arithmetic op must have a single use.
54466 if (!Trunc.hasOneUse() || !Op.hasOneUse())
54467 return SDValue();
54468
54469 unsigned NewOpc;
54470 switch (Op.getOpcode()) {
54471 default: return SDValue();
54472 case ISD::AND:
54473 // Skip and with constant. We have special handling for and with immediate
54474 // during isel to generate test instructions.
54475 if (isa<ConstantSDNode>(Op.getOperand(1)))
54476 return SDValue();
54477 NewOpc = X86ISD::AND;
54478 break;
54479 case ISD::OR: NewOpc = X86ISD::OR; break;
54480 case ISD::XOR: NewOpc = X86ISD::XOR; break;
54481 case ISD::ADD:
54482 // If the carry or overflow flag is used, we can't truncate.
54483 if (needCarryOrOverflowFlag(SDValue(N, 0)))
54484 return SDValue();
54485 NewOpc = X86ISD::ADD;
54486 break;
54487 case ISD::SUB:
54488 // If the carry or overflow flag is used, we can't truncate.
54489 if (needCarryOrOverflowFlag(SDValue(N, 0)))
54490 return SDValue();
54491 NewOpc = X86ISD::SUB;
54492 break;
54493 }
54494
54495 // We found an op we can narrow. Truncate its inputs.
54496 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
54497 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
54498
54499 // Use a X86 specific opcode to avoid DAG combine messing with it.
54500 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54501 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
54502
54503 // For AND, keep a CMP so that we can match the test pattern.
54504 if (NewOpc == X86ISD::AND)
54505 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54506 DAG.getConstant(0, dl, VT));
54507
54508 // Return the flags.
54509 return Op.getValue(1);
54510}
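// Editorial illustration (not part of the original source):
//   cmp (i32 (trunc (i64 (or a, b)))), 0
// is narrowed above to an i32 X86ISD::OR of the truncated inputs, and the
// flag result of that OR replaces the CMP: OR sets the same ZF/SF/PF for the
// same value and clears CF/OF, just like a compare against zero.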
54511
54512static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
54513 TargetLowering::DAGCombinerInfo &DCI) {
54514 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
54515 "Expected X86ISD::ADD or X86ISD::SUB");
54516
54517 SDLoc DL(N);
54518 SDValue LHS = N->getOperand(0);
54519 SDValue RHS = N->getOperand(1);
54520 MVT VT = LHS.getSimpleValueType();
54521 bool IsSub = X86ISD::SUB == N->getOpcode();
54522 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
54523
54524 // If we don't use the flag result, simplify back to a generic ADD/SUB.
54525 if (!N->hasAnyUseOfValue(1)) {
54526 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
54527 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
54528 }
54529
54530 // Fold any similar generic ADD/SUB opcodes to reuse this node.
54531 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
54532 SDValue Ops[] = {N0, N1};
54533 SDVTList VTs = DAG.getVTList(N->getValueType(0));
54534 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
54535 SDValue Op(N, 0);
54536 if (Negate)
54537 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
54538 DCI.CombineTo(GenericAddSub, Op);
54539 }
54540 };
54541 MatchGeneric(LHS, RHS, false);
54542 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
54543
54544 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
54545 // EFLAGS result doesn't change.
54546 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
54547 /*ZeroSecondOpOnly*/ true);
54548}
54549
54550static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
54551 SDValue LHS = N->getOperand(0);
54552 SDValue RHS = N->getOperand(1);
54553 SDValue BorrowIn = N->getOperand(2);
54554
54555 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
54556 MVT VT = N->getSimpleValueType(0);
54557 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54558 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
54559 }
54560
54561 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
54562 // iff the flag result is dead.
54563 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
54564 !N->hasAnyUseOfValue(1))
54565 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
54566 LHS.getOperand(1), BorrowIn);
54567
54568 return SDValue();
54569}
54570
54571// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
54572static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
54573 TargetLowering::DAGCombinerInfo &DCI) {
54574 SDValue LHS = N->getOperand(0);
54575 SDValue RHS = N->getOperand(1);
54576 SDValue CarryIn = N->getOperand(2);
54577 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
54578 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
54579
54580 // Canonicalize constant to RHS.
54581 if (LHSC && !RHSC)
54582 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
54583 CarryIn);
54584
54585 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
54586 // the result is either zero or one (depending on the input carry bit).
54587 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
54588 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
54589 // We don't have a good way to replace an EFLAGS use, so only do this when
54590 // dead right now.
54591 SDValue(N, 1).use_empty()) {
54592 SDLoc DL(N);
54593 EVT VT = N->getValueType(0);
54594 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
54595 SDValue Res1 = DAG.getNode(
54596 ISD::AND, DL, VT,
54597 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
54598 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
54599 DAG.getConstant(1, DL, VT));
54600 return DCI.CombineTo(N, Res1, CarryOut);
54601 }
54602
54603 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
54604 // iff the flag result is dead.
54605 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
54606 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
54607 SDLoc DL(N);
54608 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
54609 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
54610 DAG.getConstant(0, DL, LHS.getValueType()),
54611 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
54612 }
54613
54614 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
54615 MVT VT = N->getSimpleValueType(0);
54616 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54617 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
54618 }
54619
54620 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
54621 // iff the flag result is dead.
54622 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
54623 !N->hasAnyUseOfValue(1))
54624 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
54625 LHS.getOperand(1), CarryIn);
54626
54627 return SDValue();
54628}
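// Editorial illustration (not part of the original source): with the flag
// result unused,
//   ADC(5, 7, Carry) --> ADC(0, 12, Carry)
// Both compute 12 + Carry; folding the constants frees one register.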
54629
54630static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
54631 const SDLoc &DL, EVT VT,
54632 const X86Subtarget &Subtarget) {
54633 // Example of pattern we try to detect:
54634 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
54635 //(add (build_vector (extract_elt t, 0),
54636 // (extract_elt t, 2),
54637 // (extract_elt t, 4),
54638 // (extract_elt t, 6)),
54639 // (build_vector (extract_elt t, 1),
54640 // (extract_elt t, 3),
54641 // (extract_elt t, 5),
54642 // (extract_elt t, 7)))
54643
54644 if (!Subtarget.hasSSE2())
54645 return SDValue();
54646
54647 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
54648 Op1.getOpcode() != ISD::BUILD_VECTOR)
54649 return SDValue();
54650
54651 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
54652 VT.getVectorNumElements() < 4 ||
54653 !isPowerOf2_32(VT.getVectorNumElements()))
54654 return SDValue();
54655
54656 // Check if one of Op0,Op1 is of the form:
54657 // (build_vector (extract_elt Mul, 0),
54658 // (extract_elt Mul, 2),
54659 // (extract_elt Mul, 4),
54660 // ...
54661 // the other is of the form:
54662 // (build_vector (extract_elt Mul, 1),
54663 // (extract_elt Mul, 3),
54664 // (extract_elt Mul, 5),
54665 // ...
54666 // and identify Mul.
54667 SDValue Mul;
54668 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
54669 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
54670 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
54671 // TODO: Be more tolerant to undefs.
54672 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54673 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54674 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54675 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54676 return SDValue();
54677 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
54678 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
54679 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
54680 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
54681 if (!Const0L || !Const1L || !Const0H || !Const1H)
54682 return SDValue();
54683 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
54684 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
54685 // Commutativity of mul allows factors of a product to reorder.
54686 if (Idx0L > Idx1L)
54687 std::swap(Idx0L, Idx1L);
54688 if (Idx0H > Idx1H)
54689 std::swap(Idx0H, Idx1H);
54690 // Commutativity of add allows pairs of factors to reorder.
54691 if (Idx0L > Idx0H) {
54692 std::swap(Idx0L, Idx0H);
54693 std::swap(Idx1L, Idx1H);
54694 }
54695 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
54696 Idx1H != 2 * i + 3)
54697 return SDValue();
54698 if (!Mul) {
54699 // First time an extract_elt's source vector is visited. Must be a MUL
54700 // with 2X number of vector elements than the BUILD_VECTOR.
54701 // Both extracts must be from same MUL.
54702 Mul = Op0L->getOperand(0);
54703 if (Mul->getOpcode() != ISD::MUL ||
54704 Mul.getValueType().getVectorNumElements() != 2 * e)
54705 return SDValue();
54706 }
54707 // Check that the extract is from the same MUL previously seen.
54708 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
54709 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
54710 return SDValue();
54711 }
54712
54713 // Check if the Mul source can be safely shrunk.
54714 ShrinkMode Mode;
54715 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
54716 Mode == ShrinkMode::MULU16)
54717 return SDValue();
54718
54719 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54720 VT.getVectorNumElements() * 2);
54721 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
54722 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
54723
54724 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54725 ArrayRef<SDValue> Ops) {
54726 EVT InVT = Ops[0].getValueType();
54727 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54728 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
54729 InVT.getVectorNumElements() / 2);
54730 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
54731 };
54732 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
54733}
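// Editorial reminder (not part of the original source): VPMADDWD computes
//   Res[i] = A[2*i] * B[2*i] + A[2*i+1] * B[2*i+1]
// on sign-extended i16 elements with i32 results, which is exactly the
// widened-multiply plus pairwise-add pattern matched above.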
54734
54735// Attempt to turn this pattern into PMADDWD.
54736// (add (mul (sext (build_vector)), (sext (build_vector))),
54737// (mul (sext (build_vector)), (sext (build_vector)))
54738static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
54739 const SDLoc &DL, EVT VT,
54740 const X86Subtarget &Subtarget) {
54741 if (!Subtarget.hasSSE2())
54742 return SDValue();
54743
54744 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54745 return SDValue();
54746
54747 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
54748 VT.getVectorNumElements() < 4 ||
54749 !isPowerOf2_32(VT.getVectorNumElements()))
54750 return SDValue();
54751
54752 SDValue N00 = N0.getOperand(0);
54753 SDValue N01 = N0.getOperand(1);
54754 SDValue N10 = N1.getOperand(0);
54755 SDValue N11 = N1.getOperand(1);
54756
54757 // All inputs need to be sign extends.
54758 // TODO: Support ZERO_EXTEND from known positive?
54759 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
54760 N01.getOpcode() != ISD::SIGN_EXTEND ||
54761 N10.getOpcode() != ISD::SIGN_EXTEND ||
54762 N11.getOpcode() != ISD::SIGN_EXTEND)
54763 return SDValue();
54764
54765 // Peek through the extends.
54766 N00 = N00.getOperand(0);
54767 N01 = N01.getOperand(0);
54768 N10 = N10.getOperand(0);
54769 N11 = N11.getOperand(0);
54770
54771 // Must be extending from vXi16.
54772 EVT InVT = N00.getValueType();
54773 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
54774 N10.getValueType() != InVT || N11.getValueType() != InVT)
54775 return SDValue();
54776
54777 // All inputs should be build_vectors.
54778 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54779 N01.getOpcode() != ISD::BUILD_VECTOR ||
54780 N10.getOpcode() != ISD::BUILD_VECTOR ||
54781 N11.getOpcode() != ISD::BUILD_VECTOR)
54782 return SDValue();
54783
54784 // For each element, we need to ensure we have an odd element from one vector
54785 // multiplied by the odd element of another vector and the even element from
54786 // one of the same vectors being multiplied by the even element from the
54787 // other vector. So we need to make sure for each element i, this operator
54788 // is being performed:
54789 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54790 SDValue In0, In1;
54791 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
54792 SDValue N00Elt = N00.getOperand(i);
54793 SDValue N01Elt = N01.getOperand(i);
54794 SDValue N10Elt = N10.getOperand(i);
54795 SDValue N11Elt = N11.getOperand(i);
54796 // TODO: Be more tolerant to undefs.
54797 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54798 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54799 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54800 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54801 return SDValue();
54802 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54803 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54804 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54805 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54806 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54807 return SDValue();
54808 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54809 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54810 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54811 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54812 // Add is commutative so indices can be reordered.
54813 if (IdxN00 > IdxN10) {
54814 std::swap(IdxN00, IdxN10);
54815 std::swap(IdxN01, IdxN11);
54816 }
54817 // N0 indices must be the even element. N1 indices must be the next odd element.
54818 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54819 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54820 return SDValue();
54821 SDValue N00In = N00Elt.getOperand(0);
54822 SDValue N01In = N01Elt.getOperand(0);
54823 SDValue N10In = N10Elt.getOperand(0);
54824 SDValue N11In = N11Elt.getOperand(0);
54825
54826 // First time we find an input capture it.
54827 if (!In0) {
54828 In0 = N00In;
54829 In1 = N01In;
54830
54831 // The input vectors must be at least as wide as the output.
54832 // If they are larger than the output, we extract subvector below.
54833 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
54834 In1.getValueSizeInBits() < VT.getSizeInBits())
54835 return SDValue();
54836 }
54837 // Mul is commutative so the input vectors can be in any order.
54838 // Canonicalize to make the compares easier.
54839 if (In0 != N00In)
54840 std::swap(N00In, N01In);
54841 if (In0 != N10In)
54842 std::swap(N10In, N11In);
54843 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
54844 return SDValue();
54845 }
54846
54847 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54848 ArrayRef<SDValue> Ops) {
54849 EVT OpVT = Ops[0].getValueType();
54850 assert(OpVT.getScalarType() == MVT::i16 &&
54851 "Unexpected scalar element type");
54852 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
54853 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
54854 OpVT.getVectorNumElements() / 2);
54855 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
54856 };
54857
54858 // If the output is narrower than an input, extract the low part of the input
54859 // vector.
54860 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54861 VT.getVectorNumElements() * 2);
54862 if (OutVT16.bitsLT(In0.getValueType())) {
54863 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
54864 DAG.getIntPtrConstant(0, DL));
54865 }
54866 if (OutVT16.bitsLT(In1.getValueType())) {
54867 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
54868 DAG.getIntPtrConstant(0, DL));
54869 }
54870 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
54871 PMADDBuilder);
54872}
54873
54874// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
54876// If the upper element in each pair of both VPMADDWD is zero then we can merge
54876// the operand elements and use the implicit add of VPMADDWD.
54877// TODO: Add support for VPMADDUBSW (which isn't commutable).
54878static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
54879 const SDLoc &DL, EVT VT) {
54880 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
54881 return SDValue();
54882
54883 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
54884 if (VT.getSizeInBits() > 128)
54885 return SDValue();
54886
54887 unsigned NumElts = VT.getVectorNumElements();
54888 MVT OpVT = N0.getOperand(0).getSimpleValueType();
54889 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
54890 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
54891
54892 bool Op0HiZero =
54893 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
54894 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
54895 bool Op1HiZero =
54896 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
54897 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
54898
54899 // TODO: Check for zero lower elements once we have actual codegen that
54900 // creates them.
54901 if (!Op0HiZero || !Op1HiZero)
54902 return SDValue();
54903
54904 // Create a shuffle mask packing the lower elements from each VPMADDWD.
54905 SmallVector<int> Mask;
54906 for (int i = 0; i != (int)NumElts; ++i) {
54907 Mask.push_back(2 * i);
54908 Mask.push_back(2 * (i + NumElts));
54909 }
54910
54911 SDValue LHS =
54912 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
54913 SDValue RHS =
54914 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
54915 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
54916}
54917
54918/// CMOV of constants requires materializing constant operands in registers.
54919/// Try to fold those constants into an 'add' instruction to reduce instruction
54920/// count. We do this with CMOV rather than the generic 'select' because there are
54921/// earlier folds that may be used to turn select-of-constants into logic hacks.
54922static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
54923 const X86Subtarget &Subtarget) {
54924 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
54925 // better because we eliminate 1-2 instructions. This transform is still
54926 // an improvement without zero operands because we trade 2 move constants and
54927 // 1 add for 2 adds (LEA) as long as the constants can be represented as
54928 // immediate asm operands (fit in 32-bits).
54929 auto isSuitableCmov = [](SDValue V) {
54930 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
54931 return false;
54932 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
54933 !isa<ConstantSDNode>(V.getOperand(1)))
54934 return false;
54935 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
54936 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
54937 V.getConstantOperandAPInt(1).isSignedIntN(32));
54938 };
54939
54940 // Match an appropriate CMOV as the first operand of the add.
54941 SDValue Cmov = N->getOperand(0);
54942 SDValue OtherOp = N->getOperand(1);
54943 if (!isSuitableCmov(Cmov))
54944 std::swap(Cmov, OtherOp);
54945 if (!isSuitableCmov(Cmov))
54946 return SDValue();
54947
54948 // Don't remove a load folding opportunity for the add. That would neutralize
54949 // any improvements from removing constant materializations.
54950 if (X86::mayFoldLoad(OtherOp, Subtarget))
54951 return SDValue();
54952
54953 EVT VT = N->getValueType(0);
54954 SDLoc DL(N);
54955 SDValue FalseOp = Cmov.getOperand(0);
54956 SDValue TrueOp = Cmov.getOperand(1);
54957
54958 // We will push the add through the select, but we can potentially do better
54959 // if we know there is another add in the sequence and this is pointer math.
54960 // In that case, we can absorb an add into the trailing memory op and avoid
54961 // a 3-operand LEA which is likely slower than a 2-operand LEA.
54962 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
54963 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
54964 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
54965 all_of(N->uses(), [&](SDNode *Use) {
54966 auto *MemNode = dyn_cast<MemSDNode>(Use);
54967 return MemNode && MemNode->getBasePtr().getNode() == N;
54968 })) {
54969 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
54970 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
54971 // it is possible that choosing op1 might be better.
54972 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
54973 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
54974 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
54975 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
54976 Cmov.getOperand(2), Cmov.getOperand(3));
54977 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
54978 }
54979
54980 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
54981 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
54982 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
54983 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
54984 Cmov.getOperand(3));
54985}
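// Editorial illustration (not part of the original source):
//   add (cmov 10, 20), x --> cmov (add x, 10), (add x, 20)
// The constants 10 and 20 can then be folded as LEA displacements instead of
// being materialized into registers first.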
54986
54987static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
54988 TargetLowering::DAGCombinerInfo &DCI,
54989 const X86Subtarget &Subtarget) {
54990 EVT VT = N->getValueType(0);
54991 SDValue Op0 = N->getOperand(0);
54992 SDValue Op1 = N->getOperand(1);
54993 SDLoc DL(N);
54994
54995 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
54996 return Select;
54997
54998 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
54999 return MAdd;
55000 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
55001 return MAdd;
55002 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
55003 return MAdd;
55004
55005 // Try to synthesize horizontal adds from adds of shuffles.
55006 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55007 return V;
55008
55009 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
55010 // iff X and Y won't overflow.
55011 if (Op0.getOpcode() == X86ISD::PSADBW && Op1.getOpcode() == X86ISD::PSADBW &&
55012 ISD::isBuildVectorAllZeros(Op0.getOperand(1).getNode()) &&
55013 ISD::isBuildVectorAllZeros(Op1.getOperand(1).getNode())) {
55014 if (DAG.willNotOverflowAdd(false, Op0.getOperand(0), Op1.getOperand(0))) {
55015 MVT OpVT = Op0.getOperand(1).getSimpleValueType();
55016 SDValue Sum =
55017 DAG.getNode(ISD::ADD, DL, OpVT, Op0.getOperand(0), Op1.getOperand(0));
55018 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
55019 getZeroVector(OpVT, Subtarget, DAG, DL));
55020 }
55021 }
55022
55023 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
55024 // (sub Y, (sext (vXi1 X))).
55025 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
55026 // generic DAG combine without a legal type check, but adding this there
55027 // caused regressions.
55028 if (VT.isVector()) {
55029 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55030 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
55031 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55032 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
55033 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
55034 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
55035 }
55036
55037 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
55038 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55039 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
55040 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
55041 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
55042 }
55043 }
55044
55045 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
55046 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
55047 X86::isZeroNode(Op0.getOperand(1))) {
55048 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
55049 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
55050 Op0.getOperand(0), Op0.getOperand(2));
55051 }
55052
55053 return combineAddOrSubToADCOrSBB(N, DAG);
55054}
55055
55056// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
55057// condition comes from the subtract node that produced -X. This matches the
55058// cmov expansion for absolute value. By swapping the operands we convert abs
55059// to nabs.
55060static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
55061 SDValue N0 = N->getOperand(0);
55062 SDValue N1 = N->getOperand(1);
55063
55064 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
55065 return SDValue();
55066
55067 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
55068 if (CC != X86::COND_S && CC != X86::COND_NS)
55069 return SDValue();
55070
55071 // Condition should come from a negate operation.
55072 SDValue Cond = N1.getOperand(3);
55073 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
55074 return SDValue();
55075 assert(Cond.getResNo() == 1 && "Unexpected result number");
55076
55077 // Get the X and -X from the negate.
55078 SDValue NegX = Cond.getValue(0);
55079 SDValue X = Cond.getOperand(1);
55080
55081 SDValue FalseOp = N1.getOperand(0);
55082 SDValue TrueOp = N1.getOperand(1);
55083
55084 // Cmov operands should be X and NegX. Order doesn't matter.
55085 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
55086 return SDValue();
55087
55088 // Build a new CMOV with the operands swapped.
55089 SDLoc DL(N);
55090 MVT VT = N->getSimpleValueType(0);
55091 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
55092 N1.getOperand(2), Cond);
55093 // Convert sub to add.
55094 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
55095}
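// Editorial note (not part of the original source): the cmov above selects
// between x and -x (abs or nabs). Swapping its operands selects the negated
// value, so "sub y, cmov" becomes "add y, cmov'" with no extra instructions.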
55096
55097static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
55098 SDValue Op0 = N->getOperand(0);
55099 SDValue Op1 = N->getOperand(1);
55100
55101 // (sub C (zero_extend (setcc)))
55102 // =>
55103 // (add (zero_extend (setcc inverted) C-1)) if C is a nonzero immediate
55104 // Don't disturb (sub 0 setcc), which is easily done with neg.
55105 EVT VT = N->getValueType(0);
55106 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
55107 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
55108 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
55109 Op1.getOperand(0).hasOneUse()) {
55110 SDValue SetCC = Op1.getOperand(0);
55111 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
55112 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
55113 APInt NewImm = Op0C->getAPIntValue() - 1;
55114 SDLoc DL(Op1);
55115 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
55116 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
55117 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
55118 DAG.getConstant(NewImm, DL, VT));
55119 }
55120
55121 return SDValue();
55122}
55123
55124static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
55125 TargetLowering::DAGCombinerInfo &DCI,
55126 const X86Subtarget &Subtarget) {
55127 SDValue Op0 = N->getOperand(0);
55128 SDValue Op1 = N->getOperand(1);
55129
55130 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
55131 auto IsNonOpaqueConstant = [&](SDValue Op) {
55132 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
55133 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
55134 return !Cst->isOpaque();
55135 return true;
55136 }
55137 return false;
55138 };
55139
55140 // X86 can't encode an immediate LHS of a sub. See if we can push the
55141 // negation into a preceding instruction. If the RHS of the sub is a XOR with
55142 // one use and a constant, invert the immediate, saving one register.
55143 // However, ignore cases where C1 is 0, as those will become a NEG.
55144 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
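// Editorial check (not part of the original source): in two's complement
// -(X ^ C2) == (X ^ ~C2) + 1, so C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1),
// which is the fold implemented below.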
55145 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
55146 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
55147 Op1->hasOneUse()) {
55148 SDLoc DL(N);
55149 EVT VT = Op0.getValueType();
55150 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
55151 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
55152 SDValue NewAdd =
55153 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
55154 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
55155 }
55156
55157 if (SDValue V = combineSubABS(N, DAG))
55158 return V;
55159
55160 // Try to synthesize horizontal subs from subs of shuffles.
55161 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55162 return V;
55163
55164 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
55165 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
55166 X86::isZeroNode(Op1.getOperand(1))) {
55167 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55168 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
55169 Op1.getOperand(0), Op1.getOperand(2));
55170 }
55171
55172 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
55173 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
55174 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
55175 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
55176 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55177 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
55178 Op1.getOperand(1), Op1.getOperand(2));
55179 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
55180 Op1.getOperand(0));
55181 }
55182
55183 if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
55184 return V;
55185
55186 if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
55187 return V;
55188
55189 return combineSubSetcc(N, DAG);
55190}
55191
55192static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
55193 const X86Subtarget &Subtarget) {
55194 MVT VT = N->getSimpleValueType(0);
55195 SDLoc DL(N);
55196
55197 if (N->getOperand(0) == N->getOperand(1)) {
55198 if (N->getOpcode() == X86ISD::PCMPEQ)
55199 return DAG.getConstant(-1, DL, VT);
55200 if (N->getOpcode() == X86ISD::PCMPGT)
55201 return DAG.getConstant(0, DL, VT);
55202 }
55203
55204 return SDValue();
55205}
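// Editorial illustration (not part of the original source):
//   pcmpeq x, x --> all-ones,   pcmpgt x, x --> all-zeros
// Every lane is equal to itself and never greater than itself, so the result
// is a constant regardless of x.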
55206
55207/// Helper that combines an array of subvector ops as if they were the operands
55208/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
55209/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
55210static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
55211 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
55212 TargetLowering::DAGCombinerInfo &DCI,
55213 const X86Subtarget &Subtarget) {
55214 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
55215 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55216
55217 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
55218 return DAG.getUNDEF(VT);
55219
55220 if (llvm::all_of(Ops, [](SDValue Op) {
55221 return ISD::isBuildVectorAllZeros(Op.getNode());
55222 }))
55223 return getZeroVector(VT, Subtarget, DAG, DL);
55224
55225 SDValue Op0 = Ops[0];
55226 bool IsSplat = llvm::all_equal(Ops);
55227 unsigned NumOps = Ops.size();
55228 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55229 LLVMContext &Ctx = *DAG.getContext();
55230
55231 // Repeated subvectors.
55232 if (IsSplat &&
55233 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
55234 // If this broadcast is inserted into both halves, use a larger broadcast.
55235 if (Op0.getOpcode() == X86ISD::VBROADCAST)
55236 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
55237
55238 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
55239 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
55240 (Subtarget.hasAVX2() ||
55241 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
55242 VT.getScalarType(), Subtarget)))
55243 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
55244 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
55245 Op0.getOperand(0),
55246 DAG.getIntPtrConstant(0, DL)));
55247
55248 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
55249 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
55250 (Subtarget.hasAVX2() ||
55251 (EltSizeInBits >= 32 &&
55252 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
55253 Op0.getOperand(0).getValueType() == VT.getScalarType())
55254 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
55255
55256 // concat_vectors(extract_subvector(broadcast(x)),
55257 // extract_subvector(broadcast(x))) -> broadcast(x)
55258 // concat_vectors(extract_subvector(subv_broadcast(x)),
55259 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
55260 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55261 Op0.getOperand(0).getValueType() == VT) {
55262 SDValue SrcVec = Op0.getOperand(0);
55263 if (SrcVec.getOpcode() == X86ISD::VBROADCAST ||
55264 SrcVec.getOpcode() == X86ISD::VBROADCAST_LOAD)
55265 return Op0.getOperand(0);
55266 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
55267 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
55268 return Op0.getOperand(0);
55269 }
55270
55271 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
55272 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
55273 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
55274 return DAG.getNode(Op0.getOpcode(), DL, VT,
55275 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
55276 Op0.getOperand(0), Op0.getOperand(0)),
55277 Op0.getOperand(1));
55278 }
55279
55280 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
55281 // Only concat of subvector high halves which vperm2x128 is best at.
55282 // TODO: This should go in combineX86ShufflesRecursively eventually.
55283 if (VT.is256BitVector() && NumOps == 2) {
55284 SDValue Src0 = peekThroughBitcasts(Ops[0]);
55285 SDValue Src1 = peekThroughBitcasts(Ops[1]);
55286 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55287 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
55288 EVT SrcVT0 = Src0.getOperand(0).getValueType();
55289 EVT SrcVT1 = Src1.getOperand(0).getValueType();
55290 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
55291 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
55292 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
55293 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
55294 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
55295 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
55296 DAG.getBitcast(VT, Src0.getOperand(0)),
55297 DAG.getBitcast(VT, Src1.getOperand(0)),
55298 DAG.getTargetConstant(0x31, DL, MVT::i8));
55299 }
55300 }
55301 }
55302
55303 // Repeated opcode.
55304 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
55305 // but it currently struggles with different vector widths.
55306 if (llvm::all_of(Ops, [Op0](SDValue Op) {
55307 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
55308 })) {
55309 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
55310 SmallVector<SDValue> Subs;
55311 for (SDValue SubOp : SubOps)
55312 Subs.push_back(SubOp.getOperand(I));
55313 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
55314 };
55315 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
55316 bool AllConstants = true;
55317 bool AllSubVectors = true;
55318 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
55319 SDValue Sub = SubOps[I].getOperand(Op);
55320 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
55321 SDValue BC = peekThroughBitcasts(Sub);
55322 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
55323 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
55324 AllSubVectors &= Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55325 Sub.getOperand(0).getValueType() == VT &&
55326 Sub.getConstantOperandAPInt(1) == (I * NumSubElts);
55327 }
55328 return AllConstants || AllSubVectors;
55329 };
55330
55331 switch (Op0.getOpcode()) {
55332 case X86ISD::VBROADCAST: {
55333 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
55334 return Op.getOperand(0).getValueType().is128BitVector();
55335 })) {
55336 if (VT == MVT::v4f64 || VT == MVT::v4i64)
55337 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
55338 ConcatSubOperand(VT, Ops, 0),
55339 ConcatSubOperand(VT, Ops, 0));
55340 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
55341 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
55342 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
55343 : X86ISD::PSHUFD,
55344 DL, VT, ConcatSubOperand(VT, Ops, 0),
55345 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
55346 }
55347 break;
55348 }
55349 case X86ISD::MOVDDUP:
55350 case X86ISD::MOVSHDUP:
55351 case X86ISD::MOVSLDUP: {
55352 if (!IsSplat)
55353 return DAG.getNode(Op0.getOpcode(), DL, VT,
55354 ConcatSubOperand(VT, Ops, 0));
55355 break;
55356 }
55357 case X86ISD::SHUFP: {
55358 // Add SHUFPD support if/when necessary.
55359 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
55360 llvm::all_of(Ops, [Op0](SDValue Op) {
55361 return Op.getOperand(2) == Op0.getOperand(2);
55362 })) {
55363 return DAG.getNode(Op0.getOpcode(), DL, VT,
55364 ConcatSubOperand(VT, Ops, 0),
55365 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55366 }
55367 break;
55368 }
55369 case X86ISD::UNPCKH:
55370 case X86ISD::UNPCKL: {
55371 // Don't concatenate build_vector patterns.
55372 if (!IsSplat && EltSizeInBits >= 32 &&
55373 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55374 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55375 none_of(Ops, [](SDValue Op) {
55376 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
55377 ISD::BUILD_VECTOR ||
55378 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
55379 ISD::BUILD_VECTOR;
55380 })) {
55381 return DAG.getNode(Op0.getOpcode(), DL, VT,
55382 ConcatSubOperand(VT, Ops, 0),
55383 ConcatSubOperand(VT, Ops, 1));
55384 }
55385 break;
55386 }
55387 case X86ISD::PSHUFHW:
55388 case X86ISD::PSHUFLW:
55389 case X86ISD::PSHUFD:
55390 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
55391 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
55392 return DAG.getNode(Op0.getOpcode(), DL, VT,
55393 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55394 }
55395 [[fallthrough]];
55396 case X86ISD::VPERMILPI:
55397 if (!IsSplat && EltSizeInBits == 32 &&
55398 (VT.is256BitVector() ||
55399 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55400 all_of(Ops, [&Op0](SDValue Op) {
55401 return Op0.getOperand(1) == Op.getOperand(1);
55402 })) {
55403 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
55404 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
55405 Res =
55406 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
55407 return DAG.getBitcast(VT, Res);
55408 }
55409 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
55410 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
55411 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
55412 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
55413 return DAG.getNode(Op0.getOpcode(), DL, VT,
55414 ConcatSubOperand(VT, Ops, 0),
55415 DAG.getTargetConstant(Idx, DL, MVT::i8));
55416 }
55417 break;
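// PSHUFB and PSADBW never move data across 128-bit lane boundaries, so the
// widened operation is equivalent to concatenating the per-subvector results.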
55418 case X86ISD::PSHUFB:
55419 case X86ISD::PSADBW:
55420 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55421 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
55422 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
55423 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
55424 NumOps * SrcVT.getVectorNumElements());
55425 return DAG.getNode(Op0.getOpcode(), DL, VT,
55426 ConcatSubOperand(SrcVT, Ops, 0),
55427 ConcatSubOperand(SrcVT, Ops, 1));
55428 }
55429 break;
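// Widen two VPERMV nodes into one: concatenate the shuffle masks (offsetting
// the second op's indices by the subvector element count) and permute the
// concatenated source with a single wide VPERMV.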
55430 case X86ISD::VPERMV:
55431 if (!IsSplat && NumOps == 2 &&
55432 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
55433 MVT OpVT = Op0.getSimpleValueType();
55434 int NumSrcElts = OpVT.getVectorNumElements();
55435 SmallVector<int, 64> ConcatMask;
55436 for (unsigned i = 0; i != NumOps; ++i) {
55437 SmallVector<int, 64> SubMask;
55438 SmallVector<SDValue, 2> SubOps;
55439 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
55440 break;
55441 for (int M : SubMask) {
55442 if (0 <= M)
55443 M += i * NumSrcElts;
55444 ConcatMask.push_back(M);
55445 }
55446 }
55447 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
55448 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
55449 Ops[1].getOperand(1), DAG, DL);
55450 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
55451 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
55452 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
55453 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
55454 }
55455 }
55456 break;
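// Merge two VPERMV3 nodes: concatenate both pairs of sources and rebase each
// mask element so it still addresses the same data in the widened sources.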
55457 case X86ISD::VPERMV3:
55458 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
55459 MVT OpVT = Op0.getSimpleValueType();
55460 int NumSrcElts = OpVT.getVectorNumElements();
55461 SmallVector<int, 64> ConcatMask;
55462 for (unsigned i = 0; i != NumOps; ++i) {
55463 SmallVector<int, 64> SubMask;
55464 SmallVector<SDValue, 2> SubOps;
55465 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
55466 break;
55467 for (int M : SubMask) {
55468 if (0 <= M) {
55469 M += M < NumSrcElts ? 0 : NumSrcElts;
55470 M += i * NumSrcElts;
55471 }
55472 ConcatMask.push_back(M);
55473 }
55474 }
55475 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
55476 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
55477 Ops[1].getOperand(0), DAG, DL);
55478 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
55479 Ops[1].getOperand(2), DAG, DL);
55480 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
55481 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
55482 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
55483 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
55484 }
55485 }
55486 break;
55487 case X86ISD::VPERM2X128: {
55488 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
55489 assert(NumOps == 2 && "Bad concat_vectors operands");
55490 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
55491 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
55492 // TODO: Handle zero'd subvectors.
55493 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
55494 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
55495 (int)((Imm1 >> 4) & 0x3)};
55496 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
55497 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
55498 Ops[0].getOperand(1), DAG, DL);
55499 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
55500 Ops[1].getOperand(1), DAG, DL);
55501 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
55502 DAG.getBitcast(ShuffleVT, LHS),
55503 DAG.getBitcast(ShuffleVT, RHS),
55504 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
55505 return DAG.getBitcast(VT, Res);
55506 }
55507 }
55508 break;
55509 }
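// Merge two 256-bit SHUF128 nodes over the concatenated source pairs by
// rebuilding the 128-bit lane-select immediate for the 512-bit result.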
55510 case X86ISD::SHUF128: {
55511 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
55512 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
55513 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
55514 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
55515 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
55516 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
55517 Ops[0].getOperand(1), DAG, DL);
55518 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
55519 Ops[1].getOperand(1), DAG, DL);
55520 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
55521 DAG.getTargetConstant(Imm, DL, MVT::i8));
55522 }
55523 break;
55524 }
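// Fold concat(trunc(x), trunc(y)) -> trunc(concat(x, y)) when AVX512 allows
// the wider 512-bit to 256-bit truncation.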
55525 case ISD::TRUNCATE:
55526 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
55527 EVT SrcVT = Ops[0].getOperand(0).getValueType();
55528 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
55529 SrcVT == Ops[1].getOperand(0).getValueType() &&
55530 Subtarget.useAVX512Regs() &&
55531 Subtarget.getPreferVectorWidth() >= 512 &&
55532 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
55533 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
55534 return DAG.getNode(ISD::TRUNCATE, DL, VT,
55535 ConcatSubOperand(NewSrcVT, Ops, 0));
55536 }
55537 }
55538 break;
55539 case X86ISD::VSHLI:
55540 case X86ISD::VSRLI:
55541 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
55542 // TODO: Move this to LowerShiftByScalarImmediate?
55543 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
55544 llvm::all_of(Ops, [](SDValue Op) {
55545 return Op.getConstantOperandAPInt(1) == 32;
55546 })) {
55547 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
55548 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
55549 if (Op0.getOpcode() == X86ISD::VSHLI) {
55550 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
55551 {8, 0, 8, 2, 8, 4, 8, 6});
55552 } else {
55553 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
55554 {1, 8, 3, 8, 5, 8, 7, 8});
55555 }
55556 return DAG.getBitcast(VT, Res);
55557 }
55558 [[fallthrough]];
55559 case X86ISD::VSRAI:
55560 case X86ISD::VSHL:
55561 case X86ISD::VSRL:
55562 case X86ISD::VSRA:
55563 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
55564 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55565 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
55566 llvm::all_of(Ops, [Op0](SDValue Op) {
55567 return Op0.getOperand(1) == Op.getOperand(1);
55568 })) {
55569 return DAG.getNode(Op0.getOpcode(), DL, VT,
55570 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55571 }
55572 break;
55573 case X86ISD::VPERMI:
55574 case X86ISD::VROTLI:
55575 case X86ISD::VROTRI:
55576 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55577 llvm::all_of(Ops, [Op0](SDValue Op) {
55578 return Op0.getOperand(1) == Op.getOperand(1);
55579 })) {
55580 return DAG.getNode(Op0.getOpcode(), DL, VT,
55581 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55582 }
55583 break;
55584 case ISD::AND:
55585 case ISD::OR:
55586 case ISD::XOR:
55587 case X86ISD::ANDNP:
55588 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55589 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
55590 return DAG.getNode(Op0.getOpcode(), DL, VT,
55591 ConcatSubOperand(VT, Ops, 0),
55592 ConcatSubOperand(VT, Ops, 1));
55593 }
55594 break;
55595 case X86ISD::PCMPEQ:
55596 case X86ISD::PCMPGT:
55597 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256() &&
55598 (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
55599 return DAG.getNode(Op0.getOpcode(), DL, VT,
55600 ConcatSubOperand(VT, Ops, 0),
55601 ConcatSubOperand(VT, Ops, 1));
55602 }
55603 break;
55604 case ISD::CTPOP:
55605 case ISD::CTTZ:
55606 case ISD::CTLZ:
55607 case ISD::CTTZ_ZERO_UNDEF:
55608 case ISD::CTLZ_ZERO_UNDEF:
55609 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55610 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
55611 return DAG.getNode(Op0.getOpcode(), DL, VT,
55612 ConcatSubOperand(VT, Ops, 0));
55613 }
55614 break;
55615 case X86ISD::GF2P8AFFINEQB:
55616 if (!IsSplat &&
55617 (VT.is256BitVector() ||
55618 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55619 llvm::all_of(Ops, [Op0](SDValue Op) {
55620 return Op0.getOperand(2) == Op.getOperand(2);
55621 })) {
55622 return DAG.getNode(Op0.getOpcode(), DL, VT,
55623 ConcatSubOperand(VT, Ops, 0),
55624 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55625 }
55626 break;
55627 case ISD::ADD:
55628 case ISD::SUB:
55629 case ISD::MUL:
55630 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55631 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55632 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
55633 return DAG.getNode(Op0.getOpcode(), DL, VT,
55634 ConcatSubOperand(VT, Ops, 0),
55635 ConcatSubOperand(VT, Ops, 1));
55636 }
55637 break;
55638 // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
55639 // their latency is short, we don't concatenate them here unless doing so
55640 // won't introduce extra VINSERTs.
55641 case ISD::FADD:
55642 case ISD::FSUB:
55643 case ISD::FMUL:
55644 if (!IsSplat && (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1)) &&
55645 (VT.is256BitVector() ||
55646 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
55647 return DAG.getNode(Op0.getOpcode(), DL, VT,
55648 ConcatSubOperand(VT, Ops, 0),
55649 ConcatSubOperand(VT, Ops, 1));
55650 }
55651 break;
55652 case ISD::FDIV:
55653 if (!IsSplat && (VT.is256BitVector() ||
55654 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
55655 return DAG.getNode(Op0.getOpcode(), DL, VT,
55656 ConcatSubOperand(VT, Ops, 0),
55657 ConcatSubOperand(VT, Ops, 1));
55658 }
55659 break;
55660 case X86ISD::HADD:
55661 case X86ISD::HSUB:
55662 case X86ISD::FHADD:
55663 case X86ISD::FHSUB:
55664 if (!IsSplat && VT.is256BitVector() &&
55665 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
55666 return DAG.getNode(Op0.getOpcode(), DL, VT,
55667 ConcatSubOperand(VT, Ops, 0),
55668 ConcatSubOperand(VT, Ops, 1));
55669 }
55670 break;
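// PACKSS/PACKUS operate per 128-bit lane, so packing the concatenated
// (widened) sources yields the same lane layout as concatenating the
// individually packed results.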
55671 case X86ISD::PACKSS:
55672 case X86ISD::PACKUS:
55673 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55674 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
55675 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
55676 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
55677 NumOps * SrcVT.getVectorNumElements());
55678 return DAG.getNode(Op0.getOpcode(), DL, VT,
55679 ConcatSubOperand(SrcVT, Ops, 0),
55680 ConcatSubOperand(SrcVT, Ops, 1));
55681 }
55682 break;
55683 case X86ISD::PALIGNR:
55684 if (!IsSplat &&
55685 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55686 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
55687 llvm::all_of(Ops, [Op0](SDValue Op) {
55688 return Op0.getOperand(2) == Op.getOperand(2);
55689 })) {
55690 return DAG.getNode(Op0.getOpcode(), DL, VT,
55691 ConcatSubOperand(VT, Ops, 0),
55692 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55693 }
55694 break;
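// Fold a concat of constant blends into one AVX512 predicated select:
// concat(blendi(x0,y0,Imm0), blendi(x1,y1,Imm1))
//   -> vselect(bitcast(Imm1:Imm0), concat(y0,y1), concat(x0,x1))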
55695 case X86ISD::BLENDI:
55696 if (NumOps == 2 && VT.is512BitVector() && Subtarget.useBWIRegs()) {
55697 uint64_t Mask0 = Ops[0].getConstantOperandVal(2);
55698 uint64_t Mask1 = Ops[1].getConstantOperandVal(2);
55699 // MVT::v16i16 has repeated blend mask.
55700 if (Op0.getSimpleValueType() == MVT::v16i16) {
55701 Mask0 = (Mask0 << 8) | Mask0;
55702 Mask1 = (Mask1 << 8) | Mask1;
55703 }
55704 uint64_t Mask = (Mask1 << (VT.getVectorNumElements() / 2)) | Mask0;
55705 MVT MaskSVT = MVT::getIntegerVT(VT.getVectorNumElements());
55706 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
55707 SDValue Sel =
55708 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
55709 return DAG.getSelect(DL, VT, Sel, ConcatSubOperand(VT, Ops, 1),
55710 ConcatSubOperand(VT, Ops, 0));
55711 }
55712 break;
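// Concatenate vselects whose conditions are i1 masks (AVX512) by also
// concatenating the masks, provided the wider mask type is legal.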
55713 case ISD::VSELECT:
55714 if (!IsSplat && Subtarget.hasAVX512() &&
55715 (VT.is256BitVector() ||
55716 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55717 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
55718 EVT SelVT = Ops[0].getOperand(0).getValueType();
55719 if (SelVT.getVectorElementType() == MVT::i1) {
55720 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
55721 NumOps * SelVT.getVectorNumElements());
55722 if (TLI.isTypeLegal(SelVT))
55723 return DAG.getNode(Op0.getOpcode(), DL, VT,
55724 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
55725 ConcatSubOperand(VT, Ops, 1),
55726 ConcatSubOperand(VT, Ops, 2));
55727 }
55728 }
55729 [[fallthrough]];
55730 case X86ISD::BLENDV:
55731 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
55732 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
55733 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
55734 EVT SelVT = Ops[0].getOperand(0).getValueType();
55735 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
55736 if (TLI.isTypeLegal(SelVT))
55737 return DAG.getNode(Op0.getOpcode(), DL, VT,
55738 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
55739 ConcatSubOperand(VT, Ops, 1),
55740 ConcatSubOperand(VT, Ops, 2));
55741 }
55742 break;
55743 }
55744 }
55745
55746 // Fold subvector loads into one.
55747 // If needed, look through bitcasts to get to the load.
55748 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
55749 unsigned Fast;
55750 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
55751 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
55752 *FirstLd->getMemOperand(), &Fast) &&
55753 Fast) {
55754 if (SDValue Ld =
55755 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
55756 return Ld;
55757 }
55758 }
55759
55760 // Attempt to fold target constant loads.
55761 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
55762 SmallVector<APInt> EltBits;
55763 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
55764 for (unsigned I = 0; I != NumOps; ++I) {
55765 APInt OpUndefElts;
55766 SmallVector<APInt> OpEltBits;
55767 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
55768 OpEltBits, /*AllowWholeUndefs*/ true,
55769 /*AllowPartialUndefs*/ false))
55770 break;
55771 EltBits.append(OpEltBits);
55772 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
55773 }
55774 if (EltBits.size() == VT.getVectorNumElements()) {
55775 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
55776 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
55777 SDValue CV = DAG.getConstantPool(C, PVT);
55778 MachineFunction &MF = DAG.getMachineFunction();
55779 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
55780 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
55781 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
55782 DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
55783 return Ld;
55784 }
55785 }
55786
55787 // If this simple subvector or scalar/subvector broadcast_load is inserted
55788 // into both halves, use a larger broadcast_load. Update other uses to use
55789 // an extracted subvector.
55790 if (IsSplat &&
55791 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
55792 if (ISD::isNormalLoad(Op0.getNode()) ||
55793 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
55794 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
55795 auto *Mem = cast<MemSDNode>(Op0);
55796 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
55797 ? X86ISD::VBROADCAST_LOAD
55798 : X86ISD::SUBV_BROADCAST_LOAD;
55799 if (SDValue BcastLd =
55800 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
55801 SDValue BcastSrc =
55802 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
55803 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
55804 return BcastLd;
55805 }
55806 }
55807 }
55808
55809 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
55810 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
55811 Subtarget.useAVX512Regs()) {
55812 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
55813 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
55814 Res = DAG.getBitcast(ShuffleVT, Res);
55815 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
55816 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
55817 return DAG.getBitcast(VT, Res);
55818 }
55819
55820 return SDValue();
55821}
55822
55823static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
55824 TargetLowering::DAGCombinerInfo &DCI,
55825 const X86Subtarget &Subtarget) {
55826 EVT VT = N->getValueType(0);
55827 EVT SrcVT = N->getOperand(0).getValueType();
55828 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55829 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
55830
55831 if (VT.getVectorElementType() == MVT::i1) {
55832 // Attempt to constant fold.
55833 unsigned SubSizeInBits = SrcVT.getSizeInBits();
55834 APInt Constant = APInt::getZero(VT.getSizeInBits());
55835 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
55836 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
55837 if (!C) break;
55838 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
55839 if (I == (E - 1)) {
55840 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
55841 if (TLI.isTypeLegal(IntVT))
55842 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
55843 }
55844 }
55845
55846 // Don't do anything else for i1 vectors.
55847 return SDValue();
55848 }
55849
55850 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
55851 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
55852 DCI, Subtarget))
55853 return R;
55854 }
55855
55856 return SDValue();
55857}
55858
55859static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
55860 TargetLowering::DAGCombinerInfo &DCI,
55861 const X86Subtarget &Subtarget) {
55862 if (DCI.isBeforeLegalizeOps())
55863 return SDValue();
55864
55865 MVT OpVT = N->getSimpleValueType(0);
55866
55867 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
55868
55869 SDLoc dl(N);
55870 SDValue Vec = N->getOperand(0);
55871 SDValue SubVec = N->getOperand(1);
55872
55873 uint64_t IdxVal = N->getConstantOperandVal(2);
55874 MVT SubVecVT = SubVec.getSimpleValueType();
55875
55876 if (Vec.isUndef() && SubVec.isUndef())
55877 return DAG.getUNDEF(OpVT);
55878
55879 // Inserting undefs/zeros into zeros/undefs is a zero vector.
55880 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
55881 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
55882 return getZeroVector(OpVT, Subtarget, DAG, dl);
55883
55884 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
55885 // If we're inserting into a zero vector and then into a larger zero vector,
55886 // just insert into the larger zero vector directly.
55887 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
55888 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
55889 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
55890 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55891 getZeroVector(OpVT, Subtarget, DAG, dl),
55892 SubVec.getOperand(1),
55893 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
55894 }
55895
55896 // If we're inserting into a zero vector and our input was extracted from an
55897 // insert into a zero vector of the same type, and the extraction was at
55898 // least as large as the original insertion, just insert the original
55899 // subvector into a zero vector.
55900 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
55901 isNullConstant(SubVec.getOperand(1)) &&
55902 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
55903 SDValue Ins = SubVec.getOperand(0);
55904 if (isNullConstant(Ins.getOperand(2)) &&
55905 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
55906 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
55907 SubVecVT.getFixedSizeInBits())
55908 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55909 getZeroVector(OpVT, Subtarget, DAG, dl),
55910 Ins.getOperand(1), N->getOperand(2));
55911 }
55912 }
55913
55914 // Stop here if this is an i1 vector.
55915 if (IsI1Vector)
55916 return SDValue();
55917
55918 // Eliminate an intermediate vector widening:
55919 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
55920 // insert_subvector X, Y, Idx
55921 // TODO: This is a more general version of a DAGCombiner fold, can we move it
55922 // there?
55923 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
55924 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
55925 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
55926 SubVec.getOperand(1), N->getOperand(2));
55927
55928 // If this is an insert of an extract, combine to a shuffle. Don't do this
55929 // if the insert or extract can be represented with a subregister operation.
55930 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55931 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
55932 (IdxVal != 0 ||
55933 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
55934 int ExtIdxVal = SubVec.getConstantOperandVal(1);
55935 if (ExtIdxVal != 0) {
55936 int VecNumElts = OpVT.getVectorNumElements();
55937 int SubVecNumElts = SubVecVT.getVectorNumElements();
55938 SmallVector<int, 64> Mask(VecNumElts);
55939 // First create an identity shuffle mask.
55940 for (int i = 0; i != VecNumElts; ++i)
55941 Mask[i] = i;
55942 // Now insert the extracted portion.
55943 for (int i = 0; i != SubVecNumElts; ++i)
55944 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
55945
55946 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
55947 }
55948 }
55949
55950 // Match concat_vector style patterns.
55951 SmallVector<SDValue, 2> SubVectorOps;
55952 if (collectConcatOps(N, SubVectorOps, DAG)) {
55953 if (SDValue Fold =
55954 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
55955 return Fold;
55956
55957 // If we're inserting all zeros into the upper half, change this to
55958 // a concat with zero. We will match this to a move
55959 // with implicit upper bit zeroing during isel.
55960 // We do this here because we don't want combineConcatVectorOps to
55961 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
55962 if (SubVectorOps.size() == 2 &&
55963 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
55964 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55965 getZeroVector(OpVT, Subtarget, DAG, dl),
55966 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
55967
55968 // Attempt to recursively combine to a shuffle.
55969 if (all_of(SubVectorOps, [](SDValue SubOp) {
55970 return isTargetShuffle(SubOp.getOpcode());
55971 })) {
55972 SDValue Op(N, 0);
55973 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55974 return Res;
55975 }
55976 }
55977
55978 // If this is a broadcast insert into an upper undef, use a larger broadcast.
55979 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
55980 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
55981
55982 // If this is a broadcast load inserted into an upper undef, use a larger
55983 // broadcast load.
55984 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
55985 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
55986 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
55987 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
55988 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
55989 SDValue BcastLd =
55990 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
55991 MemIntr->getMemoryVT(),
55992 MemIntr->getMemOperand());
55993 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
55994 return BcastLd;
55995 }
55996
55997 // If we're splatting the lower half subvector of a full vector load into the
55998 // upper half, attempt to create a subvector broadcast.
55999 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
56000 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
56001 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
56002 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
56003 if (VecLd && SubLd &&
56004 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
56005 SubVec.getValueSizeInBits() / 8, 0))
56006 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
56007 SubLd, 0, DAG);
56008 }
56009
56010 return SDValue();
56011}
56012
56013/// If we are extracting a subvector of a vector select and the select condition
56014/// is composed of concatenated vectors, try to narrow the select width. This
56015/// is a common pattern for AVX1 integer code because 256-bit selects may be
56016/// legal, but there is almost no integer math/logic available for 256-bit.
56017/// This function should only be called with legal types (otherwise, the calls
56018/// to get simple value types will assert).
56019static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
56020 SelectionDAG &DAG) {
56021 SDValue Sel = Ext->getOperand(0);
56022 if (Sel.getOpcode() != ISD::VSELECT ||
56023 !isFreeToSplitVector(Sel.getOperand(0).getNode(), DAG))
56024 return SDValue();
56025
56026 // Note: We assume simple value types because this should only be called with
56027 // legal operations/types.
56028 // TODO: This can be extended to handle extraction to 256-bits.
56029 MVT VT = Ext->getSimpleValueType(0);
56030 if (!VT.is128BitVector())
56031 return SDValue();
56032
56033 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
56034 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
56035 return SDValue();
56036
56037 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
56038 MVT SelVT = Sel.getSimpleValueType();
56039 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
56040 "Unexpected vector type with legal operations");
56041
56042 unsigned SelElts = SelVT.getVectorNumElements();
56043 unsigned CastedElts = WideVT.getVectorNumElements();
56044 unsigned ExtIdx = Ext->getConstantOperandVal(1);
56045 if (SelElts % CastedElts == 0) {
56046 // The select has the same or more (narrower) elements than the extract
56047 // operand. The extraction index gets scaled by that factor.
56048 ExtIdx *= (SelElts / CastedElts);
56049 } else if (CastedElts % SelElts == 0) {
56050 // The select has fewer (wider) elements than the extract operand. Make sure
56051 // that the extraction index can be divided evenly.
56052 unsigned IndexDivisor = CastedElts / SelElts;
56053 if (ExtIdx % IndexDivisor != 0)
56054 return SDValue();
56055 ExtIdx /= IndexDivisor;
56056 } else {
56057 llvm_unreachable("Element count of simple vector types are not divisible?");
56058 }
56059
56060 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
56061 unsigned NarrowElts = SelElts / NarrowingFactor;
56062 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
56063 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
56064 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
56065 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
56066 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
56067 return DAG.getBitcast(VT, NarrowSel);
56068}
56069
56070static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56071 TargetLowering::DAGCombinerInfo &DCI,
56072 const X86Subtarget &Subtarget) {
56073 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
56074 // eventually get combined/lowered into ANDNP) with a concatenated operand,
56075 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
56076 // We let generic combining take over from there to simplify the
56077 // insert/extract and 'not'.
56078 // This pattern emerges during AVX1 legalization. We handle it before lowering
56079 // to avoid complications like splitting constant vector loads.
56080
56081 // Capture the original wide type in the likely case that we need to bitcast
56082 // back to this type.
56083 if (!N->getValueType(0).isSimple())
56084 return SDValue();
56085
56086 MVT VT = N->getSimpleValueType(0);
56087 SDValue InVec = N->getOperand(0);
56088 unsigned IdxVal = N->getConstantOperandVal(1);
56089 SDValue InVecBC = peekThroughBitcasts(InVec);
56090 EVT InVecVT = InVec.getValueType();
56091 unsigned SizeInBits = VT.getSizeInBits();
56092 unsigned InSizeInBits = InVecVT.getSizeInBits();
56093 unsigned NumSubElts = VT.getVectorNumElements();
56094 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56095 SDLoc DL(N);
56096
56097 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
56098 TLI.isTypeLegal(InVecVT) &&
56099 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
56100 auto isConcatenatedNot = [](SDValue V) {
56101 V = peekThroughBitcasts(V);
56102 if (!isBitwiseNot(V))
56103 return false;
56104 SDValue NotOp = V->getOperand(0);
56105 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
56106 };
56107 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
56108 isConcatenatedNot(InVecBC.getOperand(1))) {
56109 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
56110 SDValue Concat = splitVectorIntBinary(InVecBC, DAG, SDLoc(InVecBC));
56111 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
56112 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
56113 }
56114 }
56115
56116 if (DCI.isBeforeLegalizeOps())
56117 return SDValue();
56118
56119 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
56120 return V;
56121
56122 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
56123 return getZeroVector(VT, Subtarget, DAG, DL);
56124
56125 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
56126 if (VT.getScalarType() == MVT::i1)
56127 return DAG.getConstant(1, DL, VT);
56128 return getOnesVector(VT, DAG, DL);
56129 }
56130
56131 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
56132 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
56133
56134 // If we are extracting from an insert into a larger vector, replace with a
56135 // smaller insert if we don't access less than the original subvector. Don't
56136 // do this for i1 vectors.
56137 // TODO: Relax the matching indices requirement?
56138 if (VT.getVectorElementType() != MVT::i1 &&
56139 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
56140 IdxVal == InVec.getConstantOperandVal(2) &&
56141 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
56142 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
56143 InVec.getOperand(0), N->getOperand(1));
56144 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
56145 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
56146 InVec.getOperand(1),
56147 DAG.getVectorIdxConstant(NewIdxVal, DL));
56148 }
56149
56150 // If we're extracting an upper subvector from a broadcast, just extract the
56151 // lowest subvector instead, which should allow SimplifyDemandedVectorElts
56152 // to do more simplifications.
56153 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
56154 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56155 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
56156 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
56157
56158 // If we're extracting a broadcasted subvector, just use the lowest subvector.
56159 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
56160 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
56161 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
56162
56163 // Attempt to extract from the source of a shuffle vector.
56164 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
56165 SmallVector<int, 32> ShuffleMask;
56166 SmallVector<int, 32> ScaledMask;
56167 SmallVector<SDValue, 2> ShuffleInputs;
56168 unsigned NumSubVecs = InSizeInBits / SizeInBits;
56169 // Decode the shuffle mask and scale it so it's shuffling subvectors.
56170 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
56171 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
56172 unsigned SubVecIdx = IdxVal / NumSubElts;
56173 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
56174 return DAG.getUNDEF(VT);
56175 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
56176 return getZeroVector(VT, Subtarget, DAG, DL);
56177 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
56178 if (Src.getValueSizeInBits() == InSizeInBits) {
56179 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
56180 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
56181 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
56182 DL, SizeInBits);
56183 }
56184 }
56185 }
56186
56187 auto IsExtractFree = [](SDValue V) {
56188 V = peekThroughBitcasts(V);
56189 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
56190 return true;
56191 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
56192 return true;
56193 return V.isUndef();
56194 };
56195
56196 // If we're extracting the lowest subvector and we're the only user,
56197 // we may be able to perform this with a smaller vector width.
56198 unsigned InOpcode = InVec.getOpcode();
56199 if (InVec.hasOneUse()) {
56200 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
56201 // v2f64 CVTDQ2PD(v4i32).
56202 if (InOpcode == ISD::SINT_TO_FP &&
56203 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56204 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
56205 }
56206 // v2f64 CVTUDQ2PD(v4i32).
56207 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
56208 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56209 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
56210 }
56211 // v2f64 CVTPS2PD(v4f32).
56212 if (InOpcode == ISD::FP_EXTEND &&
56213 InVec.getOperand(0).getValueType() == MVT::v4f32) {
56214 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
56215 }
56216 }
56217 // v4i32 CVTPS2DQ(v4f32).
56218 if (InOpcode == ISD::FP_TO_SINT && VT == MVT::v4i32) {
56219 SDValue Src = InVec.getOperand(0);
56220 if (Src.getValueType().getScalarType() == MVT::f32)
56221 return DAG.getNode(InOpcode, DL, VT,
56222 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
56223 }
56224 if (IdxVal == 0 &&
56225 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
56226 (SizeInBits == 128 || SizeInBits == 256) &&
56227 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
56228 SDValue Ext = InVec.getOperand(0);
56229 if (Ext.getValueSizeInBits() > SizeInBits)
56230 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
56231 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
56232 return DAG.getNode(ExtOp, DL, VT, Ext);
56233 }
56234 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
56235 InVec.getOperand(0).getValueType().is256BitVector() &&
56236 InVec.getOperand(1).getValueType().is256BitVector() &&
56237 InVec.getOperand(2).getValueType().is256BitVector()) {
56238 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
56239 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
56240 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
56241 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
56242 }
56243 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
56244 (SizeInBits == 128 || SizeInBits == 256)) {
56245 SDValue InVecSrc = InVec.getOperand(0);
56246 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
56247 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
56248 return DAG.getNode(InOpcode, DL, VT, Ext);
56249 }
56250 if ((InOpcode == X86ISD::CMPP || InOpcode == X86ISD::PCMPEQ ||
56251 InOpcode == X86ISD::PCMPGT) &&
56252 (IsExtractFree(InVec.getOperand(0)) ||
56253 IsExtractFree(InVec.getOperand(1))) &&
56254 SizeInBits == 128) {
56255 SDValue Ext0 =
56256 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56257 SDValue Ext1 =
56258 extractSubVector(InVec.getOperand(1), IdxVal, DAG, DL, SizeInBits);
56259 if (InOpcode == X86ISD::CMPP)
56260 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, InVec.getOperand(2));
56261 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1);
56262 }
56263 if (InOpcode == X86ISD::MOVDDUP &&
56264 (SizeInBits == 128 || SizeInBits == 256)) {
56265 SDValue Ext0 =
56266 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56267 return DAG.getNode(InOpcode, DL, VT, Ext0);
56268 }
56269 }
56270
56271 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
56272 // as this is very likely to fold into a shuffle/truncation.
56273 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
56274 InVecVT.getScalarSizeInBits() == 64 &&
56275 InVec.getConstantOperandAPInt(1) == 32) {
56276 SDValue Ext =
56277 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56278 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
56279 }
56280
56281 return SDValue();
56282}
56283
56284static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
56285 EVT VT = N->getValueType(0);
56286 SDValue Src = N->getOperand(0);
56287 SDLoc DL(N);
56288
56289 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
56290 // This occurs frequently in our masked scalar intrinsic code and our
56291 // floating point select lowering with AVX512.
56292 // TODO: SimplifyDemandedBits instead?
56293 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
56294 isOneConstant(Src.getOperand(1)))
56295 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
56296
56297 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
56298 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56299 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
56300 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
56301 isNullConstant(Src.getOperand(1)))
56302 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
56303 Src.getOperand(1));
56304
56305 // Reduce v2i64 to v4i32 if we don't need the upper bits or are known zero.
56306 // TODO: Move to DAGCombine/SimplifyDemandedBits?
56307 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
56308 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
56309 if (Op.getValueType() != MVT::i64)
56310 return SDValue();
56311 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
56312 if (Op.getOpcode() == Opc &&
56313 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
56314 return Op.getOperand(0);
56315 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
56316 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
56317 if (Ld->getExtensionType() == Ext &&
56318 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
56319 return Op;
56320 if (IsZeroExt) {
56321 KnownBits Known = DAG.computeKnownBits(Op);
56322 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
56323 return Op;
56324 }
56325 return SDValue();
56326 };
56327
56328 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
56329 return DAG.getBitcast(
56330 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
56331 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
56332
56333 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
56334 return DAG.getBitcast(
56335 VT,
56336 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
56337 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
56338 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
56339 }
56340
56341 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
56342 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
56343 Src.getOperand(0).getValueType() == MVT::x86mmx)
56344 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
56345
56346 // See if we're broadcasting the scalar value, in which case just reuse that.
56347 // Ensure the same SDValue from the SDNode use is being used.
56348 if (VT.getScalarType() == Src.getValueType())
56349 for (SDNode *User : Src->uses())
56350 if (User->getOpcode() == X86ISD::VBROADCAST &&
56351 Src == User->getOperand(0)) {
56352 unsigned SizeInBits = VT.getFixedSizeInBits();
56353 unsigned BroadcastSizeInBits =
56354 User->getValueSizeInBits(0).getFixedValue();
56355 if (BroadcastSizeInBits == SizeInBits)
56356 return SDValue(User, 0);
56357 if (BroadcastSizeInBits > SizeInBits)
56358 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
56359 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
56360 // coverage.
56361 }
56362
56363 return SDValue();
56364}
56365
56366// Simplify PMULDQ and PMULUDQ operations.
56367static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
56368 TargetLowering::DAGCombinerInfo &DCI,
56369 const X86Subtarget &Subtarget) {
56370 SDValue LHS = N->getOperand(0);
56371 SDValue RHS = N->getOperand(1);
56372
56373 // Canonicalize constant to RHS.
56374 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
56375 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
56376 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
56377
56378 // Multiply by zero.
56379 // Don't return RHS as it may contain UNDEFs.
56380 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
56381 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
56382
56383 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
56384 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56385 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
56386 return SDValue(N, 0);
56387
56388 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
56389 // convert it to any_extend_invec, due to the LegalOperations check, do the
56390 // conversion directly to a vector shuffle manually. This exposes combine
56391 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
56392 // combineX86ShufflesRecursively on SSE4.1 targets.
56393 // FIXME: This is basically a hack around several other issues related to
56394 // ANY_EXTEND_VECTOR_INREG.
56395 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
56396 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
56397 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
56398 LHS.getOperand(0).getValueType() == MVT::v4i32) {
56399 SDLoc dl(N);
56400 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
56401 LHS.getOperand(0), { 0, -1, 1, -1 });
56402 LHS = DAG.getBitcast(MVT::v2i64, LHS);
56403 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
56404 }
56405 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
56406 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
56407 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
56408 RHS.getOperand(0).getValueType() == MVT::v4i32) {
56409 SDLoc dl(N);
56410 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
56411 RHS.getOperand(0), { 0, -1, 1, -1 });
56412 RHS = DAG.getBitcast(MVT::v2i64, RHS);
56413 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
56414 }
56415
56416 return SDValue();
56417}
56418
56419// Simplify VPMADDUBSW/VPMADDWD operations.
56420static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
56421 TargetLowering::DAGCombinerInfo &DCI) {
56422 EVT VT = N->getValueType(0);
56423 SDValue LHS = N->getOperand(0);
56424 SDValue RHS = N->getOperand(1);
56425
56426 // Multiply by zero.
56427 // Don't return LHS/RHS as it may contain UNDEFs.
56428 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
56429 ISD::isBuildVectorAllZeros(RHS.getNode()))
56430 return DAG.getConstant(0, SDLoc(N), VT);
56431
56432 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56433 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
56434 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
56435 return SDValue(N, 0);
56436
56437 return SDValue();
56438}
56439
56440static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
56441 TargetLowering::DAGCombinerInfo &DCI,
56442 const X86Subtarget &Subtarget) {
56443 EVT VT = N->getValueType(0);
56444 SDValue In = N->getOperand(0);
56445 unsigned Opcode = N->getOpcode();
56446 unsigned InOpcode = In.getOpcode();
56447 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56448 SDLoc DL(N);
56449
56450 // Try to merge vector loads and extend_inreg to an extload.
56451 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
56452 In.hasOneUse()) {
56453 auto *Ld = cast<LoadSDNode>(In);
56454 if (Ld->isSimple()) {
56455 MVT SVT = In.getSimpleValueType().getVectorElementType();
56456 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
56457 ? ISD::SEXTLOAD
56458 : ISD::ZEXTLOAD;
56459 EVT MemVT = VT.changeVectorElementType(SVT);
56460 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
56461 SDValue Load = DAG.getExtLoad(
56462 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
56463 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
56464 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
56465 return Load;
56466 }
56467 }
56468 }
56469
56470 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
56471 if (Opcode == InOpcode)
56472 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
56473
56474 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
56475 // -> EXTEND_VECTOR_INREG(X).
56476 // TODO: Handle non-zero subvector indices.
56477 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
56478 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
56479 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
56480 In.getValueSizeInBits())
56481 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
56482
56483 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
56484 // TODO: Move to DAGCombine?
56485 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
56486 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
56487 In.getValueSizeInBits() == VT.getSizeInBits()) {
56488 unsigned NumElts = VT.getVectorNumElements();
56489 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
56490 EVT EltVT = In.getOperand(0).getValueType();
56491 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
56492 for (unsigned I = 0; I != NumElts; ++I)
56493 Elts[I * Scale] = In.getOperand(I);
56494 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
56495 }
56496
56497 // Attempt to combine as a shuffle on SSE41+ targets.
56498 if (Subtarget.hasSSE41()) {
56499 SDValue Op(N, 0);
56500 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
56501 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
56502 return Res;
56503 }
56504
56505 return SDValue();
56506}
56507
56508static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
56509 TargetLowering::DAGCombinerInfo &DCI) {
56510 EVT VT = N->getValueType(0);
56511
56512 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
56513 return DAG.getConstant(0, SDLoc(N), VT);
56514
56515 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56516 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
56517 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
56518 return SDValue(N, 0);
56519
56520 return SDValue();
56521}
56522
56523// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
56524// Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
56525// extra instructions between the conversions due to going to scalar and back.
56526static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
56527 const X86Subtarget &Subtarget) {
56528 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
56529 return SDValue();
56530
56531 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
56532 return SDValue();
56533
56534 if (N->getValueType(0) != MVT::f32 ||
56535 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
56536 return SDValue();
56537
56538 SDLoc dl(N);
56539 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
56540 N->getOperand(0).getOperand(0));
56541 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
56542 DAG.getTargetConstant(4, dl, MVT::i32));
56543 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
56544 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
56545 DAG.getIntPtrConstant(0, dl));
56546}
56547
56548static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
56549 const X86Subtarget &Subtarget) {
56550 EVT VT = N->getValueType(0);
56551 bool IsStrict = N->isStrictFPOpcode();
56552 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
56553 EVT SrcVT = Src.getValueType();
56554
56555 SDLoc dl(N);
56556 if (SrcVT.getScalarType() == MVT::bf16) {
56557 if (!IsStrict && Src.getOpcode() == ISD::FP_ROUND &&
56558 Src.getOperand(0).getValueType() == VT)
56559 return Src.getOperand(0);
56560
56561 if (!SrcVT.isVector())
56562 return SDValue();
56563
56564 assert(!IsStrict && "Strict FP doesn't support BF16");
56565 if (VT.getVectorElementType() == MVT::f64) {
56566 MVT TmpVT = VT.getSimpleVT().changeVectorElementType(MVT::f32);
56567 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
56568 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
56569 }
56570 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
56571 MVT NVT = SrcVT.getSimpleVT().changeVectorElementType(MVT::i32);
56572 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
56573 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
56574 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
56575 return DAG.getBitcast(VT, Src);
56576 }
56577
56578 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
56579 return SDValue();
56580
56581 if (Subtarget.hasFP16())
56582 return SDValue();
56583
56584 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
56585 return SDValue();
56586
56587 if (VT.getVectorElementType() != MVT::f32 &&
56588 VT.getVectorElementType() != MVT::f64)
56589 return SDValue();
56590
56591 unsigned NumElts = VT.getVectorNumElements();
56592 if (NumElts == 1 || !isPowerOf2_32(NumElts))
56593 return SDValue();
56594
56595 // Convert the input to vXi16.
56596 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
56597 Src = DAG.getBitcast(IntVT, Src);
56598
56599 // Widen to at least 8 input elements.
56600 if (NumElts < 8) {
56601 unsigned NumConcats = 8 / NumElts;
56602 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
56603 : DAG.getConstant(0, dl, IntVT);
56604 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
56605 Ops[0] = Src;
56606 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
56607 }
56608
56609 // Destination is vXf32 with at least 4 elements.
56610 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
56611 std::max(4U, NumElts));
56612 SDValue Cvt, Chain;
56613 if (IsStrict) {
56614 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
56615 {N->getOperand(0), Src});
56616 Chain = Cvt.getValue(1);
56617 } else {
56618 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
56619 }
56620
56621 if (NumElts < 4) {
56622 assert(NumElts == 2 && "Unexpected size");
56623 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
56624 DAG.getIntPtrConstant(0, dl));
56625 }
56626
56627 if (IsStrict) {
56628 // Extend to the original VT if necessary.
56629 if (Cvt.getValueType() != VT) {
56630 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
56631 {Chain, Cvt});
56632 Chain = Cvt.getValue(1);
56633 }
56634 return DAG.getMergeValues({Cvt, Chain}, dl);
56635 }
56636
56637 // Extend to the original VT if necessary.
56638 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
56639}
56640
56641// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
56642// from. Limit this to cases where the loads have the same input chain and the
56643// output chains are unused. This avoids any memory ordering issues.
56644static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
56645 TargetLowering::DAGCombinerInfo &DCI) {
56646 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
56647 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
56648 "Unknown broadcast load type");
56649
56650 // Only do this if the chain result is unused.
56651 if (N->hasAnyUseOfValue(1))
56652 return SDValue();
56653
56654 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
56655
56656 SDValue Ptr = MemIntrin->getBasePtr();
56657 SDValue Chain = MemIntrin->getChain();
56658 EVT VT = N->getSimpleValueType(0);
56659 EVT MemVT = MemIntrin->getMemoryVT();
56660
56661 // Look at other users of our base pointer and try to find a wider broadcast.
56662 // The input chain and the size of the memory VT must match.
56663 for (SDNode *User : Ptr->uses())
56664 if (User != N && User->getOpcode() == N->getOpcode() &&
56665 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
56666 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
56667 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
56668 MemVT.getSizeInBits() &&
56669 !User->hasAnyUseOfValue(1) &&
56670 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
56671 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
56672 VT.getSizeInBits());
56673 Extract = DAG.getBitcast(VT, Extract);
56674 return DCI.CombineTo(N, Extract, SDValue(User, 1));
56675 }
56676
56677 return SDValue();
56678}
56679
56680static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
56681 const X86Subtarget &Subtarget) {
56682 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
56683 return SDValue();
56684
56685 bool IsStrict = N->isStrictFPOpcode();
56686 EVT VT = N->getValueType(0);
56687 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
56688 EVT SrcVT = Src.getValueType();
56689
56690 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
56691 SrcVT.getVectorElementType() != MVT::f32)
56692 return SDValue();
56693
56694 SDLoc dl(N);
56695
56696 SDValue Cvt, Chain;
56697 unsigned NumElts = VT.getVectorNumElements();
56698 if (Subtarget.hasFP16()) {
56699 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
56700 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
56701 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
56702 SDValue Cvt0, Cvt1;
56703 SDValue Op0 = Src.getOperand(0);
56704 SDValue Op1 = Src.getOperand(1);
56705 bool IsOp0Strict = Op0->isStrictFPOpcode();
56706 if (Op0.getOpcode() != Op1.getOpcode() ||
56707 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
56708 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
56709 return SDValue();
56710 }
56711 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
56712 if (IsStrict) {
56713 assert(IsOp0Strict && "Op0 must be strict node");
56714 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
56715 ? X86ISD::STRICT_CVTSI2P
56716 : X86ISD::STRICT_CVTUI2P;
56717 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
56718 {Op0.getOperand(0), Op0.getOperand(1)});
56719 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
56720 {Op1.getOperand(0), Op1.getOperand(1)});
56721 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
56722 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
56723 }
56724 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
56725 : X86ISD::CVTUI2P;
56726 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
56727 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
56728 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
56729 }
56730 return SDValue();
56731 }
56732
56733 if (NumElts == 1 || !isPowerOf2_32(NumElts))
56734 return SDValue();
56735
56736 // Widen to at least 4 input elements.
56737 if (NumElts < 4)
56738 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
56739 DAG.getConstantFP(0.0, dl, SrcVT));
56740
56741 // Destination is v8i16 with at least 8 elements.
56742 EVT CvtVT =
56743 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
56744 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
56745 if (IsStrict) {
56746 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
56747 {N->getOperand(0), Src, Rnd});
56748 Chain = Cvt.getValue(1);
56749 } else {
56750 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
56751 }
56752
56753 // Extract down to real number of elements.
56754 if (NumElts < 8) {
56755 EVT IntVT = VT.changeVectorElementTypeToInteger();
56756 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
56757 DAG.getIntPtrConstant(0, dl));
56758 }
56759
56760 Cvt = DAG.getBitcast(VT, Cvt);
56761
56762 if (IsStrict)
56763 return DAG.getMergeValues({Cvt, Chain}, dl);
56764
56765 return Cvt;
56766}
56767
56768static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
56769 SDValue Src = N->getOperand(0);
56770
56771 // Turn MOVDQ2Q+simple_load into an mmx load.
56772 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
56773 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
56774
56775 if (LN->isSimple()) {
56776 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
56777 LN->getBasePtr(),
56778 LN->getPointerInfo(),
56779 LN->getOriginalAlign(),
56780 LN->getMemOperand()->getFlags());
56781 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
56782 return NewLd;
56783 }
56784 }
56785
56786 return SDValue();
56787}
56788
56789static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
56790 TargetLowering::DAGCombinerInfo &DCI) {
56791 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
56792 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56793 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
56794 return SDValue(N, 0);
56795
56796 return SDValue();
56797}
56798
56799SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
56800 DAGCombinerInfo &DCI) const {
56801 SelectionDAG &DAG = DCI.DAG;
56802 switch (N->getOpcode()) {
56803 // clang-format off
56804 default: break;
56805 case ISD::SCALAR_TO_VECTOR:
56806 return combineScalarToVector(N, DAG);
56807 case ISD::EXTRACT_VECTOR_ELT:
56808 case X86ISD::PEXTRW:
56809 case X86ISD::PEXTRB:
56810 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
56811 case ISD::CONCAT_VECTORS:
56812 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
56813 case ISD::INSERT_SUBVECTOR:
56814 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
56815 case ISD::EXTRACT_SUBVECTOR:
56816 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
56817 case ISD::VSELECT:
56818 case ISD::SELECT:
56819 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
56820 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
56821 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
56822 case X86ISD::CMP: return combineCMP(N, DAG, Subtarget);
56823 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
56824 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
56825 case X86ISD::ADD:
56826 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
56827 case X86ISD::SBB: return combineSBB(N, DAG);
56828 case X86ISD::ADC: return combineADC(N, DAG, DCI);
56829 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
56830 case ISD::SHL: return combineShiftLeft(N, DAG);
56831 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
56832 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
56833 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
56834 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
56835 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
56836 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
56837 case X86ISD::BEXTR:
56838 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
56839 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
56840 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
56841 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
56842 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
56843 case X86ISD::VEXTRACT_STORE:
56844 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
56845 case ISD::SINT_TO_FP:
56846 case ISD::STRICT_SINT_TO_FP:
56847 return combineSIntToFP(N, DAG, DCI, Subtarget);
56848 case ISD::UINT_TO_FP:
56849 case ISD::STRICT_UINT_TO_FP:
56850 return combineUIntToFP(N, DAG, Subtarget);
56851 case ISD::FADD:
56852 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
56853 case X86ISD::VFCMULC:
56854 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
56855 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
56856 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
56857 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
56858 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
56859 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
56860 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
56861 case X86ISD::FXOR:
56862 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
56863 case X86ISD::FMIN:
56864 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
56865 case ISD::FMINNUM:
56866 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
56867 case X86ISD::CVTSI2P:
56868 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
56869 case X86ISD::CVTP2SI:
56870 case X86ISD::CVTP2UI:
56871 case X86ISD::STRICT_CVTTP2SI:
56872 case X86ISD::CVTTP2SI:
56873 case X86ISD::STRICT_CVTTP2UI:
56874 case X86ISD::CVTTP2UI:
56875 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
56876 case X86ISD::STRICT_CVTPH2PS:
56877 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
56878 case X86ISD::BT: return combineBT(N, DAG, DCI);
56879 case ISD::ANY_EXTEND:
56880 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
56881 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
56882 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
56883 case ISD::ANY_EXTEND_VECTOR_INREG:
56884 case ISD::SIGN_EXTEND_VECTOR_INREG:
56885 case ISD::ZERO_EXTEND_VECTOR_INREG:
56886 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
56887 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
56888 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
56889 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
56890 case X86ISD::PACKSS:
56891 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
56892 case X86ISD::HADD:
56893 case X86ISD::HSUB:
56894 case X86ISD::FHADD:
56895 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
56896 case X86ISD::VSHL:
56897 case X86ISD::VSRA:
56898 case X86ISD::VSRL:
56899 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
56900 case X86ISD::VSHLI:
56901 case X86ISD::VSRAI:
56902 case X86ISD::VSRLI:
56903 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
56904 case ISD::INSERT_VECTOR_ELT:
56905 case X86ISD::PINSRB:
56906 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
56907 case X86ISD::SHUFP: // Handle all target specific shuffles
56908 case X86ISD::INSERTPS:
56909 case X86ISD::EXTRQI:
56910 case X86ISD::INSERTQI:
56911 case X86ISD::VALIGN:
56912 case X86ISD::PALIGNR:
56913 case X86ISD::VSHLDQ:
56914 case X86ISD::VSRLDQ:
56915 case X86ISD::BLENDI:
56916 case X86ISD::UNPCKH:
56917 case X86ISD::UNPCKL:
56918 case X86ISD::MOVHLPS:
56919 case X86ISD::MOVLHPS:
56920 case X86ISD::PSHUFB:
56921 case X86ISD::PSHUFD:
56922 case X86ISD::PSHUFHW:
56923 case X86ISD::PSHUFLW:
56924 case X86ISD::MOVSHDUP:
56925 case X86ISD::MOVSLDUP:
56926 case X86ISD::MOVDDUP:
56927 case X86ISD::MOVSS:
56928 case X86ISD::MOVSD:
56929 case X86ISD::MOVSH:
56930 case X86ISD::VBROADCAST:
56931 case X86ISD::VPPERM:
56932 case X86ISD::VPERMI:
56933 case X86ISD::VPERMV:
56934 case X86ISD::VPERMV3:
56935 case X86ISD::VPERMIL2:
56936 case X86ISD::VPERMILPI:
56937 case X86ISD::VPERMILPV:
56938 case X86ISD::VPERM2X128:
56939 case X86ISD::SHUF128:
56940 case X86ISD::VZEXT_MOVL:
56941 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
56942 case X86ISD::FMADD_RND:
56943 case X86ISD::FMSUB:
56944 case X86ISD::STRICT_FMSUB:
56945 case X86ISD::FMSUB_RND:
56946 case X86ISD::FNMADD:
56947 case X86ISD::STRICT_FNMADD:
56948 case X86ISD::FNMADD_RND:
56949 case X86ISD::FNMSUB:
56950 case X86ISD::STRICT_FNMSUB:
56951 case X86ISD::FNMSUB_RND:
56952 case ISD::FMA:
56953 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
56954 case X86ISD::FMADDSUB_RND:
56955 case X86ISD::FMSUBADD_RND:
56956 case X86ISD::FMADDSUB:
56957 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
56958 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
56959 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
56960 case X86ISD::MGATHER:
56961 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
56962 case ISD::MGATHER:
56963 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
56964 case X86ISD::PCMPEQ:
56965 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
56966 case X86ISD::PMULDQ:
56967 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
56968 case X86ISD::VPMADDUBSW:
56969 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
56970 case X86ISD::KSHIFTL:
56971 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
56972 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
56973 case ISD::STRICT_FP_EXTEND:
56974 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
56975 case ISD::STRICT_FP_ROUND:
56976 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
56977 case X86ISD::VBROADCAST_LOAD:
56978 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
56979 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
56980 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
56981 // clang-format on
56982 }
56983
56984 return SDValue();
56985}
56986
56987 bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
56988 return false;
56989}
56990
56991// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
56992 bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
56993 EVT ExtVT) const {
56994 return Subtarget.hasAVX512() || !VT.isVector();
56995}
56996
56997bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
56998 if (!isTypeLegal(VT))
56999 return false;
57000
57001 // There are no vXi8 shifts.
57002 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
57003 return false;
57004
57005 // TODO: Almost no 8-bit ops are desirable because they have no actual
57006 // size/speed advantages vs. 32-bit ops, but they do have a major
57007 // potential disadvantage by causing partial register stalls.
57008 //
57009 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
57010 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
57011 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
57012 // check for a constant operand to the multiply.
57013 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
57014 return false;
57015
57016 // i16 instruction encodings are longer and some i16 instructions are slow,
57017 // so those are not desirable.
57018 if (VT == MVT::i16) {
57019 switch (Opc) {
57020 default:
57021 break;
57022 case ISD::LOAD:
57023 case ISD::SIGN_EXTEND:
57024 case ISD::ZERO_EXTEND:
57025 case ISD::ANY_EXTEND:
57026 case ISD::SHL:
57027 case ISD::SRA:
57028 case ISD::SRL:
57029 case ISD::SUB:
57030 case ISD::ADD:
57031 case ISD::MUL:
57032 case ISD::AND:
57033 case ISD::OR:
57034 case ISD::XOR:
57035 return false;
57036 }
57037 }
57038
57039 // Any legal type not explicitly accounted for above here is desirable.
57040 return true;
57041}
57042
57043 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
57044 SDValue Value, SDValue Addr,
57045 int JTI,
57046 SelectionDAG &DAG) const {
57047 const Module *M = DAG.getMachineFunction().getMMI().getModule();
57048 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
57049 if (IsCFProtectionSupported) {
57050 // In case control-flow branch protection is enabled, we need to add
57051 // notrack prefix to the indirect branch.
57052 // In order to do that we create NT_BRIND SDNode.
57053 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
57054 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Value, dl);
57055 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, JTInfo, Addr);
57056 }
57057
57058 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
57059}
57060
57061 TargetLowering::AndOrSETCCFoldKind
57062 X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
57063 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
57064 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
57065 EVT VT = LogicOp->getValueType(0);
57066 EVT OpVT = SETCC0->getOperand(0).getValueType();
57067 if (!VT.isInteger())
57068 return AndOrSETCCFoldKind::None;
57069
57070 if (VT.isVector())
57071 return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
57072 (isOperationLegal(ISD::ABS, OpVT)
57073 ? AndOrSETCCFoldKind::ABS
57074 : AndOrSETCCFoldKind::None));
57075
57076 // Don't use `NotAnd` as even though `not` is generally shorter code size than
57077 // `add`, `add` can lower to LEA which can save moves / spills. Any case where
57078 // `NotAnd` applies, `AddAnd` does as well.
57079 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
57080 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
57081 return AndOrSETCCFoldKind::AddAnd;
57082 }
57083
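// Decide whether an i16 operation (or an i8 multiply by a constant) should be
// promoted to i32 to avoid longer encodings and partial-register stalls; when
// this returns true, PVT is the type to promote to.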
57084 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
57085 EVT VT = Op.getValueType();
57086 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
57087 isa<ConstantSDNode>(Op.getOperand(1));
57088
57089 // i16 is legal, but undesirable since i16 instruction encodings are longer
57090 // and some i16 instructions are slow.
57091 // 8-bit multiply-by-constant can usually be expanded to something cheaper
57092 // using LEA and/or other ALU ops.
57093 if (VT != MVT::i16 && !Is8BitMulByConstant)
57094 return false;
57095
57096 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
57097 if (!Op.hasOneUse())
57098 return false;
57099 SDNode *User = *Op->use_begin();
57100 if (User->getOpcode() != ISD::STORE)
57101 return false;
57102 auto *Ld = cast<LoadSDNode>(Load);
57103 auto *St = cast<StoreSDNode>(User);
57104 return Ld->getBasePtr() == St->getBasePtr();
57105 };
57106
57107 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
57108 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
57109 return false;
57110 if (!Op.hasOneUse())
57111 return false;
57112 SDNode *User = *Op->use_begin();
57113 if (User->getOpcode() != ISD::ATOMIC_STORE)
57114 return false;
57115 auto *Ld = cast<AtomicSDNode>(Load);
57116 auto *St = cast<AtomicSDNode>(User);
57117 return Ld->getBasePtr() == St->getBasePtr();
57118 };
57119
57120 bool Commute = false;
57121 switch (Op.getOpcode()) {
57122 default: return false;
57123 case ISD::SIGN_EXTEND:
57124 case ISD::ZERO_EXTEND:
57125 case ISD::ANY_EXTEND:
57126 break;
57127 case ISD::SHL:
57128 case ISD::SRA:
57129 case ISD::SRL: {
57130 SDValue N0 = Op.getOperand(0);
57131 // Look out for (store (shl (load), x)).
57132 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
57133 return false;
57134 break;
57135 }
57136 case ISD::ADD:
57137 case ISD::MUL:
57138 case ISD::AND:
57139 case ISD::OR:
57140 case ISD::XOR:
57141 Commute = true;
57142 [[fallthrough]];
57143 case ISD::SUB: {
57144 SDValue N0 = Op.getOperand(0);
57145 SDValue N1 = Op.getOperand(1);
57146 // Avoid disabling potential load folding opportunities.
57147 if (X86::mayFoldLoad(N1, Subtarget) &&
57148 (!Commute || !isa<ConstantSDNode>(N0) ||
57149 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
57150 return false;
57151 if (X86::mayFoldLoad(N0, Subtarget) &&
57152 ((Commute && !isa<ConstantSDNode>(N1)) ||
57153 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
57154 return false;
57155 if (IsFoldableAtomicRMW(N0, Op) ||
57156 (Commute && IsFoldableAtomicRMW(N1, Op)))
57157 return false;
57158 }
57159 }
57160
57161 PVT = MVT::i32;
57162 return true;
57163}
57164
57165//===----------------------------------------------------------------------===//
57166// X86 Inline Assembly Support
57167//===----------------------------------------------------------------------===//
57168
57169// Helper to match a string separated by whitespace.
57170 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
57171 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
57172
57173 for (StringRef Piece : Pieces) {
57174 if (!S.starts_with(Piece)) // Check if the piece matches.
57175 return false;
57176
57177 S = S.substr(Piece.size());
57178 StringRef::size_type Pos = S.find_first_not_of(" \t");
57179 if (Pos == 0) // We matched a prefix.
57180 return false;
57181
57182 S = S.substr(Pos);
57183 }
57184
57185 return S.empty();
57186}
57187
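// Returns true when the inline-asm clobber list names the condition-flag
// registers ("~{cc}", "~{flags}", "~{fpsr}", and "~{dirflag}" for the
// four-entry form); this is checked before rewriting rotate idioms as bswap.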
57188 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
57189
57190 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
57191 if (llvm::is_contained(AsmPieces, "~{cc}") &&
57192 llvm::is_contained(AsmPieces, "~{flags}") &&
57193 llvm::is_contained(AsmPieces, "~{fpsr}")) {
57194
57195 if (AsmPieces.size() == 3)
57196 return true;
57197 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
57198 return true;
57199 }
57200 }
57201 return false;
57202}
57203
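// Recognize hand-written byte-swap inline asm (the single "bswap $0" forms and
// the rorw/rorl rotate sequences) and replace it with llvm.bswap so later
// optimizations can see through it.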
57204 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
57205 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
57206
57207 const std::string &AsmStr = IA->getAsmString();
57208
57209 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
57210 if (!Ty || Ty->getBitWidth() % 16 != 0)
57211 return false;
57212
57213 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
57214 SmallVector<StringRef, 4> AsmPieces;
57215 SplitString(AsmStr, AsmPieces, ";\n");
57216
57217 switch (AsmPieces.size()) {
57218 default: return false;
57219 case 1:
57220 // FIXME: this should verify that we are targeting a 486 or better. If not,
57221 // we will turn this bswap into something that will be lowered to logical
57222 // ops instead of emitting the bswap asm. For now, we don't support 486 or
57223 // lower so don't worry about this.
57224 // bswap $0
57225 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
57226 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
57227 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
57228 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
57229 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
57230 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
57231 // No need to check constraints, nothing other than the equivalent of
57232 // "=r,0" would be valid here.
57233 return IntrinsicLowering::LowerToByteSwap(CI);
57234 }
57235
57236 // rorw $$8, ${0:w} --> llvm.bswap.i16
57237 if (CI->getType()->isIntegerTy(16) &&
57238 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57239 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
57240 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
57241 AsmPieces.clear();
57242 StringRef ConstraintsStr = IA->getConstraintString();
57243 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57244 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57245 if (clobbersFlagRegisters(AsmPieces))
57246 return IntrinsicLowering::LowerToByteSwap(CI);
57247 }
57248 break;
57249 case 3:
57250 if (CI->getType()->isIntegerTy(32) &&
57251 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57252 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
57253 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
57254 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
57255 AsmPieces.clear();
57256 StringRef ConstraintsStr = IA->getConstraintString();
57257 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57258 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57259 if (clobbersFlagRegisters(AsmPieces))
57260 return IntrinsicLowering::LowerToByteSwap(CI);
57261 }
57262
57263 if (CI->getType()->isIntegerTy(64)) {
57264 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
57265 if (Constraints.size() >= 2 &&
57266 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
57267 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
57268 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
57269 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
57270 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
57271 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
57272 return IntrinsicLowering::LowerToByteSwap(CI);
57273 }
57274 }
57275 break;
57276 }
57277 return false;
57278}
57279
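// Map GCC flag-output constraint strings such as "{@ccae}" onto X86 condition
// codes; anything unrecognized yields X86::COND_INVALID.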
57280 static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
57281 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
57282 .Case("{@cca}", X86::COND_A)
57283 .Case("{@ccae}", X86::COND_AE)
57284 .Case("{@ccb}", X86::COND_B)
57285 .Case("{@ccbe}", X86::COND_BE)
57286 .Case("{@ccc}", X86::COND_B)
57287 .Case("{@cce}", X86::COND_E)
57288 .Case("{@ccz}", X86::COND_E)
57289 .Case("{@ccg}", X86::COND_G)
57290 .Case("{@ccge}", X86::COND_GE)
57291 .Case("{@ccl}", X86::COND_L)
57292 .Case("{@ccle}", X86::COND_LE)
57293 .Case("{@ccna}", X86::COND_BE)
57294 .Case("{@ccnae}", X86::COND_B)
57295 .Case("{@ccnb}", X86::COND_AE)
57296 .Case("{@ccnbe}", X86::COND_A)
57297 .Case("{@ccnc}", X86::COND_AE)
57298 .Case("{@ccne}", X86::COND_NE)
57299 .Case("{@ccnz}", X86::COND_NE)
57300 .Case("{@ccng}", X86::COND_LE)
57301 .Case("{@ccnge}", X86::COND_L)
57302 .Case("{@ccnl}", X86::COND_GE)
57303 .Case("{@ccnle}", X86::COND_G)
57304 .Case("{@ccno}", X86::COND_NO)
57305 .Case("{@ccnp}", X86::COND_NP)
57306 .Case("{@ccns}", X86::COND_NS)
57307 .Case("{@cco}", X86::COND_O)
57308 .Case("{@ccp}", X86::COND_P)
57309 .Case("{@ccs}", X86::COND_S)
57310 .Default(X86::COND_INVALID);
57311 return Cond;
57312}
57313
57314/// Given a constraint letter, return the type of constraint for this target.
57315 X86TargetLowering::ConstraintType
57316 X86TargetLowering::getConstraintType(StringRef Constraint) const {
57317 if (Constraint.size() == 1) {
57318 switch (Constraint[0]) {
57319 case 'R':
57320 case 'q':
57321 case 'Q':
57322 case 'f':
57323 case 't':
57324 case 'u':
57325 case 'y':
57326 case 'x':
57327 case 'v':
57328 case 'l':
57329 case 'k': // AVX512 masking registers.
57330 return C_RegisterClass;
57331 case 'a':
57332 case 'b':
57333 case 'c':
57334 case 'd':
57335 case 'S':
57336 case 'D':
57337 case 'A':
57338 return C_Register;
57339 case 'I':
57340 case 'J':
57341 case 'K':
57342 case 'N':
57343 case 'G':
57344 case 'L':
57345 case 'M':
57346 return C_Immediate;
57347 case 'C':
57348 case 'e':
57349 case 'Z':
57350 return C_Other;
57351 default:
57352 break;
57353 }
57354 }
57355 else if (Constraint.size() == 2) {
57356 switch (Constraint[0]) {
57357 default:
57358 break;
57359 case 'W':
57360 if (Constraint[1] != 's')
57361 break;
57362 return C_Other;
57363 case 'Y':
57364 switch (Constraint[1]) {
57365 default:
57366 break;
57367 case 'z':
57368 return C_Register;
57369 case 'i':
57370 case 'm':
57371 case 'k':
57372 case 't':
57373 case '2':
57374 return C_RegisterClass;
57375 }
57376 }
57377 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
57378 return C_Other;
57379 return TargetLowering::getConstraintType(Constraint);
57380}
57381
57382/// Examine constraint type and operand type and determine a weight value.
57383/// This object must already have been set up with the operand type
57384/// and the current alternative constraint selected.
57385 TargetLowering::ConstraintWeight
57386 X86TargetLowering::getSingleConstraintMatchWeight(
57387 AsmOperandInfo &Info, const char *Constraint) const {
57388 ConstraintWeight Wt = CW_Invalid;
57389 Value *CallOperandVal = Info.CallOperandVal;
57390 // If we don't have a value, we can't do a match,
57391 // but allow it at the lowest weight.
57392 if (!CallOperandVal)
57393 return CW_Default;
57394 Type *Ty = CallOperandVal->getType();
57395 // Look at the constraint type.
57396 switch (*Constraint) {
57397 default:
57398 Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
57399 [[fallthrough]];
57400 case 'R':
57401 case 'q':
57402 case 'Q':
57403 case 'a':
57404 case 'b':
57405 case 'c':
57406 case 'd':
57407 case 'S':
57408 case 'D':
57409 case 'A':
57410 if (CallOperandVal->getType()->isIntegerTy())
57411 Wt = CW_SpecificReg;
57412 break;
57413 case 'f':
57414 case 't':
57415 case 'u':
57416 if (Ty->isFloatingPointTy())
57417 Wt = CW_SpecificReg;
57418 break;
57419 case 'y':
57420 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
57421 Wt = CW_SpecificReg;
57422 break;
57423 case 'Y':
57424 if (StringRef(Constraint).size() != 2)
57425 break;
57426 switch (Constraint[1]) {
57427 default:
57428 return CW_Invalid;
57429 // XMM0
57430 case 'z':
57431 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
57432 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
57433 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
57434 return CW_SpecificReg;
57435 return CW_Invalid;
57436 // Conditional OpMask regs (AVX512)
57437 case 'k':
57438 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
57439 return CW_Register;
57440 return CW_Invalid;
57441 // Any MMX reg
57442 case 'm':
57443 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
57444 return Wt;
57445 return CW_Invalid;
57446 // Any SSE reg when ISA >= SSE2, same as 'x'
57447 case 'i':
57448 case 't':
57449 case '2':
57450 if (!Subtarget.hasSSE2())
57451 return CW_Invalid;
57452 break;
57453 }
57454 break;
57455 case 'v':
57456 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
57457 Wt = CW_Register;
57458 [[fallthrough]];
57459 case 'x':
57460 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
57461 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
57462 Wt = CW_Register;
57463 break;
57464 case 'k':
57465 // Enable conditional vector operations using %k<#> registers.
57466 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
57467 Wt = CW_Register;
57468 break;
57469 case 'I':
57470 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
57471 if (C->getZExtValue() <= 31)
57472 Wt = CW_Constant;
57473 break;
57474 case 'J':
57475 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57476 if (C->getZExtValue() <= 63)
57477 Wt = CW_Constant;
57478 break;
57479 case 'K':
57480 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57481 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
57482 Wt = CW_Constant;
57483 break;
57484 case 'L':
57485 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57486 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
57487 Wt = CW_Constant;
57488 break;
57489 case 'M':
57490 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57491 if (C->getZExtValue() <= 3)
57492 Wt = CW_Constant;
57493 break;
57494 case 'N':
57495 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57496 if (C->getZExtValue() <= 0xff)
57497 Wt = CW_Constant;
57498 break;
57499 case 'G':
57500 case 'C':
57501 if (isa<ConstantFP>(CallOperandVal))
57502 Wt = CW_Constant;
57503 break;
57504 case 'e':
57505 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57506 if ((C->getSExtValue() >= -0x80000000LL) &&
57507 (C->getSExtValue() <= 0x7fffffffLL))
57508 Wt = CW_Constant;
57509 break;
57510 case 'Z':
57511 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57512 if (C->getZExtValue() <= 0xffffffff)
57513 Wt = CW_Constant;
57514 break;
57515 }
57516 return Wt;
57517}
57518
57519/// Try to replace an X constraint, which matches anything, with another that
57520/// has more specific requirements based on the type of the corresponding
57521/// operand.
57522 const char *X86TargetLowering::
57523 LowerXConstraint(EVT ConstraintVT) const {
57524 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
57525 // 'f' like normal targets.
57526 if (ConstraintVT.isFloatingPoint()) {
57527 if (Subtarget.hasSSE1())
57528 return "x";
57529 }
57530
57531 return TargetLowering::LowerXConstraint(ConstraintVT);
57532}
57533
57534// Lower @cc targets via setcc.
57535 SDValue X86TargetLowering::LowerAsmOutputForConstraint(
57536 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
57537 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
57538 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
57539 if (Cond == X86::COND_INVALID)
57540 return SDValue();
57541 // Check that return type is valid.
57542 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
57543 OpInfo.ConstraintVT.getSizeInBits() < 8)
57544 report_fatal_error("Glue output operand is of invalid type");
57545
57546 // Get EFLAGS register. Only update chain when copyfrom is glued.
57547 if (Glue.getNode()) {
57548 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
57549 Chain = Glue.getValue(1);
57550 } else
57551 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
57552 // Extract CC code.
57553 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
57554 // Extend to 32-bits
57555 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
57556
57557 return Result;
57558}
57559
57560/// Lower the specified operand into the Ops vector.
57561/// If it is invalid, don't add anything to Ops.
57562 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
57563 StringRef Constraint,
57564 std::vector<SDValue> &Ops,
57565 SelectionDAG &DAG) const {
57566 SDValue Result;
57567 char ConstraintLetter = Constraint[0];
57568 switch (ConstraintLetter) {
57569 default: break;
57570 case 'I':
57571 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57572 if (C->getZExtValue() <= 31) {
57573 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57574 Op.getValueType());
57575 break;
57576 }
57577 }
57578 return;
57579 case 'J':
57580 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57581 if (C->getZExtValue() <= 63) {
57582 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57583 Op.getValueType());
57584 break;
57585 }
57586 }
57587 return;
57588 case 'K':
57589 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57590 if (isInt<8>(C->getSExtValue())) {
57591 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57592 Op.getValueType());
57593 break;
57594 }
57595 }
57596 return;
57597 case 'L':
57598 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57599 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
57600 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
57601 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
57602 Op.getValueType());
57603 break;
57604 }
57605 }
57606 return;
57607 case 'M':
57608 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57609 if (C->getZExtValue() <= 3) {
57610 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57611 Op.getValueType());
57612 break;
57613 }
57614 }
57615 return;
57616 case 'N':
57617 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57618 if (C->getZExtValue() <= 255) {
57619 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57620 Op.getValueType());
57621 break;
57622 }
57623 }
57624 return;
57625 case 'O':
57626 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57627 if (C->getZExtValue() <= 127) {
57628 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57629 Op.getValueType());
57630 break;
57631 }
57632 }
57633 return;
57634 case 'e': {
57635 // 32-bit signed value
57636 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57637 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
57638 C->getSExtValue())) {
57639 // Widen to 64 bits here to get it sign extended.
57640 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
57641 break;
57642 }
57643 // FIXME gcc accepts some relocatable values here too, but only in certain
57644 // memory models; it's complicated.
57645 }
57646 return;
57647 }
57648 case 'W': {
57649 assert(Constraint[1] == 's');
57650 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
57651 // offset.
57652 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
57653 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
57654 BA->getValueType(0)));
57655 } else {
57656 int64_t Offset = 0;
57657 if (Op->getOpcode() == ISD::ADD &&
57658 isa<ConstantSDNode>(Op->getOperand(1))) {
57659 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
57660 Op = Op->getOperand(0);
57661 }
57662 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
57663 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
57664 GA->getValueType(0), Offset));
57665 }
57666 return;
57667 }
57668 case 'Z': {
57669 // 32-bit unsigned value
57670 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57671 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
57672 C->getZExtValue())) {
57673 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57674 Op.getValueType());
57675 break;
57676 }
57677 }
57678 // FIXME gcc accepts some relocatable values here too, but only in certain
57679 // memory models; it's complicated.
57680 return;
57681 }
57682 case 'i': {
57683 // Literal immediates are always ok.
57684 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
57685 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
57686 BooleanContent BCont = getBooleanContents(MVT::i64);
57687 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
57688 : ISD::SIGN_EXTEND;
57689 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
57690 : CST->getSExtValue();
57691 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
57692 break;
57693 }
57694
57695 // In any sort of PIC mode addresses need to be computed at runtime by
57696 // adding in a register or some sort of table lookup. These can't
57697 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
57698 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
57699 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
57700 return;
57701
57702 // If we are in non-pic codegen mode, we allow the address of a global (with
57703 // an optional displacement) to be used with 'i'.
57704 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
57705 // If we require an extra load to get this address, as in PIC mode, we
57706 // can't accept it.
57707 if (isGlobalStubReference(
57708 Subtarget.classifyGlobalReference(GA->getGlobal())))
57709 return;
57710 break;
57711 }
57712 }
57713
57714 if (Result.getNode()) {
57715 Ops.push_back(Result);
57716 return;
57717 }
57718 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
57719}
57720
57721/// Check if \p RC is a general purpose register class.
57722/// I.e., GR* or one of their variant.
57723static bool isGRClass(const TargetRegisterClass &RC) {
57724 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
57725 RC.hasSuperClassEq(&X86::GR16RegClass) ||
57726 RC.hasSuperClassEq(&X86::GR32RegClass) ||
57727 RC.hasSuperClassEq(&X86::GR64RegClass) ||
57728 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
57729}
57730
57731/// Check if \p RC is a vector register class.
57732/// I.e., FR* / VR* or one of their variant.
57733static bool isFRClass(const TargetRegisterClass &RC) {
57734 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
57735 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
57736 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
57737 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
57738 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
57739 RC.hasSuperClassEq(&X86::VR512RegClass);
57740}
57741
57742/// Check if \p RC is a mask register class.
57743/// I.e., VK* or one of their variant.
57744static bool isVKClass(const TargetRegisterClass &RC) {
57745 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
57746 RC.hasSuperClassEq(&X86::VK2RegClass) ||
57747 RC.hasSuperClassEq(&X86::VK4RegClass) ||
57748 RC.hasSuperClassEq(&X86::VK8RegClass) ||
57749 RC.hasSuperClassEq(&X86::VK16RegClass) ||
57750 RC.hasSuperClassEq(&X86::VK32RegClass) ||
57751 RC.hasSuperClassEq(&X86::VK64RegClass);
57752}
57753
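// Resolve an inline-asm register constraint ('r', 'x', 'k', "{xmm0}", ...) plus
// a value type to a concrete register / register-class pair, falling back to
// the generic TargetLowering handling when nothing here matches.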
57754std::pair<unsigned, const TargetRegisterClass *>
57755 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
57756 StringRef Constraint,
57757 MVT VT) const {
57758 // First, see if this is a constraint that directly corresponds to an LLVM
57759 // register class.
57760 if (Constraint.size() == 1) {
57761 // GCC Constraint Letters
57762 switch (Constraint[0]) {
57763 default: break;
57764 // 'A' means [ER]AX + [ER]DX.
57765 case 'A':
57766 if (Subtarget.is64Bit())
57767 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
57768 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
57769 "Expecting 64, 32 or 16 bit subtarget");
57770 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
57771
57772 // TODO: Slight differences here in allocation order and leaving
57773 // RIP in the class. Do they matter any more here than they do
57774 // in the normal allocation?
57775 case 'k':
57776 if (Subtarget.hasAVX512()) {
57777 if (VT == MVT::v1i1 || VT == MVT::i1)
57778 return std::make_pair(0U, &X86::VK1RegClass);
57779 if (VT == MVT::v8i1 || VT == MVT::i8)
57780 return std::make_pair(0U, &X86::VK8RegClass);
57781 if (VT == MVT::v16i1 || VT == MVT::i16)
57782 return std::make_pair(0U, &X86::VK16RegClass);
57783 }
57784 if (Subtarget.hasBWI()) {
57785 if (VT == MVT::v32i1 || VT == MVT::i32)
57786 return std::make_pair(0U, &X86::VK32RegClass);
57787 if (VT == MVT::v64i1 || VT == MVT::i64)
57788 return std::make_pair(0U, &X86::VK64RegClass);
57789 }
57790 break;
57791 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
57792 if (Subtarget.is64Bit()) {
57793 if (VT == MVT::i8 || VT == MVT::i1)
57794 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
57795 if (VT == MVT::i16)
57796 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
57797 if (VT == MVT::i32 || VT == MVT::f32)
57798 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
57799 if (VT != MVT::f80 && !VT.isVector())
57800 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
57801 break;
57802 }
57803 [[fallthrough]];
57804 // 32-bit fallthrough
57805 case 'Q': // Q_REGS
57806 if (VT == MVT::i8 || VT == MVT::i1)
57807 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
57808 if (VT == MVT::i16)
57809 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
57810 if (VT == MVT::i32 || VT == MVT::f32 ||
57811 (!VT.isVector() && !Subtarget.is64Bit()))
57812 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
57813 if (VT != MVT::f80 && !VT.isVector())
57814 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
57815 break;
57816 case 'r': // GENERAL_REGS
57817 case 'l': // INDEX_REGS
57818 if (VT == MVT::i8 || VT == MVT::i1)
57819 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
57820 if (VT == MVT::i16)
57821 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
57822 if (VT == MVT::i32 || VT == MVT::f32 ||
57823 (!VT.isVector() && !Subtarget.is64Bit()))
57824 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
57825 if (VT != MVT::f80 && !VT.isVector())
57826 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
57827 break;
57828 case 'R': // LEGACY_REGS
57829 if (VT == MVT::i8 || VT == MVT::i1)
57830 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
57831 if (VT == MVT::i16)
57832 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
57833 if (VT == MVT::i32 || VT == MVT::f32 ||
57834 (!VT.isVector() && !Subtarget.is64Bit()))
57835 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
57836 if (VT != MVT::f80 && !VT.isVector())
57837 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
57838 break;
57839 case 'f': // FP Stack registers.
57840 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
57841 // value to the correct fpstack register class.
57842 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
57843 return std::make_pair(0U, &X86::RFP32RegClass);
57844 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
57845 return std::make_pair(0U, &X86::RFP64RegClass);
57846 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
57847 return std::make_pair(0U, &X86::RFP80RegClass);
57848 break;
57849 case 'y': // MMX_REGS if MMX allowed.
57850 if (!Subtarget.hasMMX()) break;
57851 return std::make_pair(0U, &X86::VR64RegClass);
57852 case 'v':
57853 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
57854 if (!Subtarget.hasSSE1()) break;
57855 bool VConstraint = (Constraint[0] == 'v');
57856
57857 switch (VT.SimpleTy) {
57858 default: break;
57859 // Scalar SSE types.
57860 case MVT::f16:
57861 if (VConstraint && Subtarget.hasFP16())
57862 return std::make_pair(0U, &X86::FR16XRegClass);
57863 break;
57864 case MVT::f32:
57865 case MVT::i32:
57866 if (VConstraint && Subtarget.hasVLX())
57867 return std::make_pair(0U, &X86::FR32XRegClass);
57868 return std::make_pair(0U, &X86::FR32RegClass);
57869 case MVT::f64:
57870 case MVT::i64:
57871 if (VConstraint && Subtarget.hasVLX())
57872 return std::make_pair(0U, &X86::FR64XRegClass);
57873 return std::make_pair(0U, &X86::FR64RegClass);
57874 case MVT::i128:
57875 if (Subtarget.is64Bit()) {
57876 if (VConstraint && Subtarget.hasVLX())
57877 return std::make_pair(0U, &X86::VR128XRegClass);
57878 return std::make_pair(0U, &X86::VR128RegClass);
57879 }
57880 break;
57881 // Vector types and fp128.
57882 case MVT::v8f16:
57883 if (!Subtarget.hasFP16())
57884 break;
57885 if (VConstraint)
57886 return std::make_pair(0U, &X86::VR128XRegClass);
57887 return std::make_pair(0U, &X86::VR128RegClass);
57888 case MVT::v8bf16:
57889 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
57890 break;
57891 if (VConstraint)
57892 return std::make_pair(0U, &X86::VR128XRegClass);
57893 return std::make_pair(0U, &X86::VR128RegClass);
57894 case MVT::f128:
57895 case MVT::v16i8:
57896 case MVT::v8i16:
57897 case MVT::v4i32:
57898 case MVT::v2i64:
57899 case MVT::v4f32:
57900 case MVT::v2f64:
57901 if (VConstraint && Subtarget.hasVLX())
57902 return std::make_pair(0U, &X86::VR128XRegClass);
57903 return std::make_pair(0U, &X86::VR128RegClass);
57904 // AVX types.
57905 case MVT::v16f16:
57906 if (!Subtarget.hasFP16())
57907 break;
57908 if (VConstraint)
57909 return std::make_pair(0U, &X86::VR256XRegClass);
57910 return std::make_pair(0U, &X86::VR256RegClass);
57911 case MVT::v16bf16:
57912 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
57913 break;
57914 if (VConstraint)
57915 return std::make_pair(0U, &X86::VR256XRegClass);
57916 return std::make_pair(0U, &X86::VR256RegClass);
57917 case MVT::v32i8:
57918 case MVT::v16i16:
57919 case MVT::v8i32:
57920 case MVT::v4i64:
57921 case MVT::v8f32:
57922 case MVT::v4f64:
57923 if (VConstraint && Subtarget.hasVLX())
57924 return std::make_pair(0U, &X86::VR256XRegClass);
57925 if (Subtarget.hasAVX())
57926 return std::make_pair(0U, &X86::VR256RegClass);
57927 break;
57928 case MVT::v32f16:
57929 if (!Subtarget.hasFP16())
57930 break;
57931 if (VConstraint)
57932 return std::make_pair(0U, &X86::VR512RegClass);
57933 return std::make_pair(0U, &X86::VR512_0_15RegClass);
57934 case MVT::v32bf16:
57935 if (!Subtarget.hasBF16())
57936 break;
57937 if (VConstraint)
57938 return std::make_pair(0U, &X86::VR512RegClass);
57939 return std::make_pair(0U, &X86::VR512_0_15RegClass);
57940 case MVT::v64i8:
57941 case MVT::v32i16:
57942 case MVT::v8f64:
57943 case MVT::v16f32:
57944 case MVT::v16i32:
57945 case MVT::v8i64:
57946 if (!Subtarget.hasAVX512()) break;
57947 if (VConstraint)
57948 return std::make_pair(0U, &X86::VR512RegClass);
57949 return std::make_pair(0U, &X86::VR512_0_15RegClass);
57950 }
57951 break;
57952 }
57953 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
57954 switch (Constraint[1]) {
57955 default:
57956 break;
57957 case 'i':
57958 case 't':
57959 case '2':
57960 return getRegForInlineAsmConstraint(TRI, "x", VT);
57961 case 'm':
57962 if (!Subtarget.hasMMX()) break;
57963 return std::make_pair(0U, &X86::VR64RegClass);
57964 case 'z':
57965 if (!Subtarget.hasSSE1()) break;
57966 switch (VT.SimpleTy) {
57967 default: break;
57968 // Scalar SSE types.
57969 case MVT::f16:
57970 if (!Subtarget.hasFP16())
57971 break;
57972 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
57973 case MVT::f32:
57974 case MVT::i32:
57975 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
57976 case MVT::f64:
57977 case MVT::i64:
57978 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
57979 case MVT::v8f16:
57980 if (!Subtarget.hasFP16())
57981 break;
57982 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
57983 case MVT::v8bf16:
57984 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
57985 break;
57986 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
57987 case MVT::f128:
57988 case MVT::v16i8:
57989 case MVT::v8i16:
57990 case MVT::v4i32:
57991 case MVT::v2i64:
57992 case MVT::v4f32:
57993 case MVT::v2f64:
57994 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
57995 // AVX types.
57996 case MVT::v16f16:
57997 if (!Subtarget.hasFP16())
57998 break;
57999 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58000 case MVT::v16bf16:
58001 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58002 break;
58003 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58004 case MVT::v32i8:
58005 case MVT::v16i16:
58006 case MVT::v8i32:
58007 case MVT::v4i64:
58008 case MVT::v8f32:
58009 case MVT::v4f64:
58010 if (Subtarget.hasAVX())
58011 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58012 break;
58013 case MVT::v32f16:
58014 if (!Subtarget.hasFP16())
58015 break;
58016 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58017 case MVT::v32bf16:
58018 if (!Subtarget.hasBF16())
58019 break;
58020 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58021 case MVT::v64i8:
58022 case MVT::v32i16:
58023 case MVT::v8f64:
58024 case MVT::v16f32:
58025 case MVT::v16i32:
58026 case MVT::v8i64:
58027 if (Subtarget.hasAVX512())
58028 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58029 break;
58030 }
58031 break;
58032 case 'k':
58033 // This register class doesn't allocate k0 for masked vector operation.
58034 if (Subtarget.hasAVX512()) {
58035 if (VT == MVT::v1i1 || VT == MVT::i1)
58036 return std::make_pair(0U, &X86::VK1WMRegClass);
58037 if (VT == MVT::v8i1 || VT == MVT::i8)
58038 return std::make_pair(0U, &X86::VK8WMRegClass);
58039 if (VT == MVT::v16i1 || VT == MVT::i16)
58040 return std::make_pair(0U, &X86::VK16WMRegClass);
58041 }
58042 if (Subtarget.hasBWI()) {
58043 if (VT == MVT::v32i1 || VT == MVT::i32)
58044 return std::make_pair(0U, &X86::VK32WMRegClass);
58045 if (VT == MVT::v64i1 || VT == MVT::i64)
58046 return std::make_pair(0U, &X86::VK64WMRegClass);
58047 }
58048 break;
58049 }
58050 }
58051
58052 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58053 return std::make_pair(0U, &X86::GR32RegClass);
58054
58055 // Use the default implementation in TargetLowering to convert the register
58056 // constraint into a member of a register class.
58057 std::pair<Register, const TargetRegisterClass*> Res;
58058 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
58059
58060 // Not found as a standard register?
58061 if (!Res.second) {
58062 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
58063 // to/from f80.
58064 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
58065 // Map st(0) -> st(7) -> ST0
58066 if (Constraint.size() == 7 && Constraint[0] == '{' &&
58067 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
58068 Constraint[3] == '(' &&
58069 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
58070 Constraint[5] == ')' && Constraint[6] == '}') {
58071 // st(7) is not allocatable and thus not a member of RFP80. Return
58072 // singleton class in cases where we have a reference to it.
58073 if (Constraint[4] == '7')
58074 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
58075 return std::make_pair(X86::FP0 + Constraint[4] - '0',
58076 &X86::RFP80RegClass);
58077 }
58078
58079 // GCC allows "st(0)" to be called just plain "st".
58080 if (StringRef("{st}").equals_insensitive(Constraint))
58081 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
58082 }
58083
58084 // flags -> EFLAGS
58085 if (StringRef("{flags}").equals_insensitive(Constraint))
58086 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
58087
58088 // dirflag -> DF
58089 // Only allow for clobber.
58090 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
58091 VT == MVT::Other)
58092 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
58093
58094 // fpsr -> FPSW
58095 // Only allow for clobber.
58096 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
58097 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
58098
58099 return Res;
58100 }
58101
58102 // Make sure it isn't a register that requires 64-bit mode.
58103 if (!Subtarget.is64Bit() &&
58104 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
58105 TRI->getEncodingValue(Res.first) >= 8) {
58106 // Register requires REX prefix, but we're in 32-bit mode.
58107 return std::make_pair(0, nullptr);
58108 }
58109
58110 // Make sure it isn't a register that requires AVX512.
58111 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
58112 TRI->getEncodingValue(Res.first) & 0x10) {
58113 // Register requires EVEX prefix.
58114 return std::make_pair(0, nullptr);
58115 }
58116
58117 // Otherwise, check to see if this is a register class of the wrong value
58118 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
58119 // turn into {ax},{dx}.
58120 // MVT::Other is used to specify clobber names.
58121 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
58122 return Res; // Correct type already, nothing to do.
58123
58124 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
58125 // return "eax". This should even work for things like getting 64bit integer
58126 // registers when given an f64 type.
58127 const TargetRegisterClass *Class = Res.second;
58128 // The generic code will match the first register class that contains the
58129 // given register. Thus, based on the ordering of the tablegened file,
58130 // the "plain" GR classes might not come first.
58131 // Therefore, use a helper method.
58132 if (isGRClass(*Class)) {
58133 unsigned Size = VT.getSizeInBits();
58134 if (Size == 1) Size = 8;
58135 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
58136 return std::make_pair(0, nullptr);
58137 Register DestReg = getX86SubSuperRegister(Res.first, Size);
58138 if (DestReg.isValid()) {
58139 bool is64Bit = Subtarget.is64Bit();
58140 const TargetRegisterClass *RC =
58141 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
58142 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
58143 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
58144 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
58145 if (Size == 64 && !is64Bit) {
58146 // Model GCC's behavior here and select a fixed pair of 32-bit
58147 // registers.
58148 switch (DestReg) {
58149 case X86::RAX:
58150 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58151 case X86::RDX:
58152 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
58153 case X86::RCX:
58154 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
58155 case X86::RBX:
58156 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
58157 case X86::RSI:
58158 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
58159 case X86::RDI:
58160 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
58161 case X86::RBP:
58162 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
58163 default:
58164 return std::make_pair(0, nullptr);
58165 }
58166 }
58167 if (RC && RC->contains(DestReg))
58168 return std::make_pair(DestReg, RC);
58169 return Res;
58170 }
58171 // No register found/type mismatch.
58172 return std::make_pair(0, nullptr);
58173 } else if (isFRClass(*Class)) {
58174 // Handle references to XMM physical registers that got mapped into the
58175 // wrong class. This can happen with constraints like {xmm0} where the
58176 // target independent register mapper will just pick the first match it can
58177 // find, ignoring the required type.
58178
58179 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
58180 if (VT == MVT::f16)
58181 Res.second = &X86::FR16XRegClass;
58182 else if (VT == MVT::f32 || VT == MVT::i32)
58183 Res.second = &X86::FR32XRegClass;
58184 else if (VT == MVT::f64 || VT == MVT::i64)
58185 Res.second = &X86::FR64XRegClass;
58186 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
58187 Res.second = &X86::VR128XRegClass;
58188 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
58189 Res.second = &X86::VR256XRegClass;
58190 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
58191 Res.second = &X86::VR512RegClass;
58192 else {
58193 // Type mismatch and not a clobber: Return an error;
58194 Res.first = 0;
58195 Res.second = nullptr;
58196 }
58197 } else if (isVKClass(*Class)) {
58198 if (VT == MVT::v1i1 || VT == MVT::i1)
58199 Res.second = &X86::VK1RegClass;
58200 else if (VT == MVT::v8i1 || VT == MVT::i8)
58201 Res.second = &X86::VK8RegClass;
58202 else if (VT == MVT::v16i1 || VT == MVT::i16)
58203 Res.second = &X86::VK16RegClass;
58204 else if (VT == MVT::v32i1 || VT == MVT::i32)
58205 Res.second = &X86::VK32RegClass;
58206 else if (VT == MVT::v64i1 || VT == MVT::i64)
58207 Res.second = &X86::VK64RegClass;
58208 else {
58209 // Type mismatch and not a clobber: Return an error;
58210 Res.first = 0;
58211 Res.second = nullptr;
58212 }
58213 }
58214
58215 return Res;
58216}
58217
58218 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
58219 // Integer division on x86 is expensive. However, when aggressively optimizing
58220 // for code size, we prefer to use a div instruction, as it is usually smaller
58221 // than the alternative sequence.
58222 // The exception to this is vector division. Since x86 doesn't have vector
58223 // integer division, leaving the division as-is is a loss even in terms of
58224 // size, because it will have to be scalarized, while the alternative code
58225 // sequence can be performed in vector form.
58226 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
58227 return OptSize && !VT.isVector();
58228}
58229
58230void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
58231 if (!Subtarget.is64Bit())
58232 return;
58233
58234 // Update IsSplitCSR in X86MachineFunctionInfo.
58235 X86MachineFunctionInfo *AFI =
58236 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
58237 AFI->setIsSplitCSR(true);
58238}
58239
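// For split-CSR (CXX_FAST_TLS) functions: copy each callee-saved GR64 register
// into a virtual register in the entry block and copy it back just before the
// terminator of every exit block.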
58240void X86TargetLowering::insertCopiesSplitCSR(
58241 MachineBasicBlock *Entry,
58242 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
58243 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
58244 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
58245 if (!IStart)
58246 return;
58247
58248 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
58249 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
58250 MachineBasicBlock::iterator MBBI = Entry->begin();
58251 for (const MCPhysReg *I = IStart; *I; ++I) {
58252 const TargetRegisterClass *RC = nullptr;
58253 if (X86::GR64RegClass.contains(*I))
58254 RC = &X86::GR64RegClass;
58255 else
58256 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
58257
58258 Register NewVR = MRI->createVirtualRegister(RC);
58259 // Create copy from CSR to a virtual register.
58260 // FIXME: this currently does not emit CFI pseudo-instructions, it works
58261 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
58262 // nounwind. If we want to generalize this later, we may need to emit
58263 // CFI pseudo-instructions.
58264 assert(
58265 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
58266 "Function should be nounwind in insertCopiesSplitCSR!");
58267 Entry->addLiveIn(*I);
58268 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
58269 .addReg(*I);
58270
58271 // Insert the copy-back instructions right before the terminator.
58272 for (auto *Exit : Exits)
58273 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
58274 TII->get(TargetOpcode::COPY), *I)
58275 .addReg(NewVR);
58276 }
58277}
58278
58279 bool X86TargetLowering::supportSwiftError() const {
58280 return Subtarget.is64Bit();
58281}
58282
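// Insert a KCFI_CHECK in front of an indirect call so the expected KCFI type
// hash can be verified against the call target; memory-operand calls are first
// unfolded into R11 so the check and the call use the same address.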
58283 MachineInstr *
58284 X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
58285 MachineBasicBlock::iterator &MBBI,
58286 const TargetInstrInfo *TII) const {
58287 assert(MBBI->isCall() && MBBI->getCFIType() &&
58288 "Invalid call instruction for a KCFI check");
58289
58290 MachineFunction &MF = *MBB.getParent();
58291 // If the call target is a memory operand, unfold it and use R11 for the
58292 // call, so KCFI_CHECK won't have to recompute the address.
58293 switch (MBBI->getOpcode()) {
58294 case X86::CALL64m:
58295 case X86::CALL64m_NT:
58296 case X86::TAILJMPm64:
58297 case X86::TAILJMPm64_REX: {
58298 MachineBasicBlock::instr_iterator OrigCall = MBBI.getInstrIterator();
58299 SmallVector<MachineInstr *, 2> NewMIs;
58300 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
58301 /*UnfoldStore=*/false, NewMIs))
58302 report_fatal_error("Failed to unfold memory operand for a KCFI check");
58303 for (auto *NewMI : NewMIs)
58304 MBBI = MBB.insert(OrigCall, NewMI);
58305 assert(MBBI->isCall() &&
58306 "Unexpected instruction after memory operand unfolding");
58307 if (OrigCall->shouldUpdateCallSiteInfo())
58308 MF.moveCallSiteInfo(&*OrigCall, &*MBBI);
58309 MBBI->setCFIType(MF, OrigCall->getCFIType());
58310 OrigCall->eraseFromParent();
58311 break;
58312 }
58313 default:
58314 break;
58315 }
58316
58317 MachineOperand &Target = MBBI->getOperand(0);
58318 Register TargetReg;
58319 switch (MBBI->getOpcode()) {
58320 case X86::CALL64r:
58321 case X86::CALL64r_NT:
58322 case X86::TAILJMPr64:
58323 case X86::TAILJMPr64_REX:
58324 assert(Target.isReg() && "Unexpected target operand for an indirect call");
58325 Target.setIsRenamable(false);
58326 TargetReg = Target.getReg();
58327 break;
58328 case X86::CALL64pcrel32:
58329 case X86::TAILJMPd64:
58330 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
58331 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
58332 // 64-bit indirect thunk calls.
58333 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
58334 "Unexpected register for an indirect thunk call");
58335 TargetReg = X86::R11;
58336 break;
58337 default:
58338 llvm_unreachable("Unexpected CFI call opcode");
58339 break;
58340 }
58341
58342 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
58343 .addReg(TargetReg)
58344 .addImm(MBBI->getCFIType())
58345 .getInstr();
58346}
58347
58348/// Returns true if stack probing through a function call is requested.
58349 bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
58350 return !getStackProbeSymbolName(MF).empty();
58351}
58352
58353/// Returns true if stack probing through inline assembly is requested.
58354 bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
58355
58356 // No inline stack probe for Windows, they have their own mechanism.
58357 if (Subtarget.isOSWindows() ||
58358 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58359 return false;
58360
58361 // If the function specifically requests inline stack probes, emit them.
58362 if (MF.getFunction().hasFnAttribute("probe-stack"))
58363 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
58364 "inline-asm";
58365
58366 return false;
58367}
58368
58369/// Returns the name of the symbol used to emit stack probes or the empty
58370/// string if not applicable.
58371 StringRef
58372 X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
58373 // Inline Stack probes disable stack probe call
58374 if (hasInlineStackProbe(MF))
58375 return "";
58376
58377 // If the function specifically requests stack probes, emit them.
58378 if (MF.getFunction().hasFnAttribute("probe-stack"))
58379 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
58380
58381 // Generally, if we aren't on Windows, the platform ABI does not include
58382 // support for stack probes, so don't emit them.
58383 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
58384 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58385 return "";
58386
58387 // We need a stack probe to conform to the Windows ABI. Choose the right
58388 // symbol.
58389 if (Subtarget.is64Bit())
58390 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
58391 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
58392}
58393
58394unsigned
58395 X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
58396 // The default stack probe size is 4096 if the function has no stackprobesize
58397 // attribute.
58398 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
58399 4096);
58400}
58401
58402 Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
58403 if (ML && ML->isInnermost() &&
58404 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
58407}
#define P(N)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(VerifyEach)
const char LLVMTargetMachineRef TM
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSimple(Instruction *I)
unsigned OpIndex
static StringRef substr(StringRef Str, uint64_t Len)
This file implements the SmallBitVector class.
This file defines the SmallSet class.
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
This file describes how to lower LLVM code to machine code.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, const SDValue &Zext1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) ...
static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG)
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed ...
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
@ ShiftBit
@ UndefBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineBitOpWithPACK(SDNode *N, SelectionDAG &DAG)
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting an PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static bool isX86CCSigned(unsigned X86CC)
Return true if the condition is an signed comparison operation.
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a 128-bit shuffles.
static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue V)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISD::P...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Dispatching routine to lower various 128-bit x86 vector shuffles.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG)
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips sign of FP value.
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LocalDynamic=false)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as a zero or any extension.
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to map a 128-bit or larger integer comparison to vector instructions before type legalization spl...
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
This function detects the AVG pattern between vectors of unsigned i8/i16, which is c = (a + b + 1) / ...
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to get compute inlane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If a vector select has an operand that is -1 or 0, try to simplify the select to a bitwise logic oper...
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG)
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static bool matchAsm(StringRef S, ArrayRef< const char * > Pieces)
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, SDValue And1_L, SDValue And1_R, const SDLoc &DL, SelectionDAG &DAG)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, SDLoc DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, unsigned Reg)
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit to mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is a X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, unsigned Depth)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, bool AllowTruncate)
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static bool isFreeToSplitVector(SDNode *N, SelectionDAG &DAG)
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isLegalConversion(MVT VT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 256-bit x86 vector shuffles.
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static bool clobbersFlagRegisters(const SmallVector< StringRef, 4 > &AsmPieces)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
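As a rough illustration (a stand-alone sketch, not the code in this file): a mask is a no-op when every entry is either undef, conventionally encoded as -1, or selects the element already sitting in that lane. The helper name below is hypothetical.

#include <cassert>
#include <vector>

// Sketch: a shuffle mask is a "no-op" when each entry is undef (-1) or equal
// to its own lane index, i.e. the shuffle moves nothing.
static bool isNoopMaskSketch(const std::vector<int> &Mask) {
  for (int i = 0, e = (int)Mask.size(); i != e; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  return true;
}

int main() {
  assert(isNoopMaskSketch({0, 1, 2, 3}));   // identity
  assert(isNoopMaskSketch({0, -1, 2, -1})); // undef lanes are ignored
  assert(!isNoopMaskSketch({1, 0, 2, 3}));  // swaps lanes 0 and 1
}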
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector with a zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineX86ShufflesConstants(ArrayRef< SDValue > Ops, ArrayRef< int > Mask, SDValue Root, bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
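The rewrite is only valid because of the no-wrap flag: if the narrow add cannot overflow, extending after the add equals adding the extended operands. A minimal scalar check of the sext/nsw case (illustrative values only):

#include <cassert>
#include <cstdint>

int main() {
  // sext(add_nsw(x, C)) == add(sext(x), sext(C)) as long as the i8 add does
  // not overflow in the signed sense -- exactly what the nsw flag guarantees.
  int8_t x = 100, C = 20;                 // 100 + 20 = 120 still fits in i8
  int32_t extendAfterAdd = (int32_t)(int8_t)(x + C);
  int32_t addExtended = (int32_t)x + (int32_t)C;
  assert(extendAfterAdd == addExtended);  // 120 == 120
}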
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
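For reference, this tie-breaking rule matches std::round in C++, which also rounds halfway cases away from zero, so a quick scalar sanity check looks like:

#include <cassert>
#include <cmath>

int main() {
  assert(std::round(2.5) == 3.0);   // tie rounds away from zero
  assert(std::round(-2.5) == -3.0); // likewise for negative values
  assert(std::round(2.4) == 2.0);   // non-ties round to nearest
}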
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
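In scalar terms the pattern is a clamp against the destination type's unsigned maximum followed by a truncate. A minimal sketch for a 32-bit to 8-bit case (the helper name is made up for illustration):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Scalar form of truncation with unsigned saturation: clamp to the
// destination's unsigned max, then truncate.
static uint8_t truncUSat8(uint32_t x) {
  return (uint8_t)std::min<uint32_t>(x, 255u);
}

int main() {
  assert(truncUSat8(42) == 42);   // in-range values pass through
  assert(truncUSat8(300) == 255); // out-of-range values saturate
}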
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of an ISD::CondCode to the X86-specific condition code,...
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget)
Try to lower a vector shuffle as a bit shift (shifts in zeros).
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG)
static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a 256-bit integer VSETCC into two new 128-bit ones and then concatenate the result back.
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, SDValue Root, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a unary integer operation into 2 half-sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
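The scalar identity behind this fold: a setcc produces 0 or 1, so XOR-ing it with 1 yields the inverted predicate. A small exhaustive check over a few values:

#include <cassert>

int main() {
  // (a < b) is 0 or 1, so ((a < b) ^ 1) == (a >= b), the inverted condition.
  for (int a = -2; a <= 2; ++a)
    for (int b = -2; b <= 2; ++b)
      assert(((a < b) ^ 1) == (a >= b));
}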
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to lower a vector shuffle as a byte rotation.
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
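To illustrate what "alternately from two vectors" means for a shuffle mask (indices 0..N-1 address the first input and N..2N-1 the second), here is a hand-rolled shuffle over two 4-element inputs; the concrete values are arbitrary:

#include <cassert>
#include <vector>

int main() {
  std::vector<int> V1 = {10, 11, 12, 13}, V2 = {20, 21, 22, 23};
  std::vector<int> Mask = {0, 5, 2, 7}; // V1[0], V2[1], V1[2], V2[3]
  std::vector<int> Out;
  for (int Idx : Mask)
    Out.push_back(Idx < 4 ? V1[Idx] : V2[Idx - 4]); // alternate V1/V2 lanes
  assert((Out == std::vector<int>{10, 21, 12, 23}));
}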
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
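A stand-alone sketch of the property being tested (the helper name is invented for illustration): every lane of the single input must appear somewhere in the mask.

#include <cassert>
#include <vector>

static bool referencesEveryElement(const std::vector<int> &Mask) {
  std::vector<bool> Seen(Mask.size(), false);
  for (int Idx : Mask)
    if (Idx >= 0 && Idx < (int)Mask.size())
      Seen[Idx] = true;               // lane Idx of the input is used
  for (bool S : Seen)
    if (!S)
      return false;                   // some input lane is never referenced
  return true;
}

int main() {
  assert(referencesEveryElement({3, 2, 1, 0}));  // a full permutation
  assert(!referencesEveryElement({0, 0, 2, 2})); // lanes 1 and 3 unused
}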
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, const APInt &Zeroable, ArrayRef< int > Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half-sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
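The scalar intuition, sketched below under the assumption that the pack behaves like PACKUSWB (signed 16-bit input, unsigned saturation to [0, 255]): AND-masking each element to its low 8 bits first means the saturation never fires, so the pack degenerates to a plain truncate.

#include <cassert>
#include <cstdint>

// Scalar model of one lane of an unsigned-saturating 16->8 pack.
static uint8_t packus16to8(int16_t v) {
  if (v < 0) return 0;
  if (v > 255) return 255;
  return (uint8_t)v;
}

int main() {
  for (int x = 0; x < 65536; ++x) {
    uint16_t v = (uint16_t)x;
    // After the AND mask the value is in [0, 255], so packing == truncating.
    assert(packus16to8((int16_t)(v & 0xFF)) == (uint8_t)v);
  }
}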
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true)
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, MachineBasicBlock *BB)
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG)
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG)
static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG)
Fold "masked merge" expressions like (m & x) | (~m & y) into the equivalent ((x ^ y) & m) ^ y) patter...
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1)
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
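The fold works because XOR with all-ones is bitwise NOT, so (X ^ -1) & Y is exactly the and-not operation. A one-line scalar check:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0x12345678u, Y = 0xF0F0F0F0u;
  assert(((X ^ 0xFFFFFFFFu) & Y) == (~X & Y)); // (X ^ -1) & Y == ~X & Y
}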
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating-point cmov for the specific X86 condition code? The current x86 ISA includes the foll...
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
auto IsFreeTruncation
static const unsigned FramePtr
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5196
static APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcast from an all-ones integer value.
Definition: APFloat.cpp:5221
void clearSign()
Definition: APFloat.h:1159
opStatus next(bool nextDown)
Definition: APFloat.h:1115
void changeSign()
Definition: APFloat.h:1158
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:957
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1385
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:207
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:401
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1620
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1364
uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition: APInt.cpp:489
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1463
APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:906
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition: APInt.h:184
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
APInt abs() const
Get the absolute value.
Definition: APInt.h:1737
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:349
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:358
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:444
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
void setSignBit()
Set the sign bit to 1.
Definition: APInt.h:1318
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1089
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:187
bool isMinValue() const
Determine if this is the smallest unsigned value.
Definition: APInt.h:395
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition: APInt.h:194
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:307
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition: APInt.h:1227
bool eq(const APInt &RHS) const
Equality comparison.
Definition: APInt.h:1057
int32_t exactLogBase2() const
Definition: APInt.h:1725
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1375
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:812
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1589
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:413
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition: APInt.h:1578
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition: APInt.h:1548
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:620
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:197
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1482
void flipAllBits()
Toggle every bit to its opposite value.
Definition: APInt.h:1405
unsigned countl_one() const
Count the number of leading one bits.
Definition: APInt.h:1565
void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition: APInt.cpp:368
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition: APInt.h:1395
unsigned logBase2() const
Definition: APInt.h:1703
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1297
bool getBoolValue() const
Convert APInt to a boolean value.
Definition: APInt.h:449
bool isMask(unsigned numBits) const
Definition: APInt.h:466
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition: APInt.h:383
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:312
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1128
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1345
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition: APInt.h:851
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1235
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:319
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1367
APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:453
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:410
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:367
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:264
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:217
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:836
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:829
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1606
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1199
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition: APInt.h:377
APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition: APInt.cpp:942
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:647
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:696
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:867
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:760
@ Add
*p = old + v
Definition: Instructions.h:764
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ Sub
*p = old - v
Definition: Instructions.h:766
@ And
*p = old & v
Definition: Instructions.h:768
@ Xor
*p = old ^ v
Definition: Instructions.h:774
@ FSub
*p = old - v
Definition: Instructions.h:788
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:800
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:804
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
Value * getPointerOperand()
Definition: Instructions.h:910
BinOp getOperation() const
Definition: Instructions.h:845
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:901
Value * getValOperand()
Definition: Instructions.h:914
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:887
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:349
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:188
The address of a basic block.
Definition: Constants.h:889
A "pseudo-class" with methods for operating on BUILD_VECTORs.
bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
Value * getCalledOperand() const
Definition: InstrTypes.h:1735
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ ICMP_EQ
equal
Definition: InstrTypes.h:1014
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:1105
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1291
static Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
Definition: Constants.cpp:2897
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1588
bool isMachineConstantPoolEntry() const
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
Definition: Constants.cpp:400
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
Definition: Constants.cpp:432
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Tagged union holding either a T or a Error.
Definition: Error.h:474
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
Definition: DerivedTypes.h:126
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:681
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:701
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:713
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:678
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:263
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition: Function.h:851
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1903
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:339
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:675
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
Definition: GlobalValue.h:567
bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition: Globals.cpp:380
ThreadLocalMode getThreadLocalMode() const
Definition: GlobalValue.h:271
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
std::vector< ConstraintInfo > ConstraintInfoVector
Definition: InlineAsm.h:121
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:83
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:149
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:87
Class to represent integer types.
Definition: DerivedTypes.h:40
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:184
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:266
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
bool usesWindowsCFI() const
Definition: MCAsmInfo.h:799
MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
Definition: MCContext.cpp:220
MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
Definition: MCContext.cpp:225
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:40
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
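Taken together, the MVT helpers above are what the lowering code uses to reason about value types. A minimal sketch of how they compose, assuming the usual includes of this file are in scope; the 256-bit width and the i32 target element type are illustrative choices, not code from this file:

    // Illustrative helpers built only from the MVT queries listed above.
    static bool isWide256BitIntVector(MVT VT) {
      return VT.isVector() && VT.isInteger() && VT.is256BitVector();
    }
    static MVT widenElementsToI32(MVT VT) {
      assert(VT.isVector() && "expected a fixed vector type");
      // Same number of lanes, element type switched to i32.
      return MVT::getVectorVT(MVT::getIntegerVT(32), VT.getVectorNumElements());
    }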
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
bool isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
unsigned succ_size() const
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
succ_reverse_iterator succ_rbegin()
void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
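The MachineBasicBlock hooks above are the building blocks the custom-inserter code in this file leans on when it has to split a block. A hedged sketch of that pattern; MI, MBB and the required includes are assumed to be in scope, and the transform is not taken from this file:

    // Split MBB after MI: everything below MI moves to a new sink block that
    // inherits MBB's successors (PHIs in those successors are fixed up).
    static MachineBasicBlock *splitBlockAfter(MachineInstr *MI,
                                              MachineBasicBlock *MBB) {
      MachineFunction *MF = MBB->getParent();
      MachineBasicBlock *SinkMBB =
          MF->CreateMachineBasicBlock(MBB->getBasicBlock());
      MF->insert(++MBB->getIterator(), SinkMBB);
      SinkMBB->splice(SinkMBB->begin(), MBB,
                      std::next(MachineBasicBlock::iterator(MI)), MBB->end());
      SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
      MBB->addSuccessor(SinkMBB);
      return SinkMBB;
    }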
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
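A small sketch of how the MachineFrameInfo calls above are typically combined; the sizes, offsets and alignments are placeholders rather than values used by this file:

    // Reserve a spill slot and record a fixed incoming-argument slot.
    static void reserveFrameObjects(MachineFunction &MF) {
      MachineFrameInfo &MFI = MF.getFrameInfo();
      int SpillFI = MFI.CreateStackObject(/*Size=*/16, Align(16),
                                          /*isSpillSlot=*/true);
      int ArgFI = MFI.CreateFixedObject(/*Size=*/8, /*SPOffset=*/0,
                                        /*IsImmutable=*/true);
      MFI.setObjectAlignment(SpillFI, Align(32));
      (void)ArgFI; // illustrative only
    }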
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
MachineModuleInfo & getMMI() const
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
void moveCallSiteInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:568
unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of a block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
const Value * getValue() const
Return the base address of the memory access.
const MCContext & getContext() const
const Module * getModule() const
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
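createVirtualRegister feeds directly into the MachineInstrBuilder chaining listed above; BuildMI (declared in MachineInstrBuilder.h) is the usual entry point. A sketch, with TII, DL and the insertion point passed in explicitly and the opcode/immediate chosen purely for illustration:

    // Materialize an illustrative constant into a fresh GR32 virtual register.
    static Register materializeConst(MachineFunction &MF, MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const X86InstrInfo *TII,
                                     const DebugLoc &DL) {
      MachineRegisterInfo &MRI = MF.getRegInfo();
      Register Dst = MRI.createVirtualRegister(&X86::GR32RegClass);
      BuildMI(MBB, MBBI, DL, TII->get(X86::MOV32ri), Dst).addImm(42);
      return Dst;
    }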
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
bool readMem() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
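When a memory node is rewritten, code in this file usually consults these MemSDNode queries first. An illustrative predicate built only from them; the alignment threshold is a placeholder policy, not this file's:

    static bool isSimpleCheapLoad(const MemSDNode *N) {
      return N->readMem() && N->isSimple() && !N->isNonTemporal() &&
             N->getOriginalAlign().value() >= 16;
    }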
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition: Module.cpp:331
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if the type of the node is undefined.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
void setFlags(SDNodeFlags NewFlags)
op_iterator op_end() const
op_iterator op_begin() const
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
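DAG combines start by pattern-matching on SDValue with exactly these accessors. A minimal, hypothetical matcher; the (add X, (shl Y, C)) shape is illustrative, not a combine from this file:

    static bool matchAddOfShlByConst(SDValue V, SDValue &X, SDValue &Y,
                                     uint64_t &ShAmt) {
      if (V.getOpcode() != ISD::ADD)
        return false;
      SDValue Shl = V.getOperand(1);
      if (Shl.getOpcode() != ISD::SHL || !Shl.hasOneUse() ||
          !isa<ConstantSDNode>(Shl.getOperand(1)))
        return false;
      X = V.getOperand(0);
      Y = Shl.getOperand(0);
      ShAmt = Shl.getConstantOperandVal(1);
      return true;
    }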
Help to insert SDNodeFlags automatically in transforming.
Definition: SelectionDAG.h:361
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
Definition: SelectionDAG.h:924
SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:954
SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:474
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
Definition: SelectionDAG.h:448
SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:732
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:659
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
const APInt * getValidShiftAmountConstant(SDValue V, const APInt &DemandedElts) const
If a SHL/SRA/SRL node V has a constant or splat constant shift amount that is less than the element b...
bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:773
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:845
SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops)
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:739
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
Definition: SelectionDAG.h:908
SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
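Most of the lowering in this file reduces to the SelectionDAG builders listed above: pull the type and operands out of the incoming node, then rebuild the computation with getNode and friends. A sketch of that shape; the rewrite itself (negation as not-plus-one) is purely illustrative:

    static SDValue lowerNegAsNotPlusOne(SDValue Op, SelectionDAG &DAG) {
      assert(Op.getOpcode() == ISD::SUB && "expected (sub 0, X)");
      SDLoc DL(Op);
      EVT VT = Op.getValueType();
      SDValue NotX = DAG.getNOT(DL, Op.getOperand(1), VT);
      return DAG.getNode(ISD::ADD, DL, VT, NotX, DAG.getConstant(1, DL, VT));
    }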
static bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
iterator erase(const_iterator CI)
Definition: SmallVector.h:750
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:591
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:557
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:257
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
size_t size_type
Definition: StringRef.h:56
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:266
static constexpr size_t npos
Definition: StringRef.h:52
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition: StringRef.h:170
size_t find_first_not_of(char C, size_t From=0) const
Find the first character in the string that is not C, or npos if not found.
Definition: StringRef.cpp:251
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
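StringSwitch is how constraint and feature strings get classified in code like this. A self-contained usage sketch; the enum and the strings are illustrative, not this file's real constraint names:

    enum class Width { Byte, Word, Dword, Unknown };
    static Width classifyWidth(StringRef S) {
      return StringSwitch<Width>(S)
          .Case("byte", Width::Byte)
          .Case("word", Width::Word)
          .Case("dword", Width::Dword)
          .Default(Width::Unknown);
    }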
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
Information about stack frame layout on the target.
virtual bool hasFP(const MachineFunction &MF) const =0
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC)
Set the CallingConv that should be used for the specified libcall.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set in interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
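The TargetLoweringBase setters above are the configuration surface a TargetLowering constructor works through: register classes are registered first, then per-(opcode, type) actions, then register properties are recomputed. A reduced sketch of that pattern, written as if inside the X86TargetLowering constructor so the unqualified Expand/Custom/Promote enumerators are in scope; the particular opcode and type choices are placeholders, not X86's actual table:

    addRegisterClass(MVT::i32, &X86::GR32RegClass);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);  // expand to generic code
    setOperationAction(ISD::SELECT, MVT::i32, Custom); // route to LowerOperation
    setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i1, Promote);
    setTruncStoreAction(MVT::i64, MVT::i1, Expand);
    computeRegisterProperties(Subtarget.getRegisterInfo());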
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:662
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:553
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
static IntegerType * getInt1Ty(LLVMContext &C)
Type * getArrayElementType() const
Definition: Type.h:404
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
uint64_t getArrayNumElements() const
bool isX86_MMXTy() const
Return true if this is X86 MMX.
Definition: Type.h:201
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
use_iterator use_begin()
Definition: Value.h:360
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:683
Type * getElementType() const
Definition: DerivedTypes.h:436
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
unsigned getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const
bool hasBasePointer(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getDarwinTLSCallPreservedMask() const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
Definition: X86Subtarget.h:249
bool hasAnyFMA() const
Definition: X86Subtarget.h:213
bool isOSWindows() const
Definition: X86Subtarget.h:335
bool isTargetMachO() const
Definition: X86Subtarget.h:301
bool useIndirectThunkBranches() const
Definition: X86Subtarget.h:235
bool hasSSE1() const
Definition: X86Subtarget.h:200
bool hasThreeDNow() const
Definition: X86Subtarget.h:211
bool isPICStyleGOT() const
Definition: X86Subtarget.h:341
bool hasSSE42() const
Definition: X86Subtarget.h:205
const X86TargetLowering * getTargetLowering() const override
Definition: X86Subtarget.h:125
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
Definition: X86Subtarget.h:289
bool canUseCMOV() const
Definition: X86Subtarget.h:199
bool isPICStyleStubPIC() const
Definition: X86Subtarget.h:344
bool isTargetWindowsMSVC() const
Definition: X86Subtarget.h:313
bool canUseCMPXCHG8B() const
Definition: X86Subtarget.h:192
bool isTargetDarwin() const
Definition: X86Subtarget.h:293
bool isTargetWin64() const
Definition: X86Subtarget.h:337
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
Definition: X86Subtarget.h:185
const Triple & getTargetTriple() const
Definition: X86Subtarget.h:291
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:129
bool useAVX512Regs() const
Definition: X86Subtarget.h:266
bool hasSSE3() const
Definition: X86Subtarget.h:202
bool isCallingConvWin64(CallingConv::ID CC) const
Definition: X86Subtarget.h:350
bool hasAVX512() const
Definition: X86Subtarget.h:208
bool canExtendTo512DQ() const
Definition: X86Subtarget.h:246
bool hasSSE41() const
Definition: X86Subtarget.h:204
bool hasMMX() const
Definition: X86Subtarget.h:210
bool isTargetELF() const
Definition: X86Subtarget.h:299
bool hasSSEPrefetch() const
Definition: X86Subtarget.h:221
bool canUseCMPXCHG16B() const
Definition: X86Subtarget.h:193
unsigned char classifyGlobalReference(const GlobalValue *GV, const Module &M) const
bool hasSSE2() const
Definition: X86Subtarget.h:201
bool hasSSSE3() const
Definition: X86Subtarget.h:203
bool hasInt256() const
Definition: X86Subtarget.h:209
bool isPICStyleRIPRel() const
Definition: X86Subtarget.h:342
bool isTargetCygMing() const
Definition: X86Subtarget.h:333
unsigned char classifyLocalReference(const GlobalValue *GV) const
Classify a global variable reference for the current subtarget according to how we should reference i...
unsigned char classifyBlockAddressReference() const
Classify a blockaddress reference for the current subtarget according to how we should reference it i...
bool isTargetPS() const
Definition: X86Subtarget.h:297
const X86RegisterInfo * getRegisterInfo() const override
Definition: X86Subtarget.h:139
bool hasAVX() const
Definition: X86Subtarget.h:206
bool isTargetWindowsGNU() const
Definition: X86Subtarget.h:325
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:239
bool isTargetWindowsItanium() const
Definition: X86Subtarget.h:329
bool isTargetNaCl64() const
Definition: X86Subtarget.h:309
const X86FrameLowering * getFrameLowering() const override
Definition: X86Subtarget.h:131
bool useBWIRegs() const
Definition: X86Subtarget.h:275
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, const Module &M) const
Classify a global function reference for the current subtarget.
bool hasAVX2() const
Definition: X86Subtarget.h:207
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool isVectorShiftByScalarCheap(Type *Ty) const override
This is used to enable splatted operand transforms for vector shifts and vector funnel shifts.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Return true if sinking I's operands to the same basic block as I is profitable, e....
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal icmp immediate, that is the target has icmp instructi...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y --> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set in interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it is an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
self_iterator getIterator()
Definition: ilist_node.h:109
#define INT64_MIN
Definition: DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2978
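An illustrative sketch of ScaleBitMask with values chosen here for demonstration (not taken from the source): widening a 4-bit mask to 8 bits splats each source bit across two result bits.
#include "llvm/ADT/APInt.h"
static llvm::APInt widenMaskExample() {
  llvm::APInt Narrow(4, 0b0101);                   // bits 0 and 2 set
  return llvm::APIntOps::ScaleBitMask(Narrow, 8);  // 0b00110011
}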
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ X86_ThisCall
Similar to X86_StdCall.
Definition: CallingConv.h:122
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition: CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
Definition: CallingConv.h:103
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:236
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1132
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1128
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:723
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:476
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition: ISDOpcodes.h:147
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition: ISDOpcodes.h:497
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:250
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:559
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:714
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1161
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1277
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1247
@ STRICT_FCEIL
Definition: ISDOpcodes.h:426
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1278
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition: ISDOpcodes.h:124
@ RESET_FPENV
Set floating-point environment to default state.
Definition: ISDOpcodes.h:1008
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:487
@ FMAXNUM_IEEE
Definition: ISDOpcodes.h:985
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1037
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:783
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition: ISDOpcodes.h:151
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1260
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:436
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:543
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:688
@ MEMBARRIER
MEMBARRIER - Compiler barrier only; generate a no-op.
Definition: ISDOpcodes.h:1234
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1239
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition: ISDOpcodes.h:820
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:477
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:913
@ STRICT_FLOG2
Definition: ISDOpcodes.h:421
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1273
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:903
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1274
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1205
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:939
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:411
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1406
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ EH_LABEL
EH_LABEL - Represents a label in mid basic block used to track locations needed for debug and excepti...
Definition: ISDOpcodes.h:1108
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition: ISDOpcodes.h:135
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:885
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:450
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:620
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:101
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1053
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:722
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1227
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:994
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:758
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:930
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1083
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:327
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1276
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1062
@ GC_TRANSITION_START
GC_TRANSITION_START/GC_TRANSITION_END - These operators mark the beginning and end of GC transition s...
Definition: ISDOpcodes.h:1319
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:349
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1243
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:211
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:627
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1157
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:208
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:323
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:430
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:880
@ STRICT_FP_TO_FP16
Definition: ISDOpcodes.h:916
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ STRICT_FP16_TO_FP
Definition: ISDOpcodes.h:915
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:600
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1271
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:573
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:984
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:435
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:424
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1217
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:856
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:425
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:742
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1279
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition: ISDOpcodes.h:114
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:971
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1221
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:331
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1047
@ ConstantPool
Definition: ISDOpcodes.h:82
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition: ISDOpcodes.h:809
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:798
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:674
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ STRICT_FROUND
Definition: ISDOpcodes.h:428
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:736
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:449
@ STRICT_BF16_TO_FP
Definition: ISDOpcodes.h:924
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:427
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:429
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:922
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:94
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1269
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:443
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:465
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:442
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:990
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1270
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1188
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:158
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:470
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1214
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ GET_FPENV_MEM
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1013
@ STRICT_FP_TO_BF16
Definition: ISDOpcodes.h:925
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:400
@ STRICT_FLOG10
Definition: ISDOpcodes.h:420
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:524
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_FEXP2
Definition: ISDOpcodes.h:418
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1268
@ ExternalSymbol
Definition: ISDOpcodes.h:83
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:414
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:831
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:907
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:423
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:855
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition: ISDOpcodes.h:141
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1152
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1076
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:763
@ GC_TRANSITION_END
Definition: ISDOpcodes.h:1320
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:493
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:340
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:422
@ SET_FPENV_MEM
Sets the current floating point environment.
Definition: ISDOpcodes.h:1018
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1211
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:313
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:515
bool isExtVecInRegOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1605
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false)
Hook for matching ConstantSDNode predicate.
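A hedged sketch of a typical use of this hook (Op is an assumed SDValue; the power-of-two predicate is illustrative): it returns true if Op is a constant, or a build vector of constants, whose value(s) all satisfy the predicate.
#include "llvm/CodeGen/SelectionDAGNodes.h"
static bool allConstantsArePowerOfTwo(llvm::SDValue Op) {
  return llvm::ISD::matchUnaryPredicate(Op, [](llvm::ConstantSDNode *C) {
    return C->getAPIntValue().isPowerOf2();
  });
}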
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1600
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
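A minimal sketch (header paths assumed): for an integer compare, the inverse of SETLT is SETGE.
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
static llvm::ISD::CondCode invertSignedLessThan() {
  // !(X < Y) is (X >= Y) for integer operands.
  return llvm::ISD::getSetCCInverse(llvm::ISD::SETLT, llvm::MVT::i32); // SETGE
}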
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
Definition: ISDOpcodes.h:1421
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
Definition: ISDOpcodes.h:1587
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is an EXTLOAD.
bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1562
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1529
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1509
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
Definition: ISDOpcodes.h:1568
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1465
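A minimal sketch, assuming M is an existing Module and Ctx its LLVMContext: materialize the declaration of the overloaded intrinsic llvm.ctlz for i32.
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
static llvm::Function *getCtlzI32(llvm::Module *M, llvm::LLVMContext &Ctx) {
  // Yields "declare i32 @llvm.ctlz.i32(i32, i1)", creating it if not present.
  return llvm::Intrinsic::getDeclaration(M, llvm::Intrinsic::ctlz,
                                         {llvm::Type::getInt32Ty(Ctx)});
}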
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition: PatternMatch.h:485
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
Definition: PatternMatch.h:625
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:933
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match only the specific, given value V.
Definition: PatternMatch.h:836
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:560
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate, true > m_c_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
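The PatternMatch helpers above compose into declarative IR matchers. A hedged sketch (V is an assumed Value pointer; A and B are bound on success): match a single-use compare of the form (A & B) == 0 with the AND operands in either order.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
static bool matchAndIsZero(llvm::Value *V, llvm::Value *&A, llvm::Value *&B) {
  using namespace llvm::PatternMatch;
  llvm::ICmpInst::Predicate Pred;
  return match(V, m_OneUse(m_ICmp(Pred, m_c_And(m_Value(A), m_Value(B)),
                                  m_ZeroInt()))) &&
         Pred == llvm::ICmpInst::ICMP_EQ;
}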
Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition: LLVMContext.h:54
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
@ GeneralDynamic
Definition: CodeGen.h:46
@ X86
Windows x64, Windows Itanium (IA-64)
@ PTR32_UPTR
Definition: X86.h:209
@ FS
Definition: X86.h:206
@ PTR64
Definition: X86.h:210
@ PTR32_SPTR
Definition: X86.h:208
@ GS
Definition: X86.h:205
Reg
All possible values of the reg field in the ModR/M byte.
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:425
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
Definition: X86BaseInfo.h:405
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
Definition: X86BaseInfo.h:502
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
Definition: X86BaseInfo.h:464
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
Definition: X86BaseInfo.h:446
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
Definition: X86BaseInfo.h:470
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
Definition: X86BaseInfo.h:452
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
Definition: X86BaseInfo.h:490
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:417
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
Definition: X86BaseInfo.h:377
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
Definition: X86BaseInfo.h:486
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
Definition: X86BaseInfo.h:474
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
Definition: X86BaseInfo.h:439
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
Definition: X86BaseInfo.h:494
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:458
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:433
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
Definition: X86BaseInfo.h:401
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FCMP
X86 strict FP compare instructions.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of an MMX vector and zeros out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
@ TO_NEAREST_INT
Definition: X86BaseInfo.h:42
@ CUR_DIRECTION
Definition: X86BaseInfo.h:46
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
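A minimal sketch (header path assumed): the opposite of the equal condition is not-equal.
#include "X86InstrInfo.h"
static llvm::X86::CondCode oppositeOfEqual() {
  return llvm::X86::GetOppositeBranchCondition(llvm::X86::COND_E); // COND_NE
}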
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
@ AddrNumOperands
Definition: X86BaseInfo.h:36
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
std::optional< const char * > toString(const std::optional< DWARFFormValue > &V)
Take an optional DWARFFormValue and try to extract a string value from it.
constexpr double e
Definition: MathExtras.h:31
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
Definition: X86InstrInfo.h:109
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:456
@ Length
Definition: DWP.cpp:456
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
Definition: X86InstrInfo.h:127
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition: Utils.cpp:1525
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:343
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
AddressSpace
Definition: NVPTXBaseInfo.h:21
@ SjLj
setjmp/longjmp based exceptions
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, unsigned Reg)
Replace the address used in the instruction with the direct memory reference.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
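A hedged sketch of decoding a 4-element BLEND immediate with the signature above; the include path assumes the in-tree X86 MCTargetDesc layout.

#include "MCTargetDesc/X86ShuffleDecode.h"  // assumed in-tree path
#include "llvm/ADT/SmallVector.h"

static void blendMaskDemo() {
  llvm::SmallVector<int, 4> Mask;
  // Immediate bit i selects element i from the second source (index NumElts + i).
  llvm::DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b0101, Mask);
  // Mask is now {4, 1, 6, 3}.
}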
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle of packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
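A minimal, hedged sketch of the BuildMI plus addFrameReference pattern; emitReload and its parameters are hypothetical, and the X86 headers are the in-tree backend headers.

#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

// Hypothetical helper: reload a 32-bit stack slot FI into DestReg before MI.
static void emitReload(llvm::MachineBasicBlock &MBB,
                       llvm::MachineBasicBlock::iterator MI,
                       const llvm::DebugLoc &DL, const llvm::X86InstrInfo &TII,
                       llvm::Register DestReg, int FI) {
  llvm::addFrameReference(
      llvm::BuildMI(MBB, MI, DL, TII.get(llvm::X86::MOV32rm), DestReg), FI);
}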
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition: MathExtras.h:280
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
@ SM_SentinelUndef
@ SM_SentinelZero
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1507
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1768
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:330
bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
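An illustrative use of PowerOf2Ceil together with isPowerOf2_64 (listed above); the values are arbitrary.

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void powerOfTwoDemo() {
  assert(!llvm::isPowerOf2_64(48));
  assert(llvm::PowerOf2Ceil(48) == 64); // round a size up to the next power of two
}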
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
unsigned M1(unsigned Val)
Definition: VE.h:376
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition: bit.h:281
bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, unsigned Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
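A hedged sketch of building an unpacklo mask with createUnpackShuffleMask; the declaring header is assumed to be this backend's X86ISelLowering.h, and the chosen type is arbitrary.

#include "X86ISelLowering.h"  // assumed location of the declaration
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/ValueTypes.h"

static void unpackMaskDemo() {
  llvm::SmallVector<int, 8> Mask;
  // v4i32 unpacklo of two sources: interleave the low halves.
  llvm::createUnpackShuffleMask(llvm::MVT::v4i32, Mask, /*Lo=*/true, /*Unary=*/false);
  // Mask is now {0, 4, 1, 5}.
}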
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
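For context, dbgs() is normally used behind LLVM_DEBUG with a DEBUG_TYPE such as the "x86-isel" defined in this file; a hedged sketch with a hypothetical helper:

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "x86-isel"

static void traceCombine(unsigned Opcode) {
  // Printed only in debug builds when -debug-only=x86-isel is enabled.
  LLVM_DEBUG(llvm::dbgs() << "combining node with opcode " << Opcode << '\n');
  (void)Opcode; // silence the unused-parameter warning in release builds
}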
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
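A small sketch splitting and rejoining a 64-bit value with Hi_32/Lo_32; the constant is arbitrary.

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void splitDemo() {
  uint64_t V = 0x0123456789ABCDEFULL;
  uint32_t Hi = llvm::Hi_32(V), Lo = llvm::Lo_32(V);
  assert(Hi == 0x01234567u && Lo == 0x89ABCDEFu);
  assert(llvm::Make_64(Hi, Lo) == V); // reassemble the original value
}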
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition: DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1954
void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
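A hedged sketch of narrowShuffleMaskElts; the declaring header is assumed to be llvm/Analysis/VectorUtils.h and the mask values are illustrative.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"  // assumed home of the declaration

static void narrowMaskDemo() {
  int WideMask[] = {1, 0};  // swap two wide elements
  llvm::SmallVector<int, 4> NarrowMask;
  llvm::narrowShuffleMaskElts(/*Scale=*/2, WideMask, NarrowMask);
  // NarrowMask is now {2, 3, 0, 1}: each wide index expands to Scale narrow indices.
}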
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Add
Sum of integers.
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from an machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
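A hedged sketch of how the scalar and splat constant helpers above are typically combined in a DAG fold; foldMulByOne is hypothetical.

#include "llvm/CodeGen/SelectionDAGNodes.h"

// Hypothetical fold: x * 1 --> x, for both scalar and splat-vector constants.
static llvm::SDValue foldMulByOne(llvm::SDNode *N) {
  llvm::SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
  if (llvm::isOneConstant(RHS))
    return LHS;
  if (llvm::ConstantSDNode *C = llvm::isConstOrConstSplat(RHS))
    if (C->isOne())
      return LHS;
  return llvm::SDValue();
}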
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
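An illustrative use of Align and commonAlignment; the values are arbitrary.

#include "llvm/Support/Alignment.h"
#include <cassert>

static void alignDemo() {
  llvm::Align Base(16);
  // The alignment guaranteed at Base + 8 is only 8 bytes.
  assert(llvm::commonAlignment(Base, /*Offset=*/8) == llvm::Align(8));
}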
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2039
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ TRUNCATE_TO_REG
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ FIXUPIMM_MASKZ
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition: STLExtras.h:1607
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition: MathExtras.h:203
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction -- th...
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
#define EQ(a, b)
Definition: regexec.c:112
This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and the bit indexes (Mask) nee...
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:230
static constexpr roundingMode rmTowardZero
Definition: APFloat.h:234
static const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition: APFloat.cpp:263
static const fltSemantics & IEEEquad() LLVM_READNONE
Definition: APFloat.cpp:251
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:250
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
static const fltSemantics & BFloat() LLVM_READNONE
Definition: APFloat.cpp:248
opStatus
IEEE-754R 7: Default exception handling.
Definition: APFloat.h:246
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:628
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition: ValueTypes.h:213
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
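A brief, hedged sketch pulling together several of the EVT queries listed above; the chosen types are arbitrary.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"

static void evtDemo(llvm::LLVMContext &Ctx) {
  llvm::EVT V = llvm::EVT::getVectorVT(Ctx, llvm::MVT::i32, /*NumElements=*/4);
  (void)V.is128BitVector();        // true: 4 x 32 bits
  (void)V.getVectorNumElements();  // 4
  (void)V.getScalarSizeInBits();   // 32
  llvm::EVT Halved = V.getHalfNumVectorElementsVT(Ctx); // v2i32
  (void)Halved;
}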
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:297
static std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
Definition: KnownBits.cpp:494
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition: KnownBits.h:182
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:77
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:238
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:63
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
bool hasConflict() const
Returns true if there is conflicting information.
Definition: KnownBits.h:47
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition: KnownBits.h:285
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition: KnownBits.h:89
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:168
bool isConstant() const
Returns true if we know the value of all bits.
Definition: KnownBits.h:50
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:71
static KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
Definition: KnownBits.cpp:234
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition: KnownBits.h:221
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:292
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:307
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition: KnownBits.h:192
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:244
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:141
static KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition: KnownBits.cpp:57
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:101
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition: KnownBits.h:95
static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:777
static std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
Definition: KnownBits.cpp:532
bool isAllOnes() const
Returns true if value is all one bits.
Definition: KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition: KnownBits.h:57
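A short, hedged sketch of the KnownBits queries listed above, starting from a fully known constant; the value is arbitrary.

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

static void knownBitsDemo() {
  llvm::KnownBits K = llvm::KnownBits::makeConstant(llvm::APInt(8, 0x30));
  assert(K.isConstant());                 // every bit is known
  assert(K.countMinTrailingZeros() == 4); // 0x30 == 0b0011'0000
  llvm::KnownBits Wide = K.zext(16);      // the new high bits become known zero
  assert(Wide.getBitWidth() == 16);
}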
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
MVT ConstraintVT
The ValueType for the operand value.
std::string ConstraintCode
This contains the actual string for the code, like "m".
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
X86AddressMode - This struct holds a generalized full x86 address mode.